chore_: add `bits-and-blooms/bloom/v3` module
This commit is contained in:
parent
d69b3e5cc9
commit
1715defec8
3
go.mod
3
go.mod
|
@ -81,6 +81,7 @@ require (
|
|||
github.com/Masterminds/squirrel v1.5.4
|
||||
github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5
|
||||
github.com/andybalholm/brotli v1.0.5
|
||||
github.com/bits-and-blooms/bloom/v3 v3.7.0
|
||||
github.com/cenkalti/backoff/v4 v4.2.1
|
||||
github.com/gorilla/sessions v1.2.1
|
||||
github.com/ipfs/go-log/v2 v2.5.1
|
||||
|
@ -131,7 +132,7 @@ require (
|
|||
github.com/benbjohnson/clock v1.3.5 // indirect
|
||||
github.com/benbjohnson/immutable v0.3.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.2.0 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.13.0 // indirect
|
||||
github.com/bradfitz/iter v0.0.0-20191230175014-e8f45d346db8 // indirect
|
||||
github.com/btcsuite/btcd v0.22.1 // indirect
|
||||
github.com/btcsuite/btcd/btcec/v2 v2.3.2 // indirect
|
||||
|
|
8
go.sum
8
go.sum
|
@ -411,8 +411,12 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
|
|||
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
|
||||
github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k=
|
||||
github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA=
|
||||
github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA=
|
||||
github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
|
||||
github.com/bits-and-blooms/bitset v1.10.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
|
||||
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bits-and-blooms/bloom/v3 v3.7.0 h1:VfknkqV4xI+PsaDIsoHueyxVDZrfvMn56jeWUzvzdls=
|
||||
github.com/bits-and-blooms/bloom/v3 v3.7.0/go.mod h1:VKlUSvp0lFIYqxJjzdnSsZEw4iHb1kOL2tfHTgyJBHg=
|
||||
github.com/bkaradzic/go-lz4 v1.0.0/go.mod h1:0YdlkowM3VswSROI7qDxhRvJ3sLhlFrRRwjwegp5jy4=
|
||||
github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84=
|
||||
github.com/blang/semver v3.1.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
|
||||
|
@ -2109,6 +2113,8 @@ github.com/tsenart/tb v0.0.0-20181025101425-0d2499c8b6e9 h1:kjbwitOGH46vD01f2s3l
|
|||
github.com/tsenart/tb v0.0.0-20181025101425-0d2499c8b6e9/go.mod h1:EcGP24b8DY+bWHnpfJDP7fM+o8Nmz4fYH0l2xTtNr3I=
|
||||
github.com/ttacon/chalk v0.0.0-20160626202418-22c06c80ed31/go.mod h1:onvgF043R+lC5RZ8IT9rBXDaEDnpnw/Cl+HFiw+v/7Q=
|
||||
github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c/go.mod h1:hzIxponao9Kjc7aWznkXaL4U4TWaDSs8zcsY4Ka08nM=
|
||||
github.com/twmb/murmur3 v1.1.6 h1:mqrRot1BRxm+Yct+vavLMou2/iJt0tNVTTC0QoIjaZg=
|
||||
github.com/twmb/murmur3 v1.1.6/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ=
|
||||
github.com/tyler-smith/go-bip39 v1.0.1-0.20181017060643-dbb3b84ba2ef/go.mod h1:sJ5fKU0s6JVwZjjcUEX2zFOnvq0ASQ2K9Zr6cf67kNs=
|
||||
github.com/tyler-smith/go-bip39 v1.1.0 h1:5eUemwrMargf3BSLRRCalXT93Ns6pQJIjYQN2nyfOP8=
|
||||
github.com/tyler-smith/go-bip39 v1.1.0/go.mod h1:gUYDtqQw1JS3ZJ8UWVcGTGqqr6YIN3CWg+kkNaLt55U=
|
||||
|
|
|
@ -7,6 +7,15 @@
|
|||
[![PkgGoDev](https://pkg.go.dev/badge/github.com/bits-and-blooms/bitset?tab=doc)](https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc)
|
||||
|
||||
|
||||
This library is part of the [awesome go collection](https://github.com/avelino/awesome-go). It is used in production by several important systems:
|
||||
|
||||
* [beego](https://github.com/beego/beego)
|
||||
* [CubeFS](https://github.com/cubefs/cubefs)
|
||||
* [Amazon EKS Distro](https://github.com/aws/eks-distro)
|
||||
* [sourcegraph](https://github.com/sourcegraph/sourcegraph)
|
||||
* [torrent](https://github.com/anacrolix/torrent)
|
||||
|
||||
|
||||
## Description
|
||||
|
||||
Package bitset implements bitsets, a mapping between non-negative integers and boolean values.
|
||||
|
@ -60,19 +69,76 @@ func main() {
|
|||
}
|
||||
```
|
||||
|
||||
As an alternative to BitSets, one should check out the 'big' package, which provides a (less set-theoretical) view of bitsets.
|
||||
|
||||
Package documentation is at: https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc
|
||||
|
||||
## Serialization
|
||||
|
||||
|
||||
You may serialize a bitset safely and portably to a stream
|
||||
of bytes as follows:
|
||||
```Go
|
||||
const length = 9585
|
||||
const oneEvery = 97
|
||||
bs := bitset.New(length)
|
||||
// Add some bits
|
||||
for i := uint(0); i < length; i += oneEvery {
|
||||
bs = bs.Set(i)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
n, err := bs.WriteTo(&buf)
|
||||
if err != nil {
|
||||
// failure
|
||||
}
|
||||
// Here n == buf.Len()
|
||||
```
|
||||
You can later deserialize the result as follows:
|
||||
|
||||
```Go
|
||||
// Read back from buf
|
||||
bs = bitset.New()
|
||||
n, err = bs.ReadFrom(&buf)
|
||||
if err != nil {
|
||||
// error
|
||||
}
|
||||
// n is the number of bytes read
|
||||
```
|
||||
|
||||
The `ReadFrom` function attempts to read the data into the existing
|
||||
BitSet instance, to minimize memory allocations.
|
||||
|
||||
|
||||
*Performance tip*:
|
||||
When reading and writing to a file or a network connection, you may get better performance by
|
||||
wrapping your streams with `bufio` instances.
|
||||
|
||||
E.g.,
|
||||
```Go
|
||||
f, err := os.Create("myfile")
|
||||
w := bufio.NewWriter(f)
|
||||
```
|
||||
```Go
|
||||
f, err := os.Open("myfile")
|
||||
r := bufio.NewReader(f)
|
||||
```
|
||||
|
||||
## Memory Usage
|
||||
|
||||
The memory usage of a bitset using N bits is at least N/8 bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring).
|
||||
The memory usage of a bitset using `N` bits is at least `N/8` bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring).
|
||||
|
||||
The `roaring` library allows you to go back and forth between compressed Roaring bitmaps and the conventional bitset instances:
|
||||
```Go
|
||||
mybitset := roaringbitmap.ToBitSet()
|
||||
newroaringbitmap := roaring.FromBitSet(mybitset)
|
||||
```
|
||||
|
||||
|
||||
## Implementation Note
|
||||
|
||||
Go 1.9 introduced a native `math/bits` library. We provide backward compatibility to Go 1.7, though this compatibility might be removed in a future version.
|
||||
|
||||
It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `unit64`). If so, the version will be bumped.
|
||||
It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `uint64`). If so, the version will be bumped.
|
||||
|
||||
## Installation
|
||||
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
You can privately report a vulnerability by emailing daniel@lemire.me (current maintainer).
|
|
@ -33,12 +33,10 @@ Example use:
|
|||
|
||||
As an alternative to BitSets, one should check out the 'big' package,
|
||||
which provides a (less set-theoretical) view of bitsets.
|
||||
|
||||
*/
|
||||
package bitset
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
|
@ -52,6 +50,9 @@ import (
|
|||
// the wordSize of a bit set
|
||||
const wordSize = uint(64)
|
||||
|
||||
// wordBytes is the size of one word of a bit set, in bytes
|
||||
const wordBytes = wordSize / 8
|
||||
|
||||
// log2WordSize is lg(wordSize)
|
||||
const log2WordSize = uint(6)
|
||||
|
||||
|
@ -87,12 +88,23 @@ func (b *BitSet) safeSet() []uint64 {
|
|||
return b.set
|
||||
}
|
||||
|
||||
// From is a constructor used to create a BitSet from an array of integers
|
||||
func From(buf []uint64) *BitSet {
|
||||
return &BitSet{uint(len(buf)) * 64, buf}
|
||||
// SetBitsetFrom fills the bitset with an array of integers without creating a new BitSet instance
|
||||
func (b *BitSet) SetBitsetFrom(buf []uint64) {
|
||||
b.length = uint(len(buf)) * 64
|
||||
b.set = buf
|
||||
}
|
||||
|
||||
// Bytes returns the bitset as array of integers
|
||||
// From is a constructor used to create a BitSet from an array of words
|
||||
func From(buf []uint64) *BitSet {
|
||||
return FromWithLength(uint(len(buf))*64, buf)
|
||||
}
|
||||
|
||||
// FromWithLength constructs from an array of words and length.
|
||||
func FromWithLength(len uint, set []uint64) *BitSet {
|
||||
return &BitSet{len, set}
|
||||
}
|
||||
|
||||
// Bytes returns the bitset as array of words
|
||||
func (b *BitSet) Bytes() []uint64 {
|
||||
return b.set
|
||||
}
|
||||
|
@ -105,6 +117,17 @@ func wordsNeeded(i uint) int {
|
|||
return int((i + (wordSize - 1)) >> log2WordSize)
|
||||
}
|
||||
|
||||
// wordsNeededUnbound calculates the number of words needed for i bits, possibly exceeding the capacity.
|
||||
// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing bitmap).
|
||||
func wordsNeededUnbound(i uint) int {
|
||||
return int((i + (wordSize - 1)) >> log2WordSize)
|
||||
}
|
||||
|
||||
// wordsIndex calculates the position of bit i within its `uint64` word (i modulo wordSize)
|
||||
func wordsIndex(i uint) uint {
|
||||
return i & (wordSize - 1)
|
||||
}
|
||||
|
||||
// New creates a new BitSet with a hint that length bits will be required
|
||||
func New(length uint) (bset *BitSet) {
|
||||
defer func() {
|
||||
|
@ -135,24 +158,22 @@ func (b *BitSet) Len() uint {
|
|||
return b.length
|
||||
}
|
||||
|
||||
// extendSetMaybe adds additional words to incorporate new bits if needed
|
||||
func (b *BitSet) extendSetMaybe(i uint) {
|
||||
if i >= b.length { // if we need more bits, make 'em
|
||||
if i >= Cap() {
|
||||
panic("You are exceeding the capacity")
|
||||
}
|
||||
nsize := wordsNeeded(i + 1)
|
||||
if b.set == nil {
|
||||
b.set = make([]uint64, nsize)
|
||||
} else if cap(b.set) >= nsize {
|
||||
b.set = b.set[:nsize] // fast resize
|
||||
} else if len(b.set) < nsize {
|
||||
newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x
|
||||
copy(newset, b.set)
|
||||
b.set = newset
|
||||
}
|
||||
b.length = i + 1
|
||||
// extendSet adds additional words to incorporate new bits if needed
|
||||
func (b *BitSet) extendSet(i uint) {
|
||||
if i >= Cap() {
|
||||
panic("You are exceeding the capacity")
|
||||
}
|
||||
nsize := wordsNeeded(i + 1)
|
||||
if b.set == nil {
|
||||
b.set = make([]uint64, nsize)
|
||||
} else if cap(b.set) >= nsize {
|
||||
b.set = b.set[:nsize] // fast resize
|
||||
} else if len(b.set) < nsize {
|
||||
newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x
|
||||
copy(newset, b.set)
|
||||
b.set = newset
|
||||
}
|
||||
b.length = i + 1
|
||||
}
|
||||
|
||||
// Test whether bit i is set.
|
||||
|
@ -160,7 +181,7 @@ func (b *BitSet) Test(i uint) bool {
|
|||
if i >= b.length {
|
||||
return false
|
||||
}
|
||||
return b.set[i>>log2WordSize]&(1<<(i&(wordSize-1))) != 0
|
||||
return b.set[i>>log2WordSize]&(1<<wordsIndex(i)) != 0
|
||||
}
|
||||
|
||||
// Set bit i to 1, the capacity of the bitset is automatically
|
||||
|
@ -170,8 +191,10 @@ func (b *BitSet) Test(i uint) bool {
|
|||
// may lead to a memory shortage and a panic: the caller is responsible
|
||||
// for providing sensible parameters in line with their memory capacity.
|
||||
func (b *BitSet) Set(i uint) *BitSet {
|
||||
b.extendSetMaybe(i)
|
||||
b.set[i>>log2WordSize] |= 1 << (i & (wordSize - 1))
|
||||
if i >= b.length { // if we need more bits, make 'em
|
||||
b.extendSet(i)
|
||||
}
|
||||
b.set[i>>log2WordSize] |= 1 << wordsIndex(i)
|
||||
return b
|
||||
}
|
||||
|
||||
|
@ -180,7 +203,7 @@ func (b *BitSet) Clear(i uint) *BitSet {
|
|||
if i >= b.length {
|
||||
return b
|
||||
}
|
||||
b.set[i>>log2WordSize] &^= 1 << (i & (wordSize - 1))
|
||||
b.set[i>>log2WordSize] &^= 1 << wordsIndex(i)
|
||||
return b
|
||||
}
|
||||
|
||||
|
@ -205,7 +228,7 @@ func (b *BitSet) Flip(i uint) *BitSet {
|
|||
if i >= b.length {
|
||||
return b.Set(i)
|
||||
}
|
||||
b.set[i>>log2WordSize] ^= 1 << (i & (wordSize - 1))
|
||||
b.set[i>>log2WordSize] ^= 1 << wordsIndex(i)
|
||||
return b
|
||||
}
|
||||
|
||||
|
@ -218,15 +241,23 @@ func (b *BitSet) FlipRange(start, end uint) *BitSet {
|
|||
if start >= end {
|
||||
return b
|
||||
}
|
||||
|
||||
b.extendSetMaybe(end - 1)
|
||||
if end-1 >= b.length { // if we need more bits, make 'em
|
||||
b.extendSet(end - 1)
|
||||
}
|
||||
var startWord uint = start >> log2WordSize
|
||||
var endWord uint = end >> log2WordSize
|
||||
b.set[startWord] ^= ^(^uint64(0) << (start & (wordSize - 1)))
|
||||
for i := startWord; i < endWord; i++ {
|
||||
b.set[i] = ^b.set[i]
|
||||
b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start))
|
||||
if endWord > 0 {
|
||||
// bounds check elimination
|
||||
data := b.set
|
||||
_ = data[endWord-1]
|
||||
for i := startWord; i < endWord; i++ {
|
||||
data[i] = ^data[i]
|
||||
}
|
||||
}
|
||||
if end&(wordSize-1) != 0 {
|
||||
b.set[endWord] ^= ^uint64(0) >> wordsIndex(-end)
|
||||
}
|
||||
b.set[endWord] ^= ^uint64(0) >> (-end & (wordSize - 1))
|
||||
return b
|
||||
}
|
||||
|
||||
|
@ -254,7 +285,10 @@ func (b *BitSet) Shrink(lastbitindex uint) *BitSet {
|
|||
copy(shrunk, b.set[:idx])
|
||||
b.set = shrunk
|
||||
b.length = length
|
||||
b.set[idx-1] &= (allBits >> (uint64(64) - uint64(length&(wordSize-1))))
|
||||
lastWordUsedBits := length % 64
|
||||
if lastWordUsedBits != 0 {
|
||||
b.set[idx-1] &= allBits >> uint64(64-wordsIndex(lastWordUsedBits))
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
|
@ -283,7 +317,7 @@ func (b *BitSet) Compact() *BitSet {
|
|||
// this method could be extremely slow and in some cases might cause the entire BitSet
|
||||
// to be recopied.
|
||||
func (b *BitSet) InsertAt(idx uint) *BitSet {
|
||||
insertAtElement := (idx >> log2WordSize)
|
||||
insertAtElement := idx >> log2WordSize
|
||||
|
||||
// if length of set is a multiple of wordSize we need to allocate more space first
|
||||
if b.isLenExactMultiple() {
|
||||
|
@ -302,13 +336,13 @@ func (b *BitSet) InsertAt(idx uint) *BitSet {
|
|||
|
||||
// generate a mask to extract the data that we need to shift left
|
||||
// within the element where we insert a bit
|
||||
dataMask := ^(uint64(1)<<uint64(idx&(wordSize-1)) - 1)
|
||||
dataMask := uint64(1)<<uint64(wordsIndex(idx)) - 1
|
||||
|
||||
// extract that data that we'll shift
|
||||
data := b.set[i] & dataMask
|
||||
data := b.set[i] & (^dataMask)
|
||||
|
||||
// set the positions of the data mask to 0 in the element where we insert
|
||||
b.set[i] &= ^dataMask
|
||||
b.set[i] &= dataMask
|
||||
|
||||
// shift data mask to the left and insert its data to the slice element
|
||||
b.set[i] |= data << 1
|
||||
|
@ -356,7 +390,7 @@ func (b *BitSet) DeleteAt(i uint) *BitSet {
|
|||
|
||||
// generate a mask for the data that needs to be shifted right
|
||||
// within that slice element that gets modified
|
||||
dataMask := ^((uint64(1) << (i & (wordSize - 1))) - 1)
|
||||
dataMask := ^((uint64(1) << wordsIndex(i)) - 1)
|
||||
|
||||
// extract the data that we'll shift right from the slice element
|
||||
data := b.set[deleteAtElement] & dataMask
|
||||
|
@ -394,16 +428,20 @@ func (b *BitSet) NextSet(i uint) (uint, bool) {
|
|||
return 0, false
|
||||
}
|
||||
w := b.set[x]
|
||||
w = w >> (i & (wordSize - 1))
|
||||
w = w >> wordsIndex(i)
|
||||
if w != 0 {
|
||||
return i + trailingZeroes64(w), true
|
||||
}
|
||||
x = x + 1
|
||||
x++
|
||||
// bounds check elimination in the loop
|
||||
if x < 0 {
|
||||
return 0, false
|
||||
}
|
||||
for x < len(b.set) {
|
||||
if b.set[x] != 0 {
|
||||
return uint(x)*wordSize + trailingZeroes64(b.set[x]), true
|
||||
}
|
||||
x = x + 1
|
||||
x++
|
||||
|
||||
}
|
||||
return 0, false
|
||||
|
@ -413,21 +451,20 @@ func (b *BitSet) NextSet(i uint) (uint, bool) {
|
|||
// including possibly the current index and up to cap(buffer).
|
||||
// If the returned slice has len zero, then no more set bits were found
|
||||
//
|
||||
// buffer := make([]uint, 256) // this should be reused
|
||||
// j := uint(0)
|
||||
// j, buffer = bitmap.NextSetMany(j, buffer)
|
||||
// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) {
|
||||
// for k := range buffer {
|
||||
// do something with buffer[k]
|
||||
// }
|
||||
// j += 1
|
||||
// }
|
||||
//
|
||||
// buffer := make([]uint, 256) // this should be reused
|
||||
// j := uint(0)
|
||||
// j, buffer = bitmap.NextSetMany(j, buffer)
|
||||
// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) {
|
||||
// for k := range buffer {
|
||||
// do something with buffer[k]
|
||||
// }
|
||||
// j += 1
|
||||
// }
|
||||
//
|
||||
// It is possible to retrieve all set bits as follows:
|
||||
//
|
||||
// indices := make([]uint, bitmap.Count())
|
||||
// bitmap.NextSetMany(0, indices)
|
||||
// indices := make([]uint, bitmap.Count())
|
||||
// bitmap.NextSetMany(0, indices)
|
||||
//
|
||||
// However if bitmap.Count() is large, it might be preferable to
|
||||
// use several calls to NextSetMany, for performance reasons.
|
||||
|
@ -438,7 +475,7 @@ func (b *BitSet) NextSetMany(i uint, buffer []uint) (uint, []uint) {
|
|||
if x >= len(b.set) || capacity == 0 {
|
||||
return 0, myanswer[:0]
|
||||
}
|
||||
skip := i & (wordSize - 1)
|
||||
skip := wordsIndex(i)
|
||||
word := b.set[x] >> skip
|
||||
myanswer = myanswer[:capacity]
|
||||
size := int(0)
|
||||
|
@ -481,17 +518,23 @@ func (b *BitSet) NextClear(i uint) (uint, bool) {
|
|||
return 0, false
|
||||
}
|
||||
w := b.set[x]
|
||||
w = w >> (i & (wordSize - 1))
|
||||
wA := allBits >> (i & (wordSize - 1))
|
||||
w = w >> wordsIndex(i)
|
||||
wA := allBits >> wordsIndex(i)
|
||||
index := i + trailingZeroes64(^w)
|
||||
if w != wA && index < b.length {
|
||||
return index, true
|
||||
}
|
||||
x++
|
||||
// bounds check elimination in the loop
|
||||
if x < 0 {
|
||||
return 0, false
|
||||
}
|
||||
for x < len(b.set) {
|
||||
index = uint(x)*wordSize + trailingZeroes64(^b.set[x])
|
||||
if b.set[x] != allBits && index < b.length {
|
||||
return index, true
|
||||
if b.set[x] != allBits {
|
||||
index = uint(x)*wordSize + trailingZeroes64(^b.set[x])
|
||||
if index < b.length {
|
||||
return index, true
|
||||
}
|
||||
}
|
||||
x++
|
||||
}
|
||||
|
@ -508,9 +551,21 @@ func (b *BitSet) ClearAll() *BitSet {
|
|||
return b
|
||||
}
|
||||
|
||||
// SetAll sets the entire BitSet
|
||||
func (b *BitSet) SetAll() *BitSet {
|
||||
if b != nil && b.set != nil {
|
||||
for i := range b.set {
|
||||
b.set[i] = allBits
|
||||
}
|
||||
|
||||
b.cleanLastWord()
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// wordCount returns the number of words used in a bit set
|
||||
func (b *BitSet) wordCount() int {
|
||||
return len(b.set)
|
||||
return wordsNeededUnbound(b.length)
|
||||
}
|
||||
|
||||
// Clone this BitSet
|
||||
|
@ -522,9 +577,10 @@ func (b *BitSet) Clone() *BitSet {
|
|||
return c
|
||||
}
|
||||
|
||||
// Copy into a destination BitSet
|
||||
// Returning the size of the destination BitSet
|
||||
// like array copy
|
||||
// Copy into a destination BitSet using the Go array copy semantics:
|
||||
// the number of bits copied is the minimum of the number of bits in the current
|
||||
// BitSet (Len()) and the destination BitSet.
|
||||
// We return the number of bits copied in the destination BitSet.
|
||||
func (b *BitSet) Copy(c *BitSet) (count uint) {
|
||||
if c == nil {
|
||||
return
|
||||
|
@ -536,9 +592,33 @@ func (b *BitSet) Copy(c *BitSet) (count uint) {
|
|||
if b.length < c.length {
|
||||
count = b.length
|
||||
}
|
||||
// Cleaning the last word is needed to keep the invariant that other functions, such as Count, require
|
||||
// that any bits in the last word that would exceed the length of the bitmask are set to 0.
|
||||
c.cleanLastWord()
|
||||
return
|
||||
}
|
||||
|
||||
// CopyFull copies into a destination BitSet such that the destination is
|
||||
// identical to the source after the operation, allocating memory if necessary.
|
||||
func (b *BitSet) CopyFull(c *BitSet) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.length = b.length
|
||||
if len(b.set) == 0 {
|
||||
if c.set != nil {
|
||||
c.set = c.set[:0]
|
||||
}
|
||||
} else {
|
||||
if cap(c.set) < len(b.set) {
|
||||
c.set = make([]uint64, len(b.set))
|
||||
} else {
|
||||
c.set = c.set[:len(b.set)]
|
||||
}
|
||||
copy(c.set, b.set)
|
||||
}
|
||||
}
|
||||
|
||||
// Count (number of set bits).
|
||||
// Also known as "popcount" or "population count".
|
||||
func (b *BitSet) Count() uint {
|
||||
|
@ -561,10 +641,15 @@ func (b *BitSet) Equal(c *BitSet) bool {
|
|||
if b.length == 0 { // if they have both length == 0, then could have nil set
|
||||
return true
|
||||
}
|
||||
// testing for equality should not transform the bitset (no call to safeSet)
|
||||
|
||||
for p, v := range b.set {
|
||||
if c.set[p] != v {
|
||||
wn := b.wordCount()
|
||||
// bounds check elimination
|
||||
if wn <= 0 {
|
||||
return true
|
||||
}
|
||||
_ = b.set[wn-1]
|
||||
_ = c.set[wn-1]
|
||||
for p := 0; p < wn; p++ {
|
||||
if c.set[p] != b.set[p] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
@ -583,9 +668,9 @@ func (b *BitSet) Difference(compare *BitSet) (result *BitSet) {
|
|||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
result = b.Clone() // clone b (in case b is bigger than compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
result.set[i] = b.set[i] &^ compare.set[i]
|
||||
|
@ -597,9 +682,9 @@ func (b *BitSet) Difference(compare *BitSet) (result *BitSet) {
|
|||
func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
cnt := uint64(0)
|
||||
cnt += popcntMaskSlice(b.set[:l], compare.set[:l])
|
||||
|
@ -612,12 +697,19 @@ func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
|
|||
func (b *BitSet) InPlaceDifference(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
if l <= 0 {
|
||||
return
|
||||
}
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] &^= compare.set[i]
|
||||
data[i] &^= cmpData[i]
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -660,18 +752,29 @@ func (b *BitSet) IntersectionCardinality(compare *BitSet) uint {
|
|||
func (b *BitSet) InPlaceIntersection(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] &= compare.set[i]
|
||||
if l > 0 {
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
|
||||
for i := 0; i < l; i++ {
|
||||
data[i] &= cmpData[i]
|
||||
}
|
||||
}
|
||||
for i := l; i < len(b.set); i++ {
|
||||
b.set[i] = 0
|
||||
if l >= 0 {
|
||||
for i := l; i < len(b.set); i++ {
|
||||
b.set[i] = 0
|
||||
}
|
||||
}
|
||||
if compare.length > 0 {
|
||||
b.extendSetMaybe(compare.length - 1)
|
||||
if compare.length-1 >= b.length {
|
||||
b.extendSet(compare.length - 1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -706,15 +809,22 @@ func (b *BitSet) UnionCardinality(compare *BitSet) uint {
|
|||
func (b *BitSet) InPlaceUnion(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
if compare.length > 0 {
|
||||
b.extendSetMaybe(compare.length - 1)
|
||||
if compare.length > 0 && compare.length-1 >= b.length {
|
||||
b.extendSet(compare.length - 1)
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] |= compare.set[i]
|
||||
if l > 0 {
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
|
||||
for i := 0; i < l; i++ {
|
||||
data[i] |= cmpData[i]
|
||||
}
|
||||
}
|
||||
if len(compare.set) > l {
|
||||
for i := l; i < len(compare.set); i++ {
|
||||
|
@ -754,15 +864,21 @@ func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint {
|
|||
func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
if compare.length > 0 {
|
||||
b.extendSetMaybe(compare.length - 1)
|
||||
if compare.length > 0 && compare.length-1 >= b.length {
|
||||
b.extendSet(compare.length - 1)
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] ^= compare.set[i]
|
||||
if l > 0 {
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
for i := 0; i < l; i++ {
|
||||
data[i] ^= cmpData[i]
|
||||
}
|
||||
}
|
||||
if len(compare.set) > l {
|
||||
for i := l; i < len(compare.set); i++ {
|
||||
|
@ -773,17 +889,17 @@ func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) {
|
|||
|
||||
// Is the length an exact multiple of word sizes?
|
||||
func (b *BitSet) isLenExactMultiple() bool {
|
||||
return b.length%wordSize == 0
|
||||
return wordsIndex(b.length) == 0
|
||||
}
|
||||
|
||||
// Clean last word by setting unused bits to 0
|
||||
func (b *BitSet) cleanLastWord() {
|
||||
if !b.isLenExactMultiple() {
|
||||
b.set[len(b.set)-1] &= allBits >> (wordSize - b.length%wordSize)
|
||||
b.set[len(b.set)-1] &= allBits >> (wordSize - wordsIndex(b.length))
|
||||
}
|
||||
}
|
||||
|
||||
// Complement computes the (local) complement of a biset (up to length bits)
|
||||
// Complement computes the (local) complement of a bitset (up to length bits)
|
||||
func (b *BitSet) Complement() (result *BitSet) {
|
||||
panicIfNull(b)
|
||||
result = New(b.length)
|
||||
|
@ -811,7 +927,6 @@ func (b *BitSet) None() bool {
|
|||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
@ -824,12 +939,16 @@ func (b *BitSet) Any() bool {
|
|||
|
||||
// IsSuperSet returns true if this is a superset of the other set
|
||||
func (b *BitSet) IsSuperSet(other *BitSet) bool {
|
||||
for i, e := other.NextSet(0); e; i, e = other.NextSet(i + 1) {
|
||||
if !b.Test(i) {
|
||||
l := other.wordCount()
|
||||
if b.wordCount() < l {
|
||||
l = b.wordCount()
|
||||
}
|
||||
for i, word := range other.set[:l] {
|
||||
if b.set[i]&word != word {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
return popcntSlice(other.set[l:]) == 0
|
||||
}
|
||||
|
||||
// IsStrictSuperSet returns true if this is a strict superset of the other set
|
||||
|
@ -837,7 +956,8 @@ func (b *BitSet) IsStrictSuperSet(other *BitSet) bool {
|
|||
return b.Count() > other.Count() && b.IsSuperSet(other)
|
||||
}
|
||||
|
||||
// DumpAsBits dumps a bit set as a string of bits
|
||||
// DumpAsBits dumps a bit set as a string of bits. Following the usual convention in Go,
|
||||
// the least significant bits are printed last (index 0 is at the end of the string).
|
||||
func (b *BitSet) DumpAsBits() string {
|
||||
if b.set == nil {
|
||||
return "."
|
||||
|
@ -850,78 +970,156 @@ func (b *BitSet) DumpAsBits() string {
|
|||
return buffer.String()
|
||||
}
|
||||
|
||||
// BinaryStorageSize returns the binary storage requirements
|
||||
// BinaryStorageSize returns the binary storage requirements (see WriteTo) in bytes.
|
||||
func (b *BitSet) BinaryStorageSize() int {
|
||||
return binary.Size(uint64(0)) + binary.Size(b.set)
|
||||
return int(wordBytes + wordBytes*uint(b.wordCount()))
|
||||
}
|
||||
|
||||
// WriteTo writes a BitSet to a stream
|
||||
func readUint64Array(reader io.Reader, data []uint64) error {
|
||||
length := len(data)
|
||||
bufferSize := 128
|
||||
buffer := make([]byte, bufferSize*int(wordBytes))
|
||||
for i := 0; i < length; i += bufferSize {
|
||||
end := i + bufferSize
|
||||
if end > length {
|
||||
end = length
|
||||
buffer = buffer[:wordBytes*uint(end-i)]
|
||||
}
|
||||
chunk := data[i:end]
|
||||
if _, err := io.ReadFull(reader, buffer); err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range chunk {
|
||||
chunk[i] = uint64(binaryOrder.Uint64(buffer[8*i:]))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeUint64Array(writer io.Writer, data []uint64) error {
|
||||
bufferSize := 128
|
||||
buffer := make([]byte, bufferSize*int(wordBytes))
|
||||
for i := 0; i < len(data); i += bufferSize {
|
||||
end := i + bufferSize
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
buffer = buffer[:wordBytes*uint(end-i)]
|
||||
}
|
||||
chunk := data[i:end]
|
||||
for i, x := range chunk {
|
||||
binaryOrder.PutUint64(buffer[8*i:], x)
|
||||
}
|
||||
_, err := writer.Write(buffer)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriteTo writes a BitSet to a stream. The format is:
|
||||
// 1. uint64 length
|
||||
// 2. []uint64 set
|
||||
// Upon success, the number of bytes written is returned.
|
||||
//
|
||||
// Performance: if this function is used to write to a disk or network
|
||||
// connection, it might be beneficial to wrap the stream in a bufio.Writer.
|
||||
// E.g.,
|
||||
//
|
||||
// f, err := os.Create("myfile")
|
||||
// w := bufio.NewWriter(f)
|
||||
func (b *BitSet) WriteTo(stream io.Writer) (int64, error) {
|
||||
length := uint64(b.length)
|
||||
|
||||
// Write length
|
||||
err := binary.Write(stream, binaryOrder, length)
|
||||
err := binary.Write(stream, binaryOrder, &length)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
// Upon failure, we do not guarantee that we
|
||||
// return the number of bytes written.
|
||||
return int64(0), err
|
||||
}
|
||||
|
||||
// Write set
|
||||
err = binary.Write(stream, binaryOrder, b.set)
|
||||
return int64(b.BinaryStorageSize()), err
|
||||
err = writeUint64Array(stream, b.set[:b.wordCount()])
|
||||
if err != nil {
|
||||
// Upon failure, we do not guarantee that we
|
||||
// return the number of bytes written.
|
||||
return int64(wordBytes), err
|
||||
}
|
||||
return int64(b.BinaryStorageSize()), nil
|
||||
}
|
||||
|
||||
// ReadFrom reads a BitSet from a stream written using WriteTo
|
||||
// The format is:
|
||||
// 1. uint64 length
|
||||
// 2. []uint64 set
|
||||
// Upon success, the number of bytes read is returned.
|
||||
// If the current BitSet is not large enough to hold the data,
|
||||
// it is extended. In case of error, the BitSet is either
|
||||
// left unchanged or made empty if the error occurs too late
|
||||
// to preserve the content.
|
||||
//
|
||||
// Performance: if this function is used to read from a disk or network
|
||||
// connection, it might be beneficial to wrap the stream in a bufio.Reader.
|
||||
// E.g.,
|
||||
//
|
||||
// f, err := os.Open("myfile")
|
||||
// r := bufio.NewReader(f)
|
||||
func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) {
|
||||
var length uint64
|
||||
|
||||
// Read length first
|
||||
err := binary.Read(stream, binaryOrder, &length)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
err = io.ErrUnexpectedEOF
|
||||
}
|
||||
return 0, err
|
||||
}
|
||||
newset := New(uint(length))
|
||||
newlength := uint(length)
|
||||
|
||||
if uint64(newset.length) != length {
|
||||
if uint64(newlength) != length {
|
||||
return 0, errors.New("unmarshalling error: type mismatch")
|
||||
}
|
||||
nWords := wordsNeeded(uint(newlength))
|
||||
if cap(b.set) >= nWords {
|
||||
b.set = b.set[:nWords]
|
||||
} else {
|
||||
b.set = make([]uint64, nWords)
|
||||
}
|
||||
|
||||
// Read remaining bytes as set
|
||||
err = binary.Read(stream, binaryOrder, newset.set)
|
||||
b.length = newlength
|
||||
|
||||
err = readUint64Array(stream, b.set)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
err = io.ErrUnexpectedEOF
|
||||
}
|
||||
// We do not want to leave the BitSet partially filled as
|
||||
// it is error prone.
|
||||
b.set = b.set[:0]
|
||||
b.length = 0
|
||||
return 0, err
|
||||
}
|
||||
|
||||
*b = *newset
|
||||
return int64(b.BinaryStorageSize()), nil
|
||||
}
|
||||
|
||||
// MarshalBinary encodes a BitSet into a binary form and returns the result.
|
||||
func (b *BitSet) MarshalBinary() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
writer := bufio.NewWriter(&buf)
|
||||
|
||||
_, err := b.WriteTo(writer)
|
||||
_, err := b.WriteTo(&buf)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
|
||||
err = writer.Flush()
|
||||
|
||||
return buf.Bytes(), err
|
||||
}
|
||||
|
||||
// UnmarshalBinary decodes the binary form generated by MarshalBinary.
|
||||
func (b *BitSet) UnmarshalBinary(data []byte) error {
|
||||
buf := bytes.NewReader(data)
|
||||
reader := bufio.NewReader(buf)
|
||||
|
||||
_, err := b.ReadFrom(reader)
|
||||
|
||||
_, err := b.ReadFrom(buf)
|
||||
return err
|
||||
}
|
||||
|
||||
// MarshalJSON marshals a BitSet as a JSON structure
|
||||
func (b *BitSet) MarshalJSON() ([]byte, error) {
|
||||
func (b BitSet) MarshalJSON() ([]byte, error) {
|
||||
buffer := bytes.NewBuffer(make([]byte, 0, b.BinaryStorageSize()))
|
||||
_, err := b.WriteTo(buffer)
|
||||
if err != nil {
|
||||
|
@ -950,3 +1148,37 @@ func (b *BitSet) UnmarshalJSON(data []byte) error {
|
|||
_, err = b.ReadFrom(bytes.NewReader(buf))
|
||||
return err
|
||||
}
|
||||
|
||||
// Rank returns the nunber of set bits up to and including the index
|
||||
// that are set in the bitset.
|
||||
// See https://en.wikipedia.org/wiki/Ranking#Ranking_in_statistics
|
||||
func (b *BitSet) Rank(index uint) uint {
|
||||
if index >= b.length {
|
||||
return b.Count()
|
||||
}
|
||||
leftover := (index + 1) & 63
|
||||
answer := uint(popcntSlice(b.set[:(index+1)>>6]))
|
||||
if leftover != 0 {
|
||||
answer += uint(popcount(b.set[(index+1)>>6] << (64 - leftover)))
|
||||
}
|
||||
return answer
|
||||
}
|
||||
|
||||
// Select returns the index of the jth set bit, where j is the argument.
|
||||
// The caller is responsible to ensure that 0 <= j < Count(): when j is
|
||||
// out of range, the function returns the length of the bitset (b.length).
|
||||
//
|
||||
// Note that this function differs in convention from the Rank function which
|
||||
// returns 1 when ranking the smallest value. We follow the conventional
|
||||
// textbook definition of Select and Rank.
|
||||
func (b *BitSet) Select(index uint) uint {
|
||||
leftover := index
|
||||
for idx, word := range b.set {
|
||||
w := uint(popcount(word))
|
||||
if w > leftover {
|
||||
return uint(idx)*64 + select64(word, leftover)
|
||||
}
|
||||
leftover -= w
|
||||
}
|
||||
return b.length
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build go1.9
|
||||
// +build go1.9
|
||||
|
||||
package bitset
|
||||
|
@ -14,6 +15,10 @@ func popcntSlice(s []uint64) uint64 {
|
|||
|
||||
func popcntMaskSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] &^ m[i])
|
||||
}
|
||||
|
@ -22,6 +27,10 @@ func popcntMaskSlice(s, m []uint64) uint64 {
|
|||
|
||||
func popcntAndSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] & m[i])
|
||||
}
|
||||
|
@ -30,6 +39,10 @@ func popcntAndSlice(s, m []uint64) uint64 {
|
|||
|
||||
func popcntOrSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] | m[i])
|
||||
}
|
||||
|
@ -38,6 +51,10 @@ func popcntOrSlice(s, m []uint64) uint64 {
|
|||
|
||||
func popcntXorSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] ^ m[i])
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// +build !go1.9
|
||||
// +build amd64,!appengine
|
||||
//go:build !go1.9 && amd64 && !appengine
|
||||
// +build !go1.9,amd64,!appengine
|
||||
|
||||
package bitset
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !go1.9 && (!amd64 || appengine)
|
||||
// +build !go1.9
|
||||
// +build !amd64 appengine
|
||||
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
package bitset
|
||||
|
||||
func select64(w uint64, j uint) uint {
|
||||
seen := 0
|
||||
// Divide 64bit
|
||||
part := w & 0xFFFFFFFF
|
||||
n := uint(popcount(part))
|
||||
if n <= j {
|
||||
part = w >> 32
|
||||
seen += 32
|
||||
j -= n
|
||||
}
|
||||
ww := part
|
||||
|
||||
// Divide 32bit
|
||||
part = ww & 0xFFFF
|
||||
|
||||
n = uint(popcount(part))
|
||||
if n <= j {
|
||||
part = ww >> 16
|
||||
seen += 16
|
||||
j -= n
|
||||
}
|
||||
ww = part
|
||||
|
||||
// Divide 16bit
|
||||
part = ww & 0xFF
|
||||
n = uint(popcount(part))
|
||||
if n <= j {
|
||||
part = ww >> 8
|
||||
seen += 8
|
||||
j -= n
|
||||
}
|
||||
ww = part
|
||||
|
||||
// Lookup in final byte
|
||||
counter := 0
|
||||
for ; counter < 8; counter++ {
|
||||
j -= uint((ww >> counter) & 1)
|
||||
if j+1 == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return uint(seen + counter)
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !go1.9
|
||||
// +build !go1.9
|
||||
|
||||
package bitset
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build go1.9
|
||||
// +build go1.9
|
||||
|
||||
package bitset
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
# Compiled Object files, Static and Dynamic libs (Shared Objects)
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
|
||||
# Folders
|
||||
_obj
|
||||
_test
|
||||
|
||||
# Architecture specific extensions/prefixes
|
||||
*.[568vq]
|
||||
[568vq].out
|
||||
|
||||
*.cgo1.go
|
||||
*.cgo2.c
|
||||
_cgo_defun.c
|
||||
_cgo_gotypes.go
|
||||
_cgo_export.*
|
||||
|
||||
_testmain.go
|
||||
|
||||
*.exe
|
||||
*.test
|
||||
*.prof
|
||||
|
||||
target
|
||||
.idea
|
|
@ -0,0 +1,38 @@
|
|||
language: go
|
||||
|
||||
sudo: false
|
||||
|
||||
branches:
|
||||
except:
|
||||
- release
|
||||
|
||||
branches:
|
||||
only:
|
||||
- master
|
||||
- develop
|
||||
- travis
|
||||
|
||||
go:
|
||||
- 1.8
|
||||
- tip
|
||||
|
||||
matrix:
|
||||
allow_failures:
|
||||
- go: tip
|
||||
|
||||
before_install:
|
||||
- if [ -n "$GH_USER" ]; then git config --global github.user ${GH_USER}; fi;
|
||||
- if [ -n "$GH_TOKEN" ]; then git config --global github.token ${GH_TOKEN}; fi;
|
||||
- go get github.com/mattn/goveralls
|
||||
|
||||
before_script:
|
||||
- make deps
|
||||
|
||||
script:
|
||||
- make qa
|
||||
|
||||
after_failure:
|
||||
- cat ./target/test/report.xml
|
||||
|
||||
after_success:
|
||||
- if [ "$TRAVIS_GO_VERSION" = "1.8" ]; then $HOME/gopath/bin/goveralls -covermode=count -coverprofile=target/report/coverage.out -service=travis-ci; fi;
|
|
@ -0,0 +1,24 @@
|
|||
Copyright (c) 2014 Will Fitzgerald. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,197 @@
|
|||
# MAKEFILE
|
||||
#
|
||||
# @author Nicola Asuni <info@tecnick.com>
|
||||
# @link https://github.com/bits-and-blooms/bloom
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
# List special make targets that are not associated with files
|
||||
.PHONY: help all test format fmtcheck vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan qa deps clean nuke
|
||||
|
||||
# Use bash as shell (Note: Ubuntu now uses dash which doesn't support PIPESTATUS).
|
||||
SHELL=/bin/bash
|
||||
|
||||
# CVS path (path to the parent dir containing the project)
|
||||
CVSPATH=github.com/bits-and-blooms
|
||||
|
||||
# Project owner
|
||||
OWNER=bits-and-blooms
|
||||
|
||||
# Project vendor
|
||||
VENDOR=bits-and-blooms
|
||||
|
||||
# Project name
|
||||
PROJECT=bloom
|
||||
|
||||
# Project version
|
||||
VERSION=$(shell cat VERSION)
|
||||
|
||||
# Name of RPM or DEB package
|
||||
PKGNAME=${VENDOR}-${PROJECT}
|
||||
|
||||
# Current directory
|
||||
CURRENTDIR=$(shell pwd)
|
||||
|
||||
# GO lang path
|
||||
ifneq ($(GOPATH),)
|
||||
ifeq ($(findstring $(GOPATH),$(CURRENTDIR)),)
|
||||
# the defined GOPATH is not valid
|
||||
GOPATH=
|
||||
endif
|
||||
endif
|
||||
ifeq ($(GOPATH),)
|
||||
# extract the GOPATH
|
||||
GOPATH=$(firstword $(subst /src/, ,$(CURRENTDIR)))
|
||||
endif
|
||||
|
||||
# --- MAKE TARGETS ---
|
||||
|
||||
# Display general help about this command
|
||||
help:
|
||||
@echo ""
|
||||
@echo "$(PROJECT) Makefile."
|
||||
@echo "GOPATH=$(GOPATH)"
|
||||
@echo "The following commands are available:"
|
||||
@echo ""
|
||||
@echo " make qa : Run all the tests"
|
||||
@echo " make test : Run the unit tests"
|
||||
@echo ""
|
||||
@echo " make format : Format the source code"
|
||||
@echo " make fmtcheck : Check if the source code has been formatted"
|
||||
@echo " make vet : Check for suspicious constructs"
|
||||
@echo " make lint : Check for style errors"
|
||||
@echo " make coverage : Generate the coverage report"
|
||||
@echo " make cyclo : Generate the cyclomatic complexity report"
|
||||
@echo " make ineffassign : Detect ineffectual assignments"
|
||||
@echo " make misspell : Detect commonly misspelled words in source files"
|
||||
@echo " make structcheck : Find unused struct fields"
|
||||
@echo " make varcheck : Find unused global variables and constants"
|
||||
@echo " make errcheck : Check that error return values are used"
|
||||
@echo " make gosimple : Suggest code simplifications"
|
||||
@echo " make astscan : GO AST scanner"
|
||||
@echo ""
|
||||
@echo " make docs : Generate source code documentation"
|
||||
@echo ""
|
||||
@echo " make deps : Get the dependencies"
|
||||
@echo " make clean : Remove any build artifact"
|
||||
@echo " make nuke : Deletes any intermediate file"
|
||||
@echo ""
|
||||
|
||||
# Alias for help target
|
||||
all: help
|
||||
|
||||
# Run the unit tests
|
||||
test:
|
||||
@mkdir -p target/test
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) \
|
||||
go test \
|
||||
-covermode=atomic \
|
||||
-bench=. \
|
||||
-race \
|
||||
-cpuprofile=target/report/cpu.out \
|
||||
-memprofile=target/report/mem.out \
|
||||
-mutexprofile=target/report/mutex.out \
|
||||
-coverprofile=target/report/coverage.out \
|
||||
-v ./... | \
|
||||
tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); \
|
||||
test $${PIPESTATUS[0]} -eq 0
|
||||
|
||||
# Format the source code
|
||||
format:
|
||||
@find . -type f -name "*.go" -exec gofmt -s -w {} \;
|
||||
|
||||
# Check if the source code has been formatted
|
||||
fmtcheck:
|
||||
@mkdir -p target
|
||||
@find . -type f -name "*.go" -exec gofmt -s -d {} \; | tee target/format.diff
|
||||
@test ! -s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; }
|
||||
|
||||
# Check for syntax errors
|
||||
vet:
|
||||
GOPATH=$(GOPATH) go vet .
|
||||
|
||||
# Check for style errors
|
||||
lint:
|
||||
GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint .
|
||||
|
||||
# Generate the coverage report
|
||||
coverage:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) \
|
||||
go tool cover -html=target/report/coverage.out -o target/report/coverage.html
|
||||
|
||||
# Report cyclomatic complexity
|
||||
cyclo:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) gocyclo -avg ./ | tee target/report/cyclo.txt ; test $${PIPESTATUS[0]} -eq 0
|
||||
|
||||
# Detect ineffectual assignments
|
||||
ineffassign:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) ineffassign ./ | tee target/report/ineffassign.txt ; test $${PIPESTATUS[0]} -eq 0
|
||||
|
||||
# Detect commonly misspelled words in source files
|
||||
misspell:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) misspell -error ./ | tee target/report/misspell.txt ; test $${PIPESTATUS[0]} -eq 0
|
||||
|
||||
# Find unused struct fields
|
||||
structcheck:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) structcheck -a ./ | tee target/report/structcheck.txt
|
||||
|
||||
# Find unused global variables and constants
|
||||
varcheck:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) varcheck -e ./ | tee target/report/varcheck.txt
|
||||
|
||||
# Check that error return values are used
|
||||
errcheck:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) errcheck ./ | tee target/report/errcheck.txt
|
||||
|
||||
# Suggest code simplifications
|
||||
gosimple:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) gosimple ./ | tee target/report/gosimple.txt
|
||||
|
||||
# AST scanner
|
||||
astscan:
|
||||
@mkdir -p target/report
|
||||
GOPATH=$(GOPATH) gas .//*.go | tee target/report/astscan.txt ; test $${PIPESTATUS[0]} -eq 0
|
||||
|
||||
# Generate source docs
|
||||
docs:
|
||||
@mkdir -p target/docs
|
||||
nohup sh -c 'GOPATH=$(GOPATH) godoc -http=127.0.0.1:6060' > target/godoc_server.log 2>&1 &
|
||||
wget --directory-prefix=target/docs/ --execute robots=off --retry-connrefused --recursive --no-parent --adjust-extension --page-requisites --convert-links http://127.0.0.1:6060/pkg/github.com/${VENDOR}/${PROJECT}/ ; kill -9 `lsof -ti :6060`
|
||||
@echo '<html><head><meta http-equiv="refresh" content="0;./127.0.0.1:6060/pkg/'${CVSPATH}'/'${PROJECT}'/index.html"/></head><a href="./127.0.0.1:6060/pkg/'${CVSPATH}'/'${PROJECT}'/index.html">'${PKGNAME}' Documentation ...</a></html>' > target/docs/index.html
|
||||
|
||||
# Alias to run all quality-assurance checks
|
||||
qa: fmtcheck test vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan
|
||||
|
||||
# --- INSTALL ---
|
||||
|
||||
# Get the dependencies
|
||||
deps:
|
||||
GOPATH=$(GOPATH) go get ./...
|
||||
GOPATH=$(GOPATH) go get github.com/golang/lint/golint
|
||||
GOPATH=$(GOPATH) go get github.com/jstemmer/go-junit-report
|
||||
GOPATH=$(GOPATH) go get github.com/axw/gocov/gocov
|
||||
GOPATH=$(GOPATH) go get github.com/fzipp/gocyclo
|
||||
GOPATH=$(GOPATH) go get github.com/gordonklaus/ineffassign
|
||||
GOPATH=$(GOPATH) go get github.com/client9/misspell/cmd/misspell
|
||||
GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/structcheck
|
||||
GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/varcheck
|
||||
GOPATH=$(GOPATH) go get github.com/kisielk/errcheck
|
||||
GOPATH=$(GOPATH) go get honnef.co/go/tools/cmd/gosimple
|
||||
GOPATH=$(GOPATH) go get github.com/securego/gosec
|
||||
|
||||
# Remove any build artifact
|
||||
clean:
|
||||
GOPATH=$(GOPATH) go clean ./...
|
||||
|
||||
# Deletes any intermediate file
|
||||
nuke:
|
||||
rm -rf ./target
|
||||
GOPATH=$(GOPATH) go clean -i ./...
|
|
@ -0,0 +1,153 @@
|
|||
Bloom filters
|
||||
-------------
|
||||
[![Test](https://github.com/bits-and-blooms/bloom/actions/workflows/test.yml/badge.svg)](https://github.com/bits-and-blooms/bloom/actions/workflows/test.yml)
|
||||
[![Go Report Card](https://goreportcard.com/badge/github.com/bits-and-blooms/bloom)](https://goreportcard.com/report/github.com/bits-and-blooms/bloom)
|
||||
[![Go Reference](https://pkg.go.dev/badge/github.com/bits-and-blooms/bloom.svg)](https://pkg.go.dev/github.com/bits-and-blooms/bloom/v3)
|
||||
|
||||
This library is used by popular systems such as [Milvus](https://github.com/milvus-io/milvus) and [beego](https://github.com/beego/Beego).
|
||||
|
||||
A Bloom filter is a concise/compressed representation of a set, where the main
|
||||
requirement is to make membership queries; _i.e._, whether an item is a
|
||||
member of a set. A Bloom filter will always correctly report the presence
|
||||
of an element in the set when the element is indeed present. A Bloom filter
|
||||
can use much less storage than the original set, but it allows for some 'false positives':
|
||||
it may sometimes report that an element is in the set whereas it is not.
|
||||
|
||||
When you construct a Bloom filter, you need to know how many elements you have (the desired capacity) and what false-positive rate you are willing to tolerate. A common false-positive rate is 1%. The
|
||||
lower the false-positive rate, the more memory you are going to require. Similarly, the higher the
|
||||
capacity, the more memory you will use.
|
||||
You may construct the Bloom filter capable of receiving 1 million elements with a false-positive
|
||||
rate of 1% in the following manner.
|
||||
|
||||
```Go
|
||||
filter := bloom.NewWithEstimates(1000000, 0.01)
|
||||
```
|
||||
|
||||
You should call `NewWithEstimates` conservatively: if you specify a number of elements that is
|
||||
too small, the false-positive bound might be exceeded. A Bloom filter is not a dynamic data structure:
|
||||
you must know ahead of time what your desired capacity is.
|
||||
|
||||
Our implementation accepts keys for setting and testing as `[]byte`. Thus, to
|
||||
add a string item, `"Love"`:
|
||||
|
||||
```Go
|
||||
filter.Add([]byte("Love"))
|
||||
```
|
||||
|
||||
Similarly, to test if `"Love"` is in bloom:
|
||||
|
||||
```Go
|
||||
if filter.Test([]byte("Love"))
|
||||
```
|
||||
|
||||
For numerical data, we recommend that you look into the encoding/binary library. But, for example, to add a `uint32` to the filter:
|
||||
|
||||
```Go
|
||||
i := uint32(100)
|
||||
n1 := make([]byte, 4)
|
||||
binary.BigEndian.PutUint32(n1, i)
|
||||
filter.Add(n1)
|
||||
```
|
||||
|
||||
Godoc documentation: https://pkg.go.dev/github.com/bits-and-blooms/bloom/v3
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
go get -u github.com/bits-and-blooms/bloom/v3
|
||||
```
|
||||
|
||||
## Verifying the False Positive Rate
|
||||
|
||||
|
||||
Sometimes, the actual false positive rate may differ (slightly) from the
|
||||
theoretical false positive rate. We have a function to estimate the false positive rate of a
|
||||
Bloom filter with _m_ bits and _k_ hashing functions for a set of size _n_:
|
||||
|
||||
```Go
|
||||
if bloom.EstimateFalsePositiveRate(20*n, 5, n) > 0.001 ...
|
||||
```
|
||||
|
||||
You can use it to validate the computed m, k parameters:
|
||||
|
||||
```Go
|
||||
m, k := bloom.EstimateParameters(n, fp)
|
||||
ActualfpRate := bloom.EstimateFalsePositiveRate(m, k, n)
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```Go
|
||||
f := bloom.NewWithEstimates(n, fp)
|
||||
ActualfpRate := bloom.EstimateFalsePositiveRate(f.Cap(), f.K(), n)
|
||||
```
|
||||
|
||||
You would expect `ActualfpRate` to be close to the desired false-positive rate `fp` in these cases.
|
||||
|
||||
The `EstimateFalsePositiveRate` function creates a temporary Bloom filter. It is
|
||||
also relatively expensive and only meant for validation.
|
||||
|
||||
## Serialization
|
||||
|
||||
You can read and write the Bloom filters as follows:
|
||||
|
||||
|
||||
```Go
|
||||
f := New(1000, 4)
|
||||
var buf bytes.Buffer
|
||||
bytesWritten, err := f.WriteTo(&buf)
|
||||
if err != nil {
|
||||
t.Fatal(err.Error())
|
||||
}
|
||||
var g BloomFilter
|
||||
bytesRead, err := g.ReadFrom(&buf)
|
||||
if err != nil {
|
||||
t.Fatal(err.Error())
|
||||
}
|
||||
if bytesRead != bytesWritten {
|
||||
t.Errorf("read unexpected number of bytes %d != %d", bytesRead, bytesWritten)
|
||||
}
|
||||
```
|
||||
|
||||
*Performance tip*:
|
||||
When reading and writing to a file or a network connection, you may get better performance by
|
||||
wrapping your streams with `bufio` instances.
|
||||
|
||||
E.g.,
|
||||
```Go
|
||||
f, err := os.Create("myfile")
|
||||
w := bufio.NewWriter(f)
|
||||
```
|
||||
```Go
|
||||
f, err := os.Open("myfile")
|
||||
r := bufio.NewReader(f)
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
If you wish to contribute to this project, please branch and issue a pull request against master ("[GitHub Flow](https://guides.github.com/introduction/flow/)")
|
||||
|
||||
This project includes a Makefile that allows you to test and build the project with simple commands.
|
||||
To see all available options:
|
||||
```bash
|
||||
make help
|
||||
```
|
||||
|
||||
## Running all tests
|
||||
|
||||
Before committing the code, please check if it passes all tests using (note: this will install some dependencies):
|
||||
```bash
|
||||
make deps
|
||||
make qa
|
||||
```
|
||||
|
||||
## Design
|
||||
|
||||
A Bloom filter has two parameters: _m_, the number of bits used in storage, and _k_, the number of hashing functions on elements of the set. (The actual hashing functions are important, too, but this is not a parameter for this implementation). A Bloom filter is backed by a [BitSet](https://github.com/bits-and-blooms/bitset); a key is represented in the filter by setting the bits at each value of the hashing functions (modulo _m_). Set membership is done by _testing_ whether the bits at each value of the hashing functions (again, modulo _m_) are set. If so, the item is in the set. If the item is actually in the set, a Bloom filter will never fail (the true positive rate is 1.0); but it is susceptible to false positives. The art is to choose _k_ and _m_ correctly.
|
||||
|
||||
In this implementation, the hashing function used is [murmurhash](https://github.com/twmb/murmur3), a non-cryptographic hashing function.
|
||||
|
||||
|
||||
Given the particular hashing scheme, it's best to be empirical about this. Note
|
||||
that estimating the FP rate will clear the Bloom filter.
|
|
@ -0,0 +1,5 @@
|
|||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
You can privately report a vulnerability by email at daniel@lemire.me (current maintainer).
|
|
@ -0,0 +1,453 @@
|
|||
/*
|
||||
Package bloom provides data structures and methods for creating Bloom filters.
|
||||
|
||||
A Bloom filter is a representation of a set of _n_ items, where the main
|
||||
requirement is to make membership queries; _i.e._, whether an item is a
|
||||
member of a set.
|
||||
|
||||
A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
|
||||
multiple of the cardinality of the set to represent) and _k_, the number of hashing
|
||||
functions on elements of the set. (The actual hashing functions are important, too,
|
||||
but this is not a parameter for this implementation). A Bloom filter is backed by
|
||||
a BitSet; a key is represented in the filter by setting the bits at each value of the
|
||||
hashing functions (modulo _m_). Set membership is done by _testing_ whether the
|
||||
bits at each value of the hashing functions (again, modulo _m_) are set. If so,
|
||||
the item is in the set. If the item is actually in the set, a Bloom filter will
|
||||
never fail (the true positive rate is 1.0); but it is susceptible to false
|
||||
positives. The art is to choose _k_ and _m_ correctly.
|
||||
|
||||
In this implementation, the hashing function used is murmurhash,
|
||||
a non-cryptographic hashing function.
|
||||
|
||||
This implementation accepts keys for setting and testing as []byte. Thus, to
|
||||
add a string item, "Love":
|
||||
|
||||
n := uint(1000)
|
||||
filter := bloom.New(20*n, 5) // load of 20, 5 keys
|
||||
filter.Add([]byte("Love"))
|
||||
|
||||
Similarly, to test if "Love" is in bloom:
|
||||
|
||||
if filter.Test([]byte("Love"))
|
||||
|
||||
For numeric data, I recommend that you look into the encoding/binary package. But,
|
||||
for example, to add a uint32 to the filter:
|
||||
|
||||
i := uint32(100)
|
||||
n1 := make([]byte,4)
|
||||
binary.BigEndian.PutUint32(n1,i)
|
||||
filter.Add(n1)
|
||||
|
||||
Finally, there is a method to estimate the false positive rate of a
|
||||
Bloom filter with _m_ bits and _k_ hashing functions for a set of size _n_:
|
||||
|
||||
if bloom.EstimateFalsePositiveRate(20*n, 5, n) > 0.001 ...
|
||||
|
||||
You can use it to validate the computed m, k parameters:
|
||||
|
||||
m, k := bloom.EstimateParameters(n, fp)
|
||||
ActualfpRate := bloom.EstimateFalsePositiveRate(m, k, n)
|
||||
|
||||
or
|
||||
|
||||
f := bloom.NewWithEstimates(n, fp)
|
||||
ActualfpRate := bloom.EstimateFalsePositiveRate(f.Cap(), f.K(), n)
|
||||
|
||||
You would expect ActualfpRate to be close to the desired fp in these cases.
|
||||
|
||||
The EstimateFalsePositiveRate function creates a temporary Bloom filter. It is
|
||||
also relatively expensive and only meant for validation.
|
||||
*/
|
||||
package bloom
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
|
||||
"github.com/bits-and-blooms/bitset"
|
||||
)
|
||||
|
||||
// A BloomFilter is a representation of a set of _n_ items, where the main
|
||||
// requirement is to make membership queries; _i.e._, whether an item is a
|
||||
// member of a set.
|
||||
type BloomFilter struct {
|
||||
m uint
|
||||
k uint
|
||||
b *bitset.BitSet
|
||||
}
|
||||
|
||||
// max returns the larger of x and y.
func max(x, y uint) uint {
	if y >= x {
		return y
	}
	return x
}
|
||||
|
||||
// New creates a new Bloom filter with _m_ bits and _k_ hashing functions
|
||||
// We force _m_ and _k_ to be at least one to avoid panics.
|
||||
func New(m uint, k uint) *BloomFilter {
|
||||
return &BloomFilter{max(1, m), max(1, k), bitset.New(m)}
|
||||
}
|
||||
|
||||
// From creates a new Bloom filter with len(_data_) * 64 bits and _k_ hashing
|
||||
// functions. The data slice is not going to be reset.
|
||||
func From(data []uint64, k uint) *BloomFilter {
|
||||
m := uint(len(data) * 64)
|
||||
return FromWithM(data, m, k)
|
||||
}
|
||||
|
||||
// FromWithM creates a new Bloom filter with _m_ length, _k_ hashing functions.
|
||||
// The data slice is not going to be reset.
|
||||
func FromWithM(data []uint64, m, k uint) *BloomFilter {
|
||||
return &BloomFilter{m, k, bitset.From(data)}
|
||||
}
|
||||
|
||||
// baseHashes returns the four hash values of data that are used to create k
|
||||
// hashes
|
||||
func baseHashes(data []byte) [4]uint64 {
|
||||
var d digest128 // murmur hashing
|
||||
hash1, hash2, hash3, hash4 := d.sum256(data)
|
||||
return [4]uint64{
|
||||
hash1, hash2, hash3, hash4,
|
||||
}
|
||||
}
|
||||
|
||||
// location returns the ith hashed location using the four base hash values.
// The result is not yet reduced modulo the filter size.
func location(h [4]uint64, i uint) uint64 {
	n := uint64(i)
	parity := n % 2
	// Even i starts from h[0] and odd i from h[1]; the multiplier is picked
	// from h[2] or h[3] depending on i.
	multiplier := h[2+((n+parity)%4)/2]
	return h[parity] + n*multiplier
}
|
||||
|
||||
// location returns the ith hashed location using the four base hash values
|
||||
func (f *BloomFilter) location(h [4]uint64, i uint) uint {
|
||||
return uint(location(h, i) % uint64(f.m))
|
||||
}
|
||||
|
||||
// EstimateParameters estimates the number of bits m and the number of hash
// functions k for n items and a false-positive rate p.
// Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
// used with permission.
func EstimateParameters(n uint, p float64) (m uint, k uint) {
	ln2 := math.Log(2)
	items := float64(n)
	// m = ceil(-n * ln(p) / ln(2)^2)
	m = uint(math.Ceil(-1 * items * math.Log(p) / math.Pow(ln2, 2)))
	// k = ceil(ln(2) * m / n)
	k = uint(math.Ceil(ln2 * float64(m) / items))
	return m, k
}
|
||||
|
||||
// NewWithEstimates creates a new Bloom filter for about n items with fp
|
||||
// false positive rate
|
||||
func NewWithEstimates(n uint, fp float64) *BloomFilter {
|
||||
m, k := EstimateParameters(n, fp)
|
||||
return New(m, k)
|
||||
}
|
||||
|
||||
// Cap returns the capacity, _m_, of a Bloom filter
|
||||
func (f *BloomFilter) Cap() uint {
|
||||
return f.m
|
||||
}
|
||||
|
||||
// K returns the number of hash functions used in the BloomFilter
|
||||
func (f *BloomFilter) K() uint {
|
||||
return f.k
|
||||
}
|
||||
|
||||
// BitSet returns the underlying bitset for this filter.
|
||||
func (f *BloomFilter) BitSet() *bitset.BitSet {
|
||||
return f.b
|
||||
}
|
||||
|
||||
// Add data to the Bloom Filter. Returns the filter (allows chaining)
|
||||
func (f *BloomFilter) Add(data []byte) *BloomFilter {
|
||||
h := baseHashes(data)
|
||||
for i := uint(0); i < f.k; i++ {
|
||||
f.b.Set(f.location(h, i))
|
||||
}
|
||||
return f
|
||||
}
|
||||
|
||||
// Merge the data from two Bloom Filters.
|
||||
func (f *BloomFilter) Merge(g *BloomFilter) error {
|
||||
// Make sure the m's and k's are the same, otherwise merging has no real use.
|
||||
if f.m != g.m {
|
||||
return fmt.Errorf("m's don't match: %d != %d", f.m, g.m)
|
||||
}
|
||||
|
||||
if f.k != g.k {
|
||||
return fmt.Errorf("k's don't match: %d != %d", f.m, g.m)
|
||||
}
|
||||
|
||||
f.b.InPlaceUnion(g.b)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Copy creates a copy of a Bloom filter.
|
||||
func (f *BloomFilter) Copy() *BloomFilter {
|
||||
fc := New(f.m, f.k)
|
||||
fc.Merge(f) // #nosec
|
||||
return fc
|
||||
}
|
||||
|
||||
// AddString to the Bloom Filter. Returns the filter (allows chaining)
|
||||
func (f *BloomFilter) AddString(data string) *BloomFilter {
|
||||
return f.Add([]byte(data))
|
||||
}
|
||||
|
||||
// Test returns true if the data is in the BloomFilter, false otherwise.
|
||||
// If true, the result might be a false positive. If false, the data
|
||||
// is definitely not in the set.
|
||||
func (f *BloomFilter) Test(data []byte) bool {
|
||||
h := baseHashes(data)
|
||||
for i := uint(0); i < f.k; i++ {
|
||||
if !f.b.Test(f.location(h, i)) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// TestString returns true if the string is in the BloomFilter, false otherwise.
|
||||
// If true, the result might be a false positive. If false, the data
|
||||
// is definitely not in the set.
|
||||
func (f *BloomFilter) TestString(data string) bool {
|
||||
return f.Test([]byte(data))
|
||||
}
|
||||
|
||||
// TestLocations returns true if all locations are set in the BloomFilter, false
|
||||
// otherwise.
|
||||
func (f *BloomFilter) TestLocations(locs []uint64) bool {
|
||||
for i := 0; i < len(locs); i++ {
|
||||
if !f.b.Test(uint(locs[i] % uint64(f.m))) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// TestAndAdd is equivalent to calling Test(data) then Add(data).
|
||||
// The filter is written to unconditionnally: even if the element is present,
|
||||
// the corresponding bits are still set. See also TestOrAdd.
|
||||
// Returns the result of Test.
|
||||
func (f *BloomFilter) TestAndAdd(data []byte) bool {
|
||||
present := true
|
||||
h := baseHashes(data)
|
||||
for i := uint(0); i < f.k; i++ {
|
||||
l := f.location(h, i)
|
||||
if !f.b.Test(l) {
|
||||
present = false
|
||||
}
|
||||
f.b.Set(l)
|
||||
}
|
||||
return present
|
||||
}
|
||||
|
||||
// TestAndAddString is the equivalent to calling Test(string) then Add(string).
|
||||
// The filter is written to unconditionnally: even if the string is present,
|
||||
// the corresponding bits are still set. See also TestOrAdd.
|
||||
// Returns the result of Test.
|
||||
func (f *BloomFilter) TestAndAddString(data string) bool {
|
||||
return f.TestAndAdd([]byte(data))
|
||||
}
|
||||
|
||||
// TestOrAdd is equivalent to calling Test(data) then if not present Add(data).
|
||||
// If the element is already in the filter, then the filter is unchanged.
|
||||
// Returns the result of Test.
|
||||
func (f *BloomFilter) TestOrAdd(data []byte) bool {
|
||||
present := true
|
||||
h := baseHashes(data)
|
||||
for i := uint(0); i < f.k; i++ {
|
||||
l := f.location(h, i)
|
||||
if !f.b.Test(l) {
|
||||
present = false
|
||||
f.b.Set(l)
|
||||
}
|
||||
}
|
||||
return present
|
||||
}
|
||||
|
||||
// TestOrAddString is the equivalent to calling Test(string) then if not present Add(string).
|
||||
// If the string is already in the filter, then the filter is unchanged.
|
||||
// Returns the result of Test.
|
||||
func (f *BloomFilter) TestOrAddString(data string) bool {
|
||||
return f.TestOrAdd([]byte(data))
|
||||
}
|
||||
|
||||
// ClearAll clears all the data in a Bloom filter, removing all keys
|
||||
func (f *BloomFilter) ClearAll() *BloomFilter {
|
||||
f.b.ClearAll()
|
||||
return f
|
||||
}
|
||||
|
||||
// EstimateFalsePositiveRate returns, for a BloomFilter of m bits
|
||||
// and k hash functions, an estimation of the false positive rate when
|
||||
//
|
||||
// storing n entries. This is an empirical, relatively slow
|
||||
//
|
||||
// test using integers as keys.
|
||||
// This function is useful to validate the implementation.
|
||||
func EstimateFalsePositiveRate(m, k, n uint) (fpRate float64) {
|
||||
rounds := uint32(100000)
|
||||
// We construct a new filter.
|
||||
f := New(m, k)
|
||||
n1 := make([]byte, 4)
|
||||
// We populate the filter with n values.
|
||||
for i := uint32(0); i < uint32(n); i++ {
|
||||
binary.BigEndian.PutUint32(n1, i)
|
||||
f.Add(n1)
|
||||
}
|
||||
fp := 0
|
||||
// test for number of rounds
|
||||
for i := uint32(0); i < rounds; i++ {
|
||||
binary.BigEndian.PutUint32(n1, i+uint32(n)+1)
|
||||
if f.Test(n1) {
|
||||
fp++
|
||||
}
|
||||
}
|
||||
fpRate = float64(fp) / (float64(rounds))
|
||||
return
|
||||
}
|
||||
|
||||
// Approximating the number of items
|
||||
// https://en.wikipedia.org/wiki/Bloom_filter#Approximating_the_number_of_items_in_a_Bloom_filter
|
||||
func (f *BloomFilter) ApproximatedSize() uint32 {
|
||||
x := float64(f.b.Count())
|
||||
m := float64(f.Cap())
|
||||
k := float64(f.K())
|
||||
size := -1 * m / k * math.Log(1-x/m) / math.Log(math.E)
|
||||
return uint32(math.Floor(size + 0.5)) // round
|
||||
}
|
||||
|
||||
// bloomFilterJSON is an unexported type for marshaling/unmarshaling BloomFilter struct.
|
||||
type bloomFilterJSON struct {
|
||||
M uint `json:"m"`
|
||||
K uint `json:"k"`
|
||||
B *bitset.BitSet `json:"b"`
|
||||
}
|
||||
|
||||
// MarshalJSON implements json.Marshaler interface.
|
||||
func (f BloomFilter) MarshalJSON() ([]byte, error) {
|
||||
return json.Marshal(bloomFilterJSON{f.m, f.k, f.b})
|
||||
}
|
||||
|
||||
// UnmarshalJSON implements json.Unmarshaler interface.
|
||||
func (f *BloomFilter) UnmarshalJSON(data []byte) error {
|
||||
var j bloomFilterJSON
|
||||
err := json.Unmarshal(data, &j)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f.m = j.M
|
||||
f.k = j.K
|
||||
f.b = j.B
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriteTo writes a binary representation of the BloomFilter to an i/o stream.
|
||||
// It returns the number of bytes written.
|
||||
//
|
||||
// Performance: if this function is used to write to a disk or network
|
||||
// connection, it might be beneficial to wrap the stream in a bufio.Writer.
|
||||
// E.g.,
|
||||
//
|
||||
// f, err := os.Create("myfile")
|
||||
// w := bufio.NewWriter(f)
|
||||
func (f *BloomFilter) WriteTo(stream io.Writer) (int64, error) {
|
||||
err := binary.Write(stream, binary.BigEndian, uint64(f.m))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
err = binary.Write(stream, binary.BigEndian, uint64(f.k))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
numBytes, err := f.b.WriteTo(stream)
|
||||
return numBytes + int64(2*binary.Size(uint64(0))), err
|
||||
}
|
||||
|
||||
// ReadFrom reads a binary representation of the BloomFilter (such as might
|
||||
// have been written by WriteTo()) from an i/o stream. It returns the number
|
||||
// of bytes read.
|
||||
//
|
||||
// Performance: if this function is used to read from a disk or network
|
||||
// connection, it might be beneficial to wrap the stream in a bufio.Reader.
|
||||
// E.g.,
|
||||
//
|
||||
// f, err := os.Open("myfile")
|
||||
// r := bufio.NewReader(f)
|
||||
func (f *BloomFilter) ReadFrom(stream io.Reader) (int64, error) {
|
||||
var m, k uint64
|
||||
err := binary.Read(stream, binary.BigEndian, &m)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
err = binary.Read(stream, binary.BigEndian, &k)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
b := &bitset.BitSet{}
|
||||
numBytes, err := b.ReadFrom(stream)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
f.m = uint(m)
|
||||
f.k = uint(k)
|
||||
f.b = b
|
||||
return numBytes + int64(2*binary.Size(uint64(0))), nil
|
||||
}
|
||||
|
||||
// GobEncode implements gob.GobEncoder interface.
|
||||
func (f *BloomFilter) GobEncode() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
_, err := f.WriteTo(&buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
// GobDecode implements gob.GobDecoder interface.
|
||||
func (f *BloomFilter) GobDecode(data []byte) error {
|
||||
buf := bytes.NewBuffer(data)
|
||||
_, err := f.ReadFrom(buf)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// MarshalBinary implements binary.BinaryMarshaler interface.
|
||||
func (f *BloomFilter) MarshalBinary() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
_, err := f.WriteTo(&buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
// UnmarshalBinary implements binary.BinaryUnmarshaler interface.
|
||||
func (f *BloomFilter) UnmarshalBinary(data []byte) error {
|
||||
buf := bytes.NewBuffer(data)
|
||||
_, err := f.ReadFrom(buf)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// Equal tests for the equality of two Bloom filters
|
||||
func (f *BloomFilter) Equal(g *BloomFilter) bool {
|
||||
return f.m == g.m && f.k == g.k && f.b.Equal(g.b)
|
||||
}
|
||||
|
||||
// Locations returns a list of hash locations representing a data item.
|
||||
func Locations(data []byte, k uint) []uint64 {
|
||||
locs := make([]uint64, k)
|
||||
|
||||
// calculate locations
|
||||
h := baseHashes(data)
|
||||
for i := uint(0); i < k; i++ {
|
||||
locs[i] = location(h, i)
|
||||
}
|
||||
|
||||
return locs
|
||||
}
|
|
@ -0,0 +1,289 @@
|
|||
/*
|
||||
The bloom library relied on the excellent murmur library
|
||||
by Sébastien Paolacci. Unfortunately, it involved some heap
|
||||
allocation. We want to avoid any heap allocation whatsoever
|
||||
in the hashing process. To preserve backward compatibility, we roll
|
||||
our own hashing functions. They are designed to be strictly equivalent
|
||||
to Paolacci's implementation.
|
||||
|
||||
License on original code:
|
||||
|
||||
|
||||
Copyright 2013, Sébastien Paolacci.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the library nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package bloom
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"math/bits"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const (
	// block_size is the number of bytes consumed per full mixing block.
	block_size = 16

	// c1_128 and c2_128 are the two 64-bit multipliers used when mixing
	// input words into the running 128-bit hash state.
	c1_128 = 0x87c37b91114253d5
	c2_128 = 0x4cf5ad432745937f
)
|
||||
|
||||
// digest128 represents a partial evaluation of a 128-bit hash: the two
// running 64-bit halves before finalization.
type digest128 struct {
	// h1 and h2 are the unfinalized running hash, parts 1 and 2.
	h1, h2 uint64
}
|
||||
|
||||
// bmix will hash blocks (16 bytes)
|
||||
func (d *digest128) bmix(p []byte) {
|
||||
nblocks := len(p) / block_size
|
||||
for i := 0; i < nblocks; i++ {
|
||||
b := (*[16]byte)(unsafe.Pointer(&p[i*block_size]))
|
||||
k1, k2 := binary.LittleEndian.Uint64(b[:8]), binary.LittleEndian.Uint64(b[8:])
|
||||
d.bmix_words(k1, k2)
|
||||
}
|
||||
}
|
||||
|
||||
// bmix_words will hash two 64-bit words (16 bytes)
|
||||
func (d *digest128) bmix_words(k1, k2 uint64) {
|
||||
h1, h2 := d.h1, d.h2
|
||||
|
||||
k1 *= c1_128
|
||||
k1 = bits.RotateLeft64(k1, 31)
|
||||
k1 *= c2_128
|
||||
h1 ^= k1
|
||||
|
||||
h1 = bits.RotateLeft64(h1, 27)
|
||||
h1 += h2
|
||||
h1 = h1*5 + 0x52dce729
|
||||
|
||||
k2 *= c2_128
|
||||
k2 = bits.RotateLeft64(k2, 33)
|
||||
k2 *= c1_128
|
||||
h2 ^= k2
|
||||
|
||||
h2 = bits.RotateLeft64(h2, 31)
|
||||
h2 += h1
|
||||
h2 = h2*5 + 0x38495ab5
|
||||
d.h1, d.h2 = h1, h2
|
||||
}
|
||||
|
||||
// sum128 computers two 64-bit hash value. It is assumed that
|
||||
// bmix was first called on the data to process complete blocks
|
||||
// of 16 bytes. The 'tail' is a slice representing the 'tail' (leftover
|
||||
// elements, fewer than 16). If pad_tail is true, we make it seem like
|
||||
// there is an extra element with value 1 appended to the tail.
|
||||
// The length parameter represents the full length of the data (including
|
||||
// the blocks of 16 bytes, and, if pad_tail is true, an extra byte).
|
||||
func (d *digest128) sum128(pad_tail bool, length uint, tail []byte) (h1, h2 uint64) {
|
||||
h1, h2 = d.h1, d.h2
|
||||
|
||||
var k1, k2 uint64
|
||||
if pad_tail {
|
||||
switch (len(tail) + 1) & 15 {
|
||||
case 15:
|
||||
k2 ^= uint64(1) << 48
|
||||
break
|
||||
case 14:
|
||||
k2 ^= uint64(1) << 40
|
||||
break
|
||||
case 13:
|
||||
k2 ^= uint64(1) << 32
|
||||
break
|
||||
case 12:
|
||||
k2 ^= uint64(1) << 24
|
||||
break
|
||||
case 11:
|
||||
k2 ^= uint64(1) << 16
|
||||
break
|
||||
case 10:
|
||||
k2 ^= uint64(1) << 8
|
||||
break
|
||||
case 9:
|
||||
k2 ^= uint64(1) << 0
|
||||
|
||||
k2 *= c2_128
|
||||
k2 = bits.RotateLeft64(k2, 33)
|
||||
k2 *= c1_128
|
||||
h2 ^= k2
|
||||
|
||||
break
|
||||
|
||||
case 8:
|
||||
k1 ^= uint64(1) << 56
|
||||
break
|
||||
case 7:
|
||||
k1 ^= uint64(1) << 48
|
||||
break
|
||||
case 6:
|
||||
k1 ^= uint64(1) << 40
|
||||
break
|
||||
case 5:
|
||||
k1 ^= uint64(1) << 32
|
||||
break
|
||||
case 4:
|
||||
k1 ^= uint64(1) << 24
|
||||
break
|
||||
case 3:
|
||||
k1 ^= uint64(1) << 16
|
||||
break
|
||||
case 2:
|
||||
k1 ^= uint64(1) << 8
|
||||
break
|
||||
case 1:
|
||||
k1 ^= uint64(1) << 0
|
||||
k1 *= c1_128
|
||||
k1 = bits.RotateLeft64(k1, 31)
|
||||
k1 *= c2_128
|
||||
h1 ^= k1
|
||||
}
|
||||
|
||||
}
|
||||
switch len(tail) & 15 {
|
||||
case 15:
|
||||
k2 ^= uint64(tail[14]) << 48
|
||||
fallthrough
|
||||
case 14:
|
||||
k2 ^= uint64(tail[13]) << 40
|
||||
fallthrough
|
||||
case 13:
|
||||
k2 ^= uint64(tail[12]) << 32
|
||||
fallthrough
|
||||
case 12:
|
||||
k2 ^= uint64(tail[11]) << 24
|
||||
fallthrough
|
||||
case 11:
|
||||
k2 ^= uint64(tail[10]) << 16
|
||||
fallthrough
|
||||
case 10:
|
||||
k2 ^= uint64(tail[9]) << 8
|
||||
fallthrough
|
||||
case 9:
|
||||
k2 ^= uint64(tail[8]) << 0
|
||||
|
||||
k2 *= c2_128
|
||||
k2 = bits.RotateLeft64(k2, 33)
|
||||
k2 *= c1_128
|
||||
h2 ^= k2
|
||||
|
||||
fallthrough
|
||||
|
||||
case 8:
|
||||
k1 ^= uint64(tail[7]) << 56
|
||||
fallthrough
|
||||
case 7:
|
||||
k1 ^= uint64(tail[6]) << 48
|
||||
fallthrough
|
||||
case 6:
|
||||
k1 ^= uint64(tail[5]) << 40
|
||||
fallthrough
|
||||
case 5:
|
||||
k1 ^= uint64(tail[4]) << 32
|
||||
fallthrough
|
||||
case 4:
|
||||
k1 ^= uint64(tail[3]) << 24
|
||||
fallthrough
|
||||
case 3:
|
||||
k1 ^= uint64(tail[2]) << 16
|
||||
fallthrough
|
||||
case 2:
|
||||
k1 ^= uint64(tail[1]) << 8
|
||||
fallthrough
|
||||
case 1:
|
||||
k1 ^= uint64(tail[0]) << 0
|
||||
k1 *= c1_128
|
||||
k1 = bits.RotateLeft64(k1, 31)
|
||||
k1 *= c2_128
|
||||
h1 ^= k1
|
||||
}
|
||||
|
||||
h1 ^= uint64(length)
|
||||
h2 ^= uint64(length)
|
||||
|
||||
h1 += h2
|
||||
h2 += h1
|
||||
|
||||
h1 = fmix64(h1)
|
||||
h2 = fmix64(h2)
|
||||
|
||||
h1 += h2
|
||||
h2 += h1
|
||||
|
||||
return h1, h2
|
||||
}
|
||||
|
||||
// fmix64 is the final mixing step applied to each 64-bit half of the hash:
// three xor-shifts by 33 bits interleaved with two multiplications.
func fmix64(k uint64) uint64 {
	const (
		mul1 = 0xff51afd7ed558ccd
		mul2 = 0xc4ceb9fe1a85ec53
	)
	k = (k ^ (k >> 33)) * mul1
	k = (k ^ (k >> 33)) * mul2
	return k ^ (k >> 33)
}
|
||||
|
||||
// sum256 will compute 4 64-bit hash values from the input.
|
||||
// It is designed to never allocate memory on the heap. So it
|
||||
// works without any byte buffer whatsoever.
|
||||
// It is designed to be strictly equivalent to
|
||||
//
|
||||
// a1 := []byte{1}
|
||||
// hasher := murmur3.New128()
|
||||
// hasher.Write(data) // #nosec
|
||||
// v1, v2 := hasher.Sum128()
|
||||
// hasher.Write(a1) // #nosec
|
||||
// v3, v4 := hasher.Sum128()
|
||||
//
|
||||
// See TestHashRandom.
|
||||
func (d *digest128) sum256(data []byte) (hash1, hash2, hash3, hash4 uint64) {
|
||||
// We always start from zero.
|
||||
d.h1, d.h2 = 0, 0
|
||||
// Process as many bytes as possible.
|
||||
d.bmix(data)
|
||||
// We have enough to compute the first two 64-bit numbers
|
||||
length := uint(len(data))
|
||||
tail_length := length % block_size
|
||||
tail := data[length-tail_length:]
|
||||
hash1, hash2 = d.sum128(false, length, tail)
|
||||
// Next we want to 'virtually' append 1 to the input, but,
|
||||
// we do not want to append to an actual array!!!
|
||||
if tail_length+1 == block_size {
|
||||
// We are left with no tail!!!
|
||||
word1 := binary.LittleEndian.Uint64(tail[:8])
|
||||
word2 := uint64(binary.LittleEndian.Uint32(tail[8 : 8+4]))
|
||||
word2 = word2 | (uint64(tail[12]) << 32) | (uint64(tail[13]) << 40) | (uint64(tail[14]) << 48)
|
||||
// We append 1.
|
||||
word2 = word2 | (uint64(1) << 56)
|
||||
// We process the resulting 2 words.
|
||||
d.bmix_words(word1, word2)
|
||||
tail := data[length:] // empty slice, deliberate.
|
||||
hash3, hash4 = d.sum128(false, length+1, tail)
|
||||
} else {
|
||||
// We still have a tail (fewer than 15 bytes) but we
|
||||
// need to append '1' to it.
|
||||
hash3, hash4 = d.sum128(true, length+1, tail)
|
||||
}
|
||||
|
||||
return hash1, hash2, hash3, hash4
|
||||
}
|
|
@ -137,9 +137,12 @@ github.com/benbjohnson/immutable
|
|||
# github.com/beorn7/perks v1.0.1
|
||||
## explicit; go 1.11
|
||||
github.com/beorn7/perks/quantile
|
||||
# github.com/bits-and-blooms/bitset v1.2.0
|
||||
## explicit; go 1.14
|
||||
# github.com/bits-and-blooms/bitset v1.13.0
|
||||
## explicit; go 1.16
|
||||
github.com/bits-and-blooms/bitset
|
||||
# github.com/bits-and-blooms/bloom/v3 v3.7.0
|
||||
## explicit; go 1.16
|
||||
github.com/bits-and-blooms/bloom/v3
|
||||
# github.com/bradfitz/iter v0.0.0-20191230175014-e8f45d346db8
|
||||
## explicit; go 1.11
|
||||
github.com/bradfitz/iter
|
||||
|
|
Loading…
Reference in New Issue