in progress

This commit is contained in:
Ryan Oldenburg 2020-11-11 13:46:37 -06:00
parent aff70e3af6
commit ca4a0c80a4
10 changed files with 207 additions and 101 deletions

View File

@ -26,7 +26,7 @@ Benchmarks can be run comparing different deflate implementations. My benchmarki
`nim c -d:release -r .\tests\benchmark.nim`
### Compress
### Compress at default compression level
Each file is compressed 1000 times.
@ -78,6 +78,41 @@ To prevent Zippy from causing a crash or otherwise misbehaving on bad input data
import zippy
```
## **const** NoCompression
```nim
NoCompression = 0
```
## **const** BestSpeed
```nim
BestSpeed = 1
```
## **const** BestCompression
```nim
BestCompression = 9
```
## **const** DefaultCompression
```nim
DefaultCompression = -1
```
## **const** HuffmanOnly
```nim
HuffmanOnly = -2
```
## **type** CompressedDataFormat
Supported compressed data formats
@ -92,7 +127,8 @@ CompressedDataFormat = enum
Compresses src and returns the compressed data.
```nim
func compress(src: seq[uint8]; dataFormat = dfGzip): seq[uint8] {.raises: [ZippyError].}
func compress(src: seq[uint8]; level = DefaultCompression; dataFormat = dfGzip): seq[
uint8] {.raises: [ZippyError, ValueError].}
```
## **template** compress
@ -100,7 +136,7 @@ func compress(src: seq[uint8]; dataFormat = dfGzip): seq[uint8] {.raises: [Zippy
Helper for when preferring to work with strings.
```nim
template compress(src: string; dataFormat = dfGzip): string
template compress(src: string; level = DefaultCompression; dataFormat = dfGzip): string
```
## **func** uncompress

View File

@ -8,7 +8,7 @@ proc cb(req: Request) {.async.} =
let headers = newHttpHeaders([("Content-Encoding", "gzip")])
await req.respond(
Http200,
compress("gzip'ed response body", dfGzip),
compress("gzip'ed response body", BestSpeed, dfGzip),
headers
)
else:

View File

@ -2,11 +2,20 @@ import zippy/common, zippy/deflate, zippy/inflate, zippy/zippyerror
export zippyerror
const
  # Named values for the `level` parameter of `compress` / `deflate`.
  # `deflate` rejects any level outside -2 .. 9.
  NoCompression* = 0 ## `deflate` writes the input as stored (uncompressed) blocks.
  BestSpeed* = 1 ## `deflate` routes this level to `snappyEncode`.
  BestCompression* = 9 ## Highest level accepted; selects the last `configurationTable` entry.
  DefaultCompression* = -1 ## `deflate` maps this to `configurationTable[6]`.
  HuffmanOnly* = -2 ## `deflate` routes this level to `huffmanOnlyEncode` (no match search).
type
CompressedDataFormat* = enum ## Supported compressed data formats
dfDetect, dfZlib, dfGzip, dfDeflate
func compress*(src: seq[uint8], dataFormat = dfGzip): seq[uint8] =
func compress*(
src: seq[uint8], level = DefaultCompression, dataFormat = dfGzip
): seq[uint8] =
## Compresses src and returns the compressed data.
if dataFormat == dfDetect:
@ -15,7 +24,7 @@ func compress*(src: seq[uint8], dataFormat = dfGzip): seq[uint8] =
"A data format must be specified to compress"
)
let deflated = deflate(src)
let deflated = deflate(src, level)
if dataFormat == dfGzip:
result.setLen(10)
@ -61,7 +70,9 @@ func compress*(src: seq[uint8], dataFormat = dfGzip): seq[uint8] =
else:
result = deflated
template compress*(src: string, dataFormat = dfGzip): string =
template compress*(
src: string, level = DefaultCompression, dataFormat = dfGzip
): string =
## Helper for when preferring to work with strings.
when nimvm:
# This is unfortunately needed to convert to and from string -> seq[uint]
@ -75,7 +86,7 @@ template compress*(src: string, dataFormat = dfGzip): string =
result.add(c.char)
result
else:
cast[string](compress(cast[seq[uint8]](src), dataFormat))
cast[string](compress(cast[seq[uint8]](src), level, dataFormat))
func uncompress(
src: seq[uint8], dataFormat: CompressedDataFormat, dst: var seq[uint8]
@ -136,22 +147,12 @@ func uncompress(
inflate(src[pos ..< ^8], dst)
let checksum = (
src[^8].uint32 shl 0 or
src[^7].uint32 shl 8 or
src[^6].uint32 shl 16 or
src[^5].uint32 shl 24
)
let checksum = read32(src, src.len - 8)
if checksum != crc32(dst):
raise newException(ZippyError, "Checksum verification failed")
let isize = (
src[^4].uint32 shl 0 or
src[^3].uint32 shl 8 or
src[^2].uint32 shl 16 or
src[^1].uint32 shl 24
)
if isize != dst.len.uint32:
let isize = read32(src, src.len - 4)
if isize != (dst.len mod (1 shl 32)).uint32:
raise newException(ZippyError, "Size verification failed")
of dfZlib:
if src.len < 6:

View File

@ -109,11 +109,27 @@ const
when defined(release):
{.push checks: off.}
template read32*(p: pointer): uint32 =
cast[ptr uint32](p)[]
template read32*(s: seq[uint8], pos: int): uint32 =
  ## Reads the four bytes `s[pos] .. s[pos + 3]` as a single `uint32`.
  when nimvm:
    # Compile-time VM path: build the value byte by byte, with the byte at the
    # lowest index becoming the least-significant byte.
    (s[pos + 0].uint32 shl 0) or
    (s[pos + 1].uint32 shl 8) or
    (s[pos + 2].uint32 shl 16) or
    (s[pos + 3].uint32 shl 24)
  else:
    # Runtime path: reinterpret the memory starting at s[pos] as a uint32 and
    # load it in one go. Only s[pos] is indexed, so pos + 1 .. pos + 3 are not
    # range-checked here.
    # NOTE(review): this agrees with the branch above only on little-endian
    # targets that tolerate unaligned loads - confirm for supported platforms.
    cast[ptr uint32](s[pos].unsafeAddr)[]
template read64*(p: pointer): uint64 =
cast[ptr uint64](p)[]
template read64*(s: seq[uint8], pos: int): uint64 =
  ## Reads the eight bytes `s[pos] .. s[pos + 7]` as a single `uint64`.
  when nimvm:
    # Compile-time VM path: build the value byte by byte, with the byte at the
    # lowest index becoming the least-significant byte.
    (s[pos + 0].uint64 shl 0) or
    (s[pos + 1].uint64 shl 8) or
    (s[pos + 2].uint64 shl 16) or
    (s[pos + 3].uint64 shl 24) or
    (s[pos + 4].uint64 shl 32) or
    (s[pos + 5].uint64 shl 40) or
    (s[pos + 6].uint64 shl 48) or
    (s[pos + 7].uint64 shl 56)
  else:
    # Runtime path: reinterpret the memory starting at s[pos] as a uint64 and
    # load it in one go. Only s[pos] is indexed, so pos + 1 .. pos + 7 are not
    # range-checked here.
    # NOTE(review): this agrees with the branch above only on little-endian
    # targets that tolerate unaligned loads - confirm for supported platforms.
    cast[ptr uint64](s[pos].unsafeAddr)[]
template reverseUint16*(code: uint16, length: uint8): uint16 =
(

View File

@ -1,4 +1,4 @@
import bitops, bitstreams, common, zippyerror
import bitops, bitstreams, common, strformat, zippyerror
const
minMatchLen = 3
@ -8,15 +8,35 @@ const
maxLitLenCodeLength = 9
maxDistCodeLength = 6
# The uint16 high bit is reserved to signal that an offset and length are
# encoded in the uint16.
maxLiteralLength = uint16.high.int shr 1
windowSize = 1 shl 15
maxChainLen = 32
goodMatchLen = 32
hashBits = 16
hashSize = 1 shl hashBits
hashMask = hashSize - 1
hashShift = (hashBits + minMatchLen - 1) div minMatchLen
# Per-level tuning knobs passed to `lz77Encode`.
# NOTE(review): the field names look like zlib's good/lazy/nice/chain match
# search parameters - confirm their exact meaning against lz77Encode.
type
  CompressionConfig = object
    good, lazy, nice, chain: int

# Indexed by compression level (0 .. 9). `deflate` looks a level up here only
# for the LZ77 path; levels 0 and 1 are dispatched to other encoders, so their
# entries are left zero-initialized. DefaultCompression (-1) resolves to index 6.
const
  configurationTable = [
    CompressionConfig(), # No compression
    CompressionConfig(), # Custom algorithm based on Snappy
    CompressionConfig(good: 4, lazy: 0, nice: 16, chain: 8),
    CompressionConfig(good: 4, lazy: 0, nice: 32, chain: 32),
    CompressionConfig(good: 4, lazy: 4, nice: 16, chain: 16),
    CompressionConfig(good: 8, lazy: 16, nice: 32, chain: 32),
    CompressionConfig(good: 8, lazy: 16, nice: 128, chain: 128), # Default
    CompressionConfig(good: 8, lazy: 32, nice: 256, chain: 256),
    CompressionConfig(good: 32, lazy: 128, nice: 258, chain: 1024),
    CompressionConfig(good: 32, lazy: 258, nice: 258, chain: 4096) # Max compression
  ]
when defined(release):
{.push checks: off.}
@ -182,7 +202,29 @@ func findCodeIndex(a: openarray[uint16], value: uint16): uint16 =
return i.uint16 - 1
a.high.uint16
func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
func findMatchLength(src: seq[uint8], s1, s2, limit: int): int {.inline.} =
  ## Returns how many consecutive bytes starting at `src[s1]` equal the bytes
  ## starting at `src[s2]`. Comparison stops at the first mismatch or once the
  ## `s2` side reaches `limit` (exclusive).
  var cursor = s2

  # Wide path: compare eight bytes per step while a full word still fits
  # below `limit` on the `s2` side.
  while cursor <= limit - 8:
    let diff = read64(src, cursor) xor read64(src, s1 + result)
    if diff != 0:
      # The lowest set bit of the xor locates the first differing byte
      # (assuming read64 yields the lowest-index byte as least significant,
      # as its nimvm branch does); every byte below it matched.
      inc(result, countTrailingZeroBits(diff) shr 3)
      return
    inc(cursor, 8)
    inc(result, 8)

  # Tail: fewer than eight bytes remain, so finish one byte at a time.
  while cursor < limit and src[cursor] == src[s1 + result]:
    inc cursor
    inc result
func lz77Encode(
src: seq[uint8], config: CompressionConfig
): (seq[uint16], seq[int], seq[int], int) =
assert windowSize <= maxWindowSize
assert (windowSize and (windowSize - 1)) == 0
assert (hashSize and hashMask) == 0
@ -258,12 +300,9 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
var
hashPos = chain[windowPos]
stop = min(src.len, pos + maxMatchLen)
chainLen, prevOffset, longestMatchOffset, longestMatchLen: int
while true:
if chainLen >= maxChainLen:
break
inc chainLen
tries = 32
prevOffset, longestMatchOffset, longestMatchLen: int
for i in countdown(tries, 1):
var offset: int
if hashPos <= windowPos:
offset = (windowPos - hashPos).int
@ -275,40 +314,12 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
prevOffset = offset
var
matchLen: int
i: int
while i < stop - pos:
var useFastPath: bool
when nimvm:
useFastPath = false
else:
# Can we look at the next 8 bytes?
useFastPath = stop - pos - i > 8
if useFastPath:
let v = read64(src[pos - offset + i].unsafeAddr) xor
read64(src[pos + i].unsafeAddr)
if v == 0:
inc(matchLen, 8)
else:
let
zeroBits = countTrailingZeroBits(v)
matchingBytes = min(zeroBits shr 3, 8)
inc(matchLen, matchingBytes)
if matchingBytes < 8:
break
inc(i, 8)
else:
if src[pos - offset + i] != src[pos + i]:
break
inc matchLen
inc i
let matchLen = findMatchLength(src, pos - offset, pos, stop)
if matchLen > longestMatchLen:
longestMatchLen = matchLen
longestMatchOffset = offset
if longestMatchLen >= goodMatchLen or hashPos == chain[hashPos]:
if longestMatchLen >= 32 or hashPos == chain[hashPos]:
break
hashPos = chain[hashPos]
@ -328,7 +339,7 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
else:
inc freqLitLen[src[pos]]
inc literalLen
if literalLen == uint16.high.int shr 1:
if literalLen == maxLiteralLength:
addLiteral(literalLen)
literalLen = 0
inc pos
@ -336,36 +347,75 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
encoded.setLen(op)
(encoded, freqLitLen, freqDist, literalsTotal)
func deflate*(src: seq[uint8]): seq[uint8] =
var b: BitStream
func snappyEncode(
  src: seq[uint8]
): (seq[uint16], seq[int], seq[int], int) =
  ## Encoder selected by `deflate` for compression level 1 (BestSpeed).
  ## Returns the same tuple shape as `lz77Encode`: the encoded token stream,
  ## the literal/length frequencies, the distance frequencies and the total
  ## literal count.
  ##
  ## TODO: the dedicated Snappy-style fast path is not implemented yet. Until
  ## it is, delegate to the LZ77 encoder using the first populated entry of
  ## `configurationTable`, so that level 1 produces a real token stream rather
  ## than the zero-initialized tuple an empty body would return.
  lz77Encode(src, configurationTable[2])
let (encoded, freqLitLen, freqDist, literalsTotal) = lz77Encode(src)
func huffmanOnlyEncode(
  src: seq[uint8]
): (seq[uint16], seq[int], seq[int], int) =
  ## Encodes `src` purely as literal runs (no match search) and tallies how
  ## often each byte value occurs.
  ##
  ## Returns (literal-run lengths, literal/length frequencies, distance
  ## frequencies, 0). The distance frequencies stay all zero and the last
  ## tuple element is always 0.
  let
    fullRuns = src.len div maxLiteralLength
    tailRun = src.len mod maxLiteralLength

  var litLenCounts = newSeq[int](286)
  litLenCounts[256] = 1 # Always exactly one end-of-block symbol
  for value in src:
    inc litLenCounts[value]

  # One entry per maximum-length literal run, then one entry for the rest.
  var runs = newSeq[uint16](fullRuns + 1)
  for k in 0 ..< fullRuns:
    runs[k] = maxLiteralLength.uint16
  runs[fullRuns] = tailRun.uint16

  (runs, litLenCounts, newSeq[int](baseDistance.len), 0)
func deflateNoCompression(src: seq[uint8]): seq[uint8] =
  ## Wraps `src` in stored (uncompressed) deflate blocks and returns the
  ## resulting bytes.
  ##
  ## The input is split into chunks of at most `maxUncompressedBlockSize`
  ## bytes. An empty input still produces a single, empty, final block.
  let blockCount = max(
    (src.len + maxUncompressedBlockSize - 1) div maxUncompressedBlockSize,
    1
  )

  var b: BitStream
  for i in 0 ..< blockCount:
    # Grow the buffer before writing this block's header fields.
    b.data.setLen(b.data.len + 6)

    # Header byte: value 1 on the last block, 0 otherwise, written as a full
    # 8 bits so the LEN field that follows starts on a byte boundary.
    let finalBlock = i == blockCount - 1
    b.addBits(finalBlock.uint8, 8)

    let
      pos = i * maxUncompressedBlockSize
      len = min(src.len - pos, maxUncompressedBlockSize).uint16
      # NLEN is the one's complement of LEN. Computing it with `not` keeps it
      # correct by construction instead of relying on the block-size constant
      # being exactly 65535.
      nlen = not len

    b.addBits(len, 16)
    b.addBits(nlen, 16)
    if len > 0:
      b.addBytes(src, pos, len.int)

  # Trim any reserved-but-unused space past the write position.
  b.data.setLen(b.bytePos)
  b.data
func deflate*(src: seq[uint8], level = -1): seq[uint8] =
if level < -2 or level > 9:
raise newException(ZippyError, &"Invalid compression level {level}")
if level == 0:
return deflateNoCompression(src)
let (encoded, freqLitLen, freqDist, literalsTotal) = block:
if level == -2:
huffmanOnlyEncode(src)
elif level == 1:
snappyEncode(src)
else:
lz77Encode(src, configurationTable[if level == -1: 6 else: level])
# If lz77 encoding returned almost all literal runs then write uncompressed.
if literalsTotal >= (src.len.float32 * 0.98).int:
let blockCount = max(
(src.len + maxUncompressedBlockSize - 1) div maxUncompressedBlockSize,
1
)
for i in 0 ..< blockCount:
b.data.setLen(b.data.len + 6)
let finalBlock = i == blockCount - 1
b.addBits(finalBlock.uint8, 8)
let
pos = i * maxUncompressedBlockSize
len = min(src.len - pos, maxUncompressedBlockSize).uint16
nlen = (maxUncompressedBlockSize - len).uint16
b.addBits(len, 16)
b.addBits(nlen, 16)
if len > 0:
b.addBytes(src, pos, len.int)
b.data.setLen(b.bytePos)
return b.data
return deflateNoCompression(src)
# Deflate using dynamic Huffman tree
@ -445,6 +495,7 @@ func deflate*(src: seq[uint8]): seq[uint8] =
hdist = distNumCodes.uint8 - 1
hclen = bitLensCodeLen.len.uint8 - 4
var b: BitStream
# TODO: Improve the b.data.setLens
b.data.setLen(
b.data.len +

View File

@ -77,7 +77,7 @@ block guzba_zippy_compress:
start = getMonoTime().ticks
var c: int
for i in 0 ..< iterations:
let compressed = zippy.compress(uncompressed, dfZlib)
let compressed = zippy.compress(uncompressed, dataFormat = dfZlib)
inc(c, compressed.len)
let
delta = float64(getMonoTime().ticks - start) / 1000000000.0

View File

@ -24,6 +24,6 @@ for i in 0 ..< 10000:
inc(i, runLength)
let
compressed = compress(data, dfGzip)
compressed = compress(data)
uncompressed = uncompress(compressed)
doAssert uncompressed == data

View File

@ -63,7 +63,7 @@ for dataFormat in [dfDeflate, dfZlib, dfGzip]:
for gold in golds:
let
original = readFile(&"tests/data/{gold}")
compressed = compress(original, dataFormat)
compressed = compress(original, dataFormat = dataFormat)
uncompressed = uncompress(
compressed,
if dataFormat == dfDeflate: dfDeflate else: dfDetect
@ -76,7 +76,7 @@ for dataFormat in [dfDeflate, dfZlib, dfGzip]:
for i in 0.uint8 .. high(uint8):
original.add(i)
let
compressed = compress(original, dataFormat)
compressed = compress(original, dataFormat = dataFormat)
uncompressed = uncompress(
compressed,
if dataFormat == dfDeflate: dfDeflate else: dfDetect

View File

@ -5,7 +5,7 @@ const
test1 = block:
let
original = readFile(test1Path)
compressed = compress(original, dfGzip)
compressed = compress(original)
uncompressed = uncompress(compressed)
doAssert uncompressed == original
compressed
@ -13,7 +13,7 @@ const
test2Seq = @[0.uint8, 8, 8, 8, 3, 8, 3, 3, 1, 1]
test2 = block:
let
compressed = compress(test2Seq, dfGzip)
compressed = compress(test2Seq)
uncompressed = uncompress(compressed)
doAssert uncompressed == test2Seq
compressed

View File

@ -31,7 +31,7 @@ block nimlang_zip: # Requires zlib1.dll
for gold in golds:
let original = readFile(&"tests/data/{gold}")
doAssert zlib.uncompress(
zippy.compress(original, dfZlib), stream = ZLIB_STREAM
zippy.compress(original, dataFormat = dfZlib), stream = ZLIB_STREAM
) == original
doassert zippy.uncompress(
zlib.compress(original, stream = ZLIB_STREAM)
@ -46,7 +46,9 @@ block treeform_miniz:
# Something bad happens here with miniz
discard
else:
doAssert miniz.uncompress(zippy.compress(original, dfZlib)) == original
doAssert miniz.uncompress(
zippy.compress(original, dataFormat = dfZlib)
) == original
doAssert zippy.uncompress(miniz.compress(original)) == original
echo "pass!"
@ -55,7 +57,7 @@ block jangko_nimPNG:
for gold in golds:
let original = readFile(&"tests/data/{gold}")
doAssert nimz.zlib_decompress(
nzInflateInit(zippy.compress(original, dfZlib))
nzInflateInit(zippy.compress(original, dataFormat = dfZlib))
) == original
doAssert zippy.uncompress(
zlib_compress(nzDeflateInit(original))