in progress
This commit is contained in:
parent
aff70e3af6
commit
ca4a0c80a4
42
README.md
42
README.md
|
@ -26,7 +26,7 @@ Benchmarks can be run comparing different deflate implementations. My benchmarki
|
|||
|
||||
`nim c -d:release -r .\tests\benchmark.nim`
|
||||
|
||||
### Compress
|
||||
### Compress at default compression level
|
||||
|
||||
Each file is compressed 1000 times.
|
||||
|
||||
|
@ -78,6 +78,41 @@ To prevent Zippy from causing a crash or otherwise misbehaving on bad input data
|
|||
import zippy
|
||||
```
|
||||
|
||||
## **const** NoCompression
|
||||
|
||||
|
||||
```nim
|
||||
NoCompression = 0
|
||||
```
|
||||
|
||||
## **const** BestSpeed
|
||||
|
||||
|
||||
```nim
|
||||
BestSpeed = 1
|
||||
```
|
||||
|
||||
## **const** BestCompression
|
||||
|
||||
|
||||
```nim
|
||||
BestCompression = 9
|
||||
```
|
||||
|
||||
## **const** DefaultCompression
|
||||
|
||||
|
||||
```nim
|
||||
DefaultCompression = -1
|
||||
```
|
||||
|
||||
## **const** HuffmanOnly
|
||||
|
||||
|
||||
```nim
|
||||
HuffmanOnly = -2
|
||||
```
|
||||
|
||||
## **type** CompressedDataFormat
|
||||
|
||||
Supported compressed data formats
|
||||
|
@ -92,7 +127,8 @@ CompressedDataFormat = enum
|
|||
Compresses src and returns the compressed data.
|
||||
|
||||
```nim
|
||||
func compress(src: seq[uint8]; dataFormat = dfGzip): seq[uint8] {.raises: [ZippyError].}
|
||||
func compress(src: seq[uint8]; level = DefaultCompression; dataFormat = dfGzip): seq[
|
||||
uint8] {.raises: [ZippyError, ValueError].}
|
||||
```
|
||||
|
||||
## **template** compress
|
||||
|
@ -100,7 +136,7 @@ func compress(src: seq[uint8]; dataFormat = dfGzip): seq[uint8] {.raises: [Zippy
|
|||
Helper for when preferring to work with strings.
|
||||
|
||||
```nim
|
||||
template compress(src: string; dataFormat = dfGzip): string
|
||||
template compress(src: string; level = DefaultCompression; dataFormat = dfGzip): string
|
||||
```
|
||||
|
||||
## **func** uncompress
|
||||
|
|
|
@ -8,7 +8,7 @@ proc cb(req: Request) {.async.} =
|
|||
let headers = newHttpHeaders([("Content-Encoding", "gzip")])
|
||||
await req.respond(
|
||||
Http200,
|
||||
compress("gzip'ed response body", dfGzip),
|
||||
compress("gzip'ed response body", BestSpeed, dfGzip),
|
||||
headers
|
||||
)
|
||||
else:
|
||||
|
|
|
@ -2,11 +2,20 @@ import zippy/common, zippy/deflate, zippy/inflate, zippy/zippyerror
|
|||
|
||||
export zippyerror
|
||||
|
||||
const
  HuffmanOnly* = -2
    ## Skips match searching; input is only Huffman coded.
  DefaultCompression* = -1
    ## Lets the library choose the level (currently mapped to level 6).
  NoCompression* = 0
    ## Stores the input in uncompressed deflate blocks.
  BestSpeed* = 1
    ## Fastest compressing level.
  BestCompression* = 9
    ## Highest supported level, using the most exhaustive search settings.
|
||||
|
||||
type
  CompressedDataFormat* = enum
    ## Supported compressed data formats
    dfDetect,
    dfZlib,
    dfGzip,
    dfDeflate
|
||||
|
||||
func compress*(src: seq[uint8], dataFormat = dfGzip): seq[uint8] =
|
||||
func compress*(
|
||||
src: seq[uint8], level = DefaultCompression, dataFormat = dfGzip
|
||||
): seq[uint8] =
|
||||
## Compresses src and returns the compressed data.
|
||||
|
||||
if dataFormat == dfDetect:
|
||||
|
@ -15,7 +24,7 @@ func compress*(src: seq[uint8], dataFormat = dfGzip): seq[uint8] =
|
|||
"A data format must be specified to compress"
|
||||
)
|
||||
|
||||
let deflated = deflate(src)
|
||||
let deflated = deflate(src, level)
|
||||
|
||||
if dataFormat == dfGzip:
|
||||
result.setLen(10)
|
||||
|
@ -61,7 +70,9 @@ func compress*(src: seq[uint8], dataFormat = dfGzip): seq[uint8] =
|
|||
else:
|
||||
result = deflated
|
||||
|
||||
template compress*(src: string, dataFormat = dfGzip): string =
|
||||
template compress*(
|
||||
src: string, level = DefaultCompression, dataFormat = dfGzip
|
||||
): string =
|
||||
## Helper for when preferring to work with strings.
|
||||
when nimvm:
|
||||
# This is unfortunately needed to convert to and from string -> seq[uint8]
|
||||
|
@ -75,7 +86,7 @@ template compress*(src: string, dataFormat = dfGzip): string =
|
|||
result.add(c.char)
|
||||
result
|
||||
else:
|
||||
cast[string](compress(cast[seq[uint8]](src), dataFormat))
|
||||
cast[string](compress(cast[seq[uint8]](src), level, dataFormat))
|
||||
|
||||
func uncompress(
|
||||
src: seq[uint8], dataFormat: CompressedDataFormat, dst: var seq[uint8]
|
||||
|
@ -136,22 +147,12 @@ func uncompress(
|
|||
|
||||
inflate(src[pos ..< ^8], dst)
|
||||
|
||||
let checksum = (
|
||||
src[^8].uint32 shl 0 or
|
||||
src[^7].uint32 shl 8 or
|
||||
src[^6].uint32 shl 16 or
|
||||
src[^5].uint32 shl 24
|
||||
)
|
||||
let checksum = read32(src, src.len - 8)
|
||||
if checksum != crc32(dst):
|
||||
raise newException(ZippyError, "Checksum verification failed")
|
||||
|
||||
let isize = (
|
||||
src[^4].uint32 shl 0 or
|
||||
src[^3].uint32 shl 8 or
|
||||
src[^2].uint32 shl 16 or
|
||||
src[^1].uint32 shl 24
|
||||
)
|
||||
if isize != dst.len.uint32:
|
||||
let isize = read32(src, src.len - 4)
|
||||
if isize != (dst.len mod (1 shl 32)).uint32:
|
||||
raise newException(ZippyError, "Size verification failed")
|
||||
of dfZlib:
|
||||
if src.len < 6:
|
||||
|
|
|
@ -109,11 +109,27 @@ const
|
|||
when defined(release):
|
||||
{.push checks: off.}
|
||||
|
||||
template read32*(p: pointer): uint32 =
  ## Loads a uint32 directly from `p`, in the byte order of the host CPU.
  ## NOTE(review): the caller must guarantee at least 4 readable bytes at
  ## `p`; nothing here checks that.
  cast[ptr uint32](p)[]

func assembleLe32(s: seq[uint8], pos: int): uint32 {.inline.} =
  ## Builds a uint32 from the four little-endian bytes starting at `s[pos]`.
  (s[pos + 0].uint32 shl 0) or
  (s[pos + 1].uint32 shl 8) or
  (s[pos + 2].uint32 shl 16) or
  (s[pos + 3].uint32 shl 24)

template read32*(s: seq[uint8], pos: int): uint32 =
  ## Reads the little-endian uint32 stored in `s[pos .. pos + 3]`.
  ##
  ## The result is the same at compile time and at run time, and the same
  ## on little-endian and big-endian hosts. Only little-endian hosts take
  ## the single-load fast path, because a raw load yields host byte order.
  when nimvm:
    # The VM cannot cast memory, so assemble the value byte by byte.
    assembleLe32(s, pos)
  else:
    when cpuEndian == littleEndian:
      cast[ptr uint32](s[pos].unsafeAddr)[]
    else:
      assembleLe32(s, pos)
|
||||
|
||||
template read64*(p: pointer): uint64 =
  ## Loads a uint64 directly from `p`, in the byte order of the host CPU.
  ## NOTE(review): the caller must guarantee at least 8 readable bytes at
  ## `p`; nothing here checks that.
  cast[ptr uint64](p)[]

func assembleLe64(s: seq[uint8], pos: int): uint64 {.inline.} =
  ## Builds a uint64 from the eight little-endian bytes starting at `s[pos]`.
  (s[pos + 0].uint64 shl 0) or
  (s[pos + 1].uint64 shl 8) or
  (s[pos + 2].uint64 shl 16) or
  (s[pos + 3].uint64 shl 24) or
  (s[pos + 4].uint64 shl 32) or
  (s[pos + 5].uint64 shl 40) or
  (s[pos + 6].uint64 shl 48) or
  (s[pos + 7].uint64 shl 56)

template read64*(s: seq[uint8], pos: int): uint64 =
  ## Reads the little-endian uint64 stored in `s[pos .. pos + 7]`.
  ##
  ## The result is the same at compile time and at run time, and the same
  ## on little-endian and big-endian hosts. Callers that count trailing zero
  ## bits of the result to find the first differing byte depend on that.
  when nimvm:
    # The VM cannot cast memory, so assemble the value byte by byte.
    assembleLe64(s, pos)
  else:
    when cpuEndian == littleEndian:
      cast[ptr uint64](s[pos].unsafeAddr)[]
    else:
      assembleLe64(s, pos)
|
||||
|
||||
template reverseUint16*(code: uint16, length: uint8): uint16 =
|
||||
(
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import bitops, bitstreams, common, zippyerror
|
||||
import bitops, bitstreams, common, strformat, zippyerror
|
||||
|
||||
const
|
||||
minMatchLen = 3
|
||||
|
@ -8,15 +8,35 @@ const
|
|||
maxLitLenCodeLength = 9
|
||||
maxDistCodeLength = 6
|
||||
|
||||
# The uint16 high bit is reserved to signal that an offset and length are
|
||||
# encoded in the uint16.
|
||||
maxLiteralLength = uint16.high.int shr 1
|
||||
|
||||
windowSize = 1 shl 15
|
||||
maxChainLen = 32
|
||||
goodMatchLen = 32
|
||||
|
||||
hashBits = 16
|
||||
hashSize = 1 shl hashBits
|
||||
hashMask = hashSize - 1
|
||||
hashShift = (hashBits + minMatchLen - 1) div minMatchLen
|
||||
|
||||
type
  CompressionConfig = object
    ## Match-search tuning parameters for one compression level.
    ## NOTE(review): the visible code does not yet read these fields; the
    ## names presumably mirror zlib's good/lazy/nice/chain settings - confirm.
    good: int
    lazy: int
    nice: int
    chain: int
|
||||
|
||||
const
  # One entry per compression level, indexed by the level number 0 .. 9.
  configurationTable = [
    CompressionConfig(), # 0: No compression
    CompressionConfig(), # 1: Custom algorithm based on Snappy
    CompressionConfig(good: 4, lazy: 0, nice: 16, chain: 8), # 2
    CompressionConfig(good: 4, lazy: 0, nice: 32, chain: 32), # 3
    CompressionConfig(good: 4, lazy: 4, nice: 16, chain: 16), # 4
    CompressionConfig(good: 8, lazy: 16, nice: 32, chain: 32), # 5
    CompressionConfig(good: 8, lazy: 16, nice: 128, chain: 128), # 6: Default
    CompressionConfig(good: 8, lazy: 32, nice: 256, chain: 256), # 7
    CompressionConfig(good: 32, lazy: 128, nice: 258, chain: 1024), # 8
    CompressionConfig(good: 32, lazy: 258, nice: 258, chain: 4096) # 9: Max compression
  ]
|
||||
|
||||
when defined(release):
|
||||
{.push checks: off.}
|
||||
|
||||
|
@ -182,7 +202,29 @@ func findCodeIndex(a: openarray[uint16], value: uint16): uint16 =
|
|||
return i.uint16 - 1
|
||||
a.high.uint16
|
||||
|
||||
func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
|
||||
func findMatchLength(src: seq[uint8], s1, s2, limit: int): int {.inline.} =
  ## Returns how many consecutive bytes starting at `src[s1]` equal the bytes
  ## starting at `src[s2]`, never comparing at or beyond index `limit` on the
  ## `s2` side.
  var matched = 0

  # Compare eight bytes per step while a full word fits before `limit`.
  while s2 + matched <= limit - 8:
    let diff = read64(src, s1 + matched) xor read64(src, s2 + matched)
    if diff != 0:
      # Each run of 8 trailing zero bits in the xor is one more equal byte.
      return matched + (countTrailingZeroBits(diff) shr 3)
    matched += 8

  # Finish the remaining (fewer than eight) bytes one at a time.
  while s2 + matched < limit and src[s1 + matched] == src[s2 + matched]:
    inc matched

  matched
|
||||
|
||||
func lz77Encode(
|
||||
src: seq[uint8], config: CompressionConfig
|
||||
): (seq[uint16], seq[int], seq[int], int) =
|
||||
assert windowSize <= maxWindowSize
|
||||
assert (windowSize and (windowSize - 1)) == 0
|
||||
assert (hashSize and hashMask) == 0
|
||||
|
@ -258,12 +300,9 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
|
|||
var
|
||||
hashPos = chain[windowPos]
|
||||
stop = min(src.len, pos + maxMatchLen)
|
||||
chainLen, prevOffset, longestMatchOffset, longestMatchLen: int
|
||||
while true:
|
||||
if chainLen >= maxChainLen:
|
||||
break
|
||||
inc chainLen
|
||||
|
||||
tries = 32
|
||||
prevOffset, longestMatchOffset, longestMatchLen: int
|
||||
for i in countdown(tries, 1):
|
||||
var offset: int
|
||||
if hashPos <= windowPos:
|
||||
offset = (windowPos - hashPos).int
|
||||
|
@ -275,40 +314,12 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
|
|||
|
||||
prevOffset = offset
|
||||
|
||||
var
|
||||
matchLen: int
|
||||
i: int
|
||||
while i < stop - pos:
|
||||
var useFastPath: bool
|
||||
when nimvm:
|
||||
useFastPath = false
|
||||
else:
|
||||
# Can we look at the next 8 bytes?
|
||||
useFastPath = stop - pos - i > 8
|
||||
if useFastPath:
|
||||
let v = read64(src[pos - offset + i].unsafeAddr) xor
|
||||
read64(src[pos + i].unsafeAddr)
|
||||
if v == 0:
|
||||
inc(matchLen, 8)
|
||||
else:
|
||||
let
|
||||
zeroBits = countTrailingZeroBits(v)
|
||||
matchingBytes = min(zeroBits shr 3, 8)
|
||||
inc(matchLen, matchingBytes)
|
||||
if matchingBytes < 8:
|
||||
break
|
||||
inc(i, 8)
|
||||
else:
|
||||
if src[pos - offset + i] != src[pos + i]:
|
||||
break
|
||||
inc matchLen
|
||||
inc i
|
||||
|
||||
let matchLen = findMatchLength(src, pos - offset, pos, stop)
|
||||
if matchLen > longestMatchLen:
|
||||
longestMatchLen = matchLen
|
||||
longestMatchOffset = offset
|
||||
|
||||
if longestMatchLen >= goodMatchLen or hashPos == chain[hashPos]:
|
||||
if longestMatchLen >= 32 or hashPos == chain[hashPos]:
|
||||
break
|
||||
|
||||
hashPos = chain[hashPos]
|
||||
|
@ -328,7 +339,7 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
|
|||
else:
|
||||
inc freqLitLen[src[pos]]
|
||||
inc literalLen
|
||||
if literalLen == uint16.high.int shr 1:
|
||||
if literalLen == maxLiteralLength:
|
||||
addLiteral(literalLen)
|
||||
literalLen = 0
|
||||
inc pos
|
||||
|
@ -336,36 +347,75 @@ func lz77Encode(src: seq[uint8]): (seq[uint16], seq[int], seq[int], int) =
|
|||
encoded.setLen(op)
|
||||
(encoded, freqLitLen, freqDist, literalsTotal)
|
||||
|
||||
func deflate*(src: seq[uint8]): seq[uint8] =
|
||||
var b: BitStream
|
||||
func snappyEncode(
  src: seq[uint8]
): (seq[uint16], seq[int], seq[int], int) =
  ## Encoder for compression level 1 (the fastest level).
  ##
  ## Returns the same tuple shape as `lz77Encode`: the encoded symbols, the
  ## literal/length frequencies, the distance frequencies and the total
  ## number of literals.
  ##
  ## TODO: the dedicated Snappy-style encoder is not implemented yet. Until
  ## it is, delegate to the cheapest LZ77 configuration so that level 1
  ## produces a valid stream instead of an empty result.
  lz77Encode(src, configurationTable[2])
|
||||
|
||||
let (encoded, freqLitLen, freqDist, literalsTotal) = lz77Encode(src)
|
||||
func huffmanOnlyEncode(
  src: seq[uint8]
): (seq[uint16], seq[int], seq[int], int) =
  ## Encodes `src` purely as literal runs, with no match searching.
  ##
  ## Returns, in order: the literal run lengths (each at most
  ## `maxLiteralLength`), the literal/length symbol frequencies, the
  ## all-zero distance frequencies, and 0 for the literals total.
  let
    fullRuns = src.len div maxLiteralLength
    tailLen = src.len mod maxLiteralLength

  var litLenCounts = newSeq[int](286)
  litLenCounts[256] = 1 # Always exactly one end-of-block symbol
  for value in src:
    inc litLenCounts[value]

  # One maximal run per full chunk, then a final run for the remainder
  # (the final run is emitted even when the remainder is zero).
  var runs = newSeq[uint16](fullRuns + 1)
  for k in 0 ..< fullRuns:
    runs[k] = maxLiteralLength.uint16
  runs[fullRuns] = tailLen.uint16

  (runs, litLenCounts, newSeq[int](baseDistance.len), 0)
|
||||
|
||||
func deflateNoCompression(src: seq[uint8]): seq[uint8] =
  ## Writes `src` as a series of stored (uncompressed) deflate blocks and
  ## returns the resulting bytes.
  ##
  ## At least one block is always written, so empty input yields a single
  ## final block with a length of zero.
  let blockCount = max(
    (src.len + maxUncompressedBlockSize - 1) div maxUncompressedBlockSize,
    1
  )

  var b: BitStream
  for i in 0 ..< blockCount:
    # Reserve room for this block's header before writing into the stream.
    b.data.setLen(b.data.len + 6)

    # Header byte: 1 for the last block, 0 for every other block.
    let finalBlock = i == blockCount - 1
    b.addBits(finalBlock.uint8, 8)

    let
      pos = i * maxUncompressedBlockSize
      blockLen = min(src.len - pos, maxUncompressedBlockSize).uint16
      # NLEN must be the one's complement of LEN (RFC 1951, 3.2.4). Compute
      # it directly instead of relying on maxUncompressedBlockSize being
      # exactly 65535.
      blockLenComplement = not blockLen

    b.addBits(blockLen, 16)
    b.addBits(blockLenComplement, 16)
    if blockLen > 0:
      b.addBytes(src, pos, blockLen.int)

  # Trim the buffer to the bytes actually written.
  b.data.setLen(b.bytePos)
  b.data
|
||||
|
||||
func deflate*(src: seq[uint8], level = -1): seq[uint8] =
|
||||
if level < -2 or level > 9:
|
||||
raise newException(ZippyError, &"Invalid compression level {level}")
|
||||
|
||||
if level == 0:
|
||||
return deflateNoCompression(src)
|
||||
|
||||
let (encoded, freqLitLen, freqDist, literalsTotal) = block:
|
||||
if level == -2:
|
||||
huffmanOnlyEncode(src)
|
||||
elif level == 1:
|
||||
snappyEncode(src)
|
||||
else:
|
||||
lz77Encode(src, configurationTable[if level == -1: 6 else: level])
|
||||
|
||||
# If lz77 encoding returned almost all literal runs then write uncompressed.
|
||||
if literalsTotal >= (src.len.float32 * 0.98).int:
|
||||
let blockCount = max(
|
||||
(src.len + maxUncompressedBlockSize - 1) div maxUncompressedBlockSize,
|
||||
1
|
||||
)
|
||||
|
||||
for i in 0 ..< blockCount:
|
||||
b.data.setLen(b.data.len + 6)
|
||||
|
||||
let finalBlock = i == blockCount - 1
|
||||
b.addBits(finalBlock.uint8, 8)
|
||||
|
||||
let
|
||||
pos = i * maxUncompressedBlockSize
|
||||
len = min(src.len - pos, maxUncompressedBlockSize).uint16
|
||||
nlen = (maxUncompressedBlockSize - len).uint16
|
||||
|
||||
b.addBits(len, 16)
|
||||
b.addBits(nlen, 16)
|
||||
if len > 0:
|
||||
b.addBytes(src, pos, len.int)
|
||||
|
||||
b.data.setLen(b.bytePos)
|
||||
return b.data
|
||||
return deflateNoCompression(src)
|
||||
|
||||
# Deflate using dynamic Huffman tree
|
||||
|
||||
|
@ -445,6 +495,7 @@ func deflate*(src: seq[uint8]): seq[uint8] =
|
|||
hdist = distNumCodes.uint8 - 1
|
||||
hclen = bitLensCodeLen.len.uint8 - 4
|
||||
|
||||
var b: BitStream
|
||||
# TODO: Improve the b.data.setLens
|
||||
b.data.setLen(
|
||||
b.data.len +
|
||||
|
|
|
@ -77,7 +77,7 @@ block guzba_zippy_compress:
|
|||
start = getMonoTime().ticks
|
||||
var c: int
|
||||
for i in 0 ..< iterations:
|
||||
let compressed = zippy.compress(uncompressed, dfZlib)
|
||||
let compressed = zippy.compress(uncompressed, dataFormat = dfZlib)
|
||||
inc(c, compressed.len)
|
||||
let
|
||||
delta = float64(getMonoTime().ticks - start) / 1000000000.0
|
||||
|
|
|
@ -24,6 +24,6 @@ for i in 0 ..< 10000:
|
|||
inc(i, runLength)
|
||||
|
||||
let
|
||||
compressed = compress(data, dfGzip)
|
||||
compressed = compress(data)
|
||||
uncompressed = uncompress(compressed)
|
||||
doAssert uncompressed == data
|
||||
|
|
|
@ -63,7 +63,7 @@ for dataFormat in [dfDeflate, dfZlib, dfGzip]:
|
|||
for gold in golds:
|
||||
let
|
||||
original = readFile(&"tests/data/{gold}")
|
||||
compressed = compress(original, dataFormat)
|
||||
compressed = compress(original, dataFormat = dataFormat)
|
||||
uncompressed = uncompress(
|
||||
compressed,
|
||||
if dataFormat == dfDeflate: dfDeflate else: dfDetect
|
||||
|
@ -76,7 +76,7 @@ for dataFormat in [dfDeflate, dfZlib, dfGzip]:
|
|||
for i in 0.uint8 .. high(uint8):
|
||||
original.add(i)
|
||||
let
|
||||
compressed = compress(original, dataFormat)
|
||||
compressed = compress(original, dataFormat = dataFormat)
|
||||
uncompressed = uncompress(
|
||||
compressed,
|
||||
if dataFormat == dfDeflate: dfDeflate else: dfDetect
|
||||
|
|
|
@ -5,7 +5,7 @@ const
|
|||
test1 = block:
|
||||
let
|
||||
original = readFile(test1Path)
|
||||
compressed = compress(original, dfGzip)
|
||||
compressed = compress(original)
|
||||
uncompressed = uncompress(compressed)
|
||||
doAssert uncompressed == original
|
||||
compressed
|
||||
|
@ -13,7 +13,7 @@ const
|
|||
test2Seq = @[0.uint8, 8, 8, 8, 3, 8, 3, 3, 1, 1]
|
||||
test2 = block:
|
||||
let
|
||||
compressed = compress(test2Seq, dfGzip)
|
||||
compressed = compress(test2Seq)
|
||||
uncompressed = uncompress(compressed)
|
||||
doAssert uncompressed == test2Seq
|
||||
compressed
|
||||
|
|
|
@ -31,7 +31,7 @@ block nimlang_zip: # Requires zlib1.dll
|
|||
for gold in golds:
|
||||
let original = readFile(&"tests/data/{gold}")
|
||||
doAssert zlib.uncompress(
|
||||
zippy.compress(original, dfZlib), stream = ZLIB_STREAM
|
||||
zippy.compress(original, dataFormat = dfZlib), stream = ZLIB_STREAM
|
||||
) == original
|
||||
doassert zippy.uncompress(
|
||||
zlib.compress(original, stream = ZLIB_STREAM)
|
||||
|
@ -46,7 +46,9 @@ block treeform_miniz:
|
|||
# Something bad happens here with miniz
|
||||
discard
|
||||
else:
|
||||
doAssert miniz.uncompress(zippy.compress(original, dfZlib)) == original
|
||||
doAssert miniz.uncompress(
|
||||
zippy.compress(original, dataFormat = dfZlib)
|
||||
) == original
|
||||
doAssert zippy.uncompress(miniz.compress(original)) == original
|
||||
echo "pass!"
|
||||
|
||||
|
@ -55,7 +57,7 @@ block jangko_nimPNG:
|
|||
for gold in golds:
|
||||
let original = readFile(&"tests/data/{gold}")
|
||||
doAssert nimz.zlib_decompress(
|
||||
nzInflateInit(zippy.compress(original, dfZlib))
|
||||
nzInflateInit(zippy.compress(original, dataFormat = dfZlib))
|
||||
) == original
|
||||
doAssert zippy.uncompress(
|
||||
zlib_compress(nzDeflateInit(original))
|
||||
|
|
Loading…
Reference in New Issue