working snappy framing compress prototype

This commit is contained in:
andri lim 2020-03-31 12:21:44 +07:00 committed by zah
parent 45b8258af4
commit f08cbf9dc5
4 changed files with 110 additions and 12 deletions

View File

@ -474,7 +474,7 @@ proc snappyUncompress*(src: openArray[byte], dst: var openArray[byte]): int =
if dst.len < int(len):
return
if int(len) > 0:
let errCode = decode(dst.toOpenArray(0, len.int-1), src[bytesRead..^1])
if errCode != 0:

View File

@ -1,20 +1,20 @@
import
../snappy,
../snappy, ../snappy/utils,
../tests/openarrays_snappy as oas,
faststreams/[output_stream, input_stream],
stew/endians2
{.compile: "crc32c.c".}
# TODO: we don't have a native implementation of CRC32C algorithm yet.
# we can't use nimPNG CRC32
proc masked_crc32c(buf: ptr byte, len: cuint): cuint {.cdecl, importc.}
proc masked_crc32c(buf: ptr byte, len: uint): cuint {.cdecl, importc.}
func checkCrc32(data: openArray[byte], expected: uint32): bool =
## Returns true when the checksum produced by the imported C routine
## `masked_crc32c` over `data` equals `expected`.
## `data[0]` is read unconditionally, so `data` must not be empty.
let actual = masked_crc32c(data[0].unsafeAddr, data.len.cuint)
let actual = masked_crc32c(data[0].unsafeAddr, data.len.uint)
result = actual == expected
proc checkData(data: openArray[byte], crc: uint32, output: OutputStreamVar): bool =
if not checkCrc32(data, crc):
echo "BAD CRC"
return
output.append(data)
@ -26,9 +26,11 @@ func seekForward(_: openArray[byte]) =
const
# maximum chunk data length
MAX_DATA_LEN = 16777215
# MAX_DATA_LEN = 16777215
# maximum uncompressed data length excluding checksum
MAX_UNCOMPRESSED_DATA_LEN = 65536
# maximum compressed data length excluding checksum
MAX_COMPRESSED_DATA_LEN = maxEncodedLen(MAX_UNCOMPRESSED_DATA_LEN)
COMPRESSED_DATA_IDENTIFIER = 0x00
UNCOMPRESSED_DATA_IDENTIFIER = 0x01
@ -37,9 +39,13 @@ const
STREAM_HEADER = "\xff\x06\x00\x00sNaPpY"
proc framing_format_uncompress*(input: ByteStreamVar, output: OutputStreamVar) =
if input[].ensureBytes(STREAM_HEADER.len):
if input.readBytes(STREAM_HEADER.len) != STREAM_HEADER.toOpenArrayByte(0, STREAM_HEADER.len-1):
return
if not input[].ensureBytes(STREAM_HEADER.len):
# debugEcho "NOT A SNAPPY STREAM"
return
if input.readBytes(STREAM_HEADER.len) != STREAM_HEADER.toOpenArrayByte(0, STREAM_HEADER.len-1):
# debugEcho "BAD HEADER"
return
var uncompressedData = newSeq[byte](MAX_UNCOMPRESSED_DATA_LEN)
@ -48,10 +54,22 @@ proc framing_format_uncompress*(input: ByteStreamVar, output: OutputStreamVar) =
break
# ensure bytes
if not input[].ensureBytes(4):
# debugEcho "CHK 1 NOT ENOUGH BYTES"
return
let x = uint32.fromBytesLE(input.readBytes(4))
let id = x and 0xFF
let dataLen = (x shr 8).int
if not input[].ensureBytes(dataLen):
# debugEcho "CHK 2 NOT ENOUGH BYTES"
# debugEcho "request: ", dataLen
# debugEcho "pos: ", input[].pos
# debugEcho "endPos: ", input.endPos
# debugEcho "distance: ", input.endPos - input[].pos
return
if id == COMPRESSED_DATA_IDENTIFIER:
let crc = uint32.fromBytesLE(input.readBytes(4))
@ -60,21 +78,66 @@ proc framing_format_uncompress*(input: ByteStreamVar, output: OutputStreamVar) =
uncompressedData
)
if uncompressedLen < 0:
if uncompressedLen <= 0:
# debugEcho "BAD LEN"
return
if not checkData(uncompressedData.toOpenArray(0, uncompressedLen-1), crc, output):
# debugEcho "BAD CRC"
return
elif id == UNCOMPRESSED_DATA_IDENTIFIER:
let crc = uint32.fromBytesLE(input.readBytes(4))
if not checkData(input.readBytes(dataLen - 4), crc, output):
# debugEcho "BAD CRC UNCOMP"
return
elif id < 0x80:
# Reserved unskippable chunks (chunk types 0x02-0x7f)
# if we encounter this type of chunk, stop decoding
# the spec says it is an error
# debugEcho "BAD CHUNK"
return
else:
# Reserved skippable chunks (chunk types 0x80-0xfe)
# including STREAM_HEADER (0xff) should be skipped
seekForward(input.readBytes(dataLen))
output.flush()
proc processFrame*(output: OutputStreamVar, dst: var openArray[byte], src: openArray[byte]) =
  ## Appends one framing-format chunk holding `src` to `output`.
  ##
  ## `dst` is scratch space for the encoded form of `src` (length varint
  ## followed by the encoded block). The chunk is written as a 4-byte
  ## little-endian header (chunk type in the low byte, body length in the
  ## upper three bytes), the 4-byte little-endian checksum of `src`, and
  ## then the body. The body is the raw `src` when encoding does not
  ## shrink it by at least one eighth, otherwise the encoded bytes.
  ##
  ## An empty `src` emits nothing: there is no first byte to take the
  ## address of for the checksum call.
  ##
  ## NOTE(review): `encodeBlock` documents preconditions on `src.len` and
  ## `dst.len` that are not checked here -- confirm callers satisfy them.
  if src.len == 0:
    return

  let
    crc = masked_crc32c(src[0].unsafeAddr, src.len.uint)
    varintLen = oas.putUvarint(dst, src.len.uint64)
    encodedLen = oas.encodeBlock(dst.toOpenArray(varintLen, dst.len-1), src) + varintLen
    # Fall back to a raw chunk unless encoding saves at least 1/8 of the input.
    storeRaw = encodedLen >= (src.len - (src.len div 8))
    chunkId =
      if storeRaw: UNCOMPRESSED_DATA_IDENTIFIER.uint32
      else: COMPRESSED_DATA_IDENTIFIER.uint32
    bodyLen = if storeRaw: src.len else: encodedLen
    frameLen = bodyLen + 4 # include 4 bytes crc
    header = (uint32(frameLen) shl 8) or chunkId

  output.append toBytesLE(header)
  output.append toBytesLE(crc)
  if storeRaw:
    output.append src
  else:
    output.append dst.toOpenArray(0, encodedLen-1)
proc framing_format_compress*(output: OutputStreamVar, src: openArray[byte]) =
  ## Writes `src` to `output` as a framed stream: the stream identifier
  ## first, then one chunk (via `processFrame`) per slice of at most
  ## MAX_UNCOMPRESSED_DATA_LEN input bytes, and finally flushes `output`.
  ## An empty `src` produces only the stream identifier.
  const maxFrameSize = MAX_UNCOMPRESSED_DATA_LEN
  # Scratch buffer shared by all frames.
  var compressedData = newSeq[byte](MAX_COMPRESSED_DATA_LEN)

  # write the magic identifier
  output.append(STREAM_HEADER)

  var pos = 0
  while pos < src.len:
    let frameSize = min(src.len - pos, maxFrameSize)
    # Pass a view of the input: slicing an openArray with `[]` would
    # allocate and copy a fresh seq for every frame.
    processFrame(output, compressedData,
                 src.toOpenArray(pos, pos + frameSize - 1))
    inc(pos, frameSize)

  output.flush()

View File

@ -7,7 +7,7 @@ const
inputMargin = 16 - 1
# PutUvarint encodes a uint64 into buf and returns the number of bytes written.
func putUvarint(buf: var openArray[byte], x: uint64): int =
func putUvarint*(buf: var openArray[byte], x: uint64): int =
var
i = 0
x = x
@ -169,7 +169,7 @@ func hash(u, shift: uint32): uint32 =
# It also assumes that:
# len(dst) >= MaxEncodedLen(len(src)) and
# minNonLiteralBlockSize <= len(src) and len(src) <= maxBlockSize
func encodeBlock(dst, src: var openArray[byte]): int =
func encodeBlock*(dst: var openArray[byte], src: openArray[byte]): int =
# Initialize the hash table. Its size ranges from 1shl8 to 1shl14 inclusive.
# The table element type is uint16, as s < sLimit and sLimit < len(src)
# and len(src) <= maxBlockSize and maxBlockSize == 65536.

View File

@ -18,6 +18,26 @@ template check_uncompress(source, target: string) =
else:
check true
template check_roundtrip(source) =
  ## Declares a test named "roundtrip <source>" that reads the fixture at
  ## `uncompDir & source`, runs it through `framing_format_compress` and
  ## then `framing_format_uncompress`, and checks that the bytes coming
  ## out match the bytes that went in.
  test "roundtrip " & source:
    let expected = readFile(uncompDir & source)

    # Compress the fixture into an in-memory output stream.
    var packedOut = OutputStream.init
    framing_format_compress(
      packedOut, expected.toOpenArrayByte(0, expected.len-1))
    let packed = packedOut.getOutput(string)
    debugEcho "compressed len: ", packed.len

    # Feed the compressed bytes straight back through the decoder.
    var packedIn = memoryStream(packed)
    var restoredOut = OutputStream.init
    framing_format_uncompress(packedIn, restoredOut)
    let actual = restoredOut.getOutput(string)

    check actual.len == expected.len
    # The comparison is done by hand and reported as a bare true/false;
    # presumably this keeps `check` from dumping both payloads on mismatch.
    if actual == expected:
      check true
    else:
      check false
proc main() =
suite "framing":
setup:
@ -29,4 +49,19 @@ proc main() =
check_uncompress("alice29.txt.sz-64k", "alice29.txt")
check_uncompress("house.jpg.sz", "house.jpg")
check_roundtrip("alice29.txt")
check_roundtrip("house.jpg")
check_roundtrip("html")
check_roundtrip("urls.10K")
check_roundtrip("fireworks.jpeg")
check_roundtrip("paper-100k.pdf")
check_roundtrip("html_x_4")
check_roundtrip("asyoulik.txt")
check_roundtrip("lcet10.txt")
check_roundtrip("plrabn12.txt")
check_roundtrip("geo.protodata")
check_roundtrip("kppkn.gtb")
check_roundtrip("Mark.Twain-Tom.Sawyer.txt")
main()