allow skipping crc32 integrity check (#22)

Some data is already protected by stronger checks. CRC32, on the other
hand, significantly slows down framed reading - i.e. it is about 2.5x slower:

```
118.853 / 41.781, 129.115 /  0.000, 188.438 /  0.000,  90.565 / 44.371,           50,    115613038, state-6800000-488b7150-d613b584.ssz
186.600 / 97.202, 191.935 /123.325,   0.000 /  0.000,   0.000 /  0.000,           50,    115613038, state-6800000-488b7150-d613b584.ssz(framed)
```

The difference between unframed and framed decoding is the CRC32 check -
it takes ~50 ms on a decent laptop for a 110 MB file.
This commit is contained in:
Jacek Sieka 2023-07-25 17:50:36 +02:00 committed by GitHub
parent e36f19d886
commit ecbcee1d10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 73 additions and 31 deletions

1
.gitignore vendored
View File

@ -7,3 +7,4 @@ master
# Fuzzer-generated files
crash-*
build/

View File

@ -106,7 +106,7 @@ func uncompress*(input: openArray[byte], output: var openArray[byte]):
if written.uint64 != lenU32:
return err(CodecError.invalidInput) # Header does not match content
return ok(written)
ok(written)
func decode*(input: openArray[byte], maxSize = maxUncompressedLen): seq[byte] =
## Decode input returning the uncompressed output. On error, return an empty
@ -117,7 +117,7 @@ func decode*(input: openArray[byte], maxSize = maxUncompressedLen): seq[byte] =
let uncompressed = uncompressedLen(input).valueOr:
return
if uncompressed > maxSize.uint64 or uncompressed > int.high.uint64:
if uncompressed > maxSize or uncompressed > int.high.uint64:
return
# TODO https://github.com/nim-lang/Nim/issues/19357
@ -166,7 +166,8 @@ func encodeFramed*(input: openArray[byte]): seq[byte] =
result.setLen(written)
func uncompressFramed*(
input: openArray[byte], output: var openArray[byte], checkHeader = true):
input: openArray[byte], output: var openArray[byte], checkHeader = true,
checkIntegrity = true):
Result[tuple[read: int, written: int], FrameError] =
## Uncompress as many frames as possible from `input` and write them to
## `output`, returning the number of bytes read and written.
@ -226,7 +227,7 @@ func uncompressFramed*(
of CodecError.invalidInput: err(FrameError.invalidInput)
return res
if maskedCrc(
if checkIntegrity and maskedCrc(
output.toOpenArray(written, written + (uncompressed - 1))) != crc:
return err(FrameError.crcMismatch)
@ -239,7 +240,8 @@ func uncompressFramed*(
let
crc = uint32.fromBytesLE input.toOpenArray(read, read + 3)
if maskedCrc(input.toOpenArray(read + 4, read + (dataLen - 1))) != crc:
if checkIntegrity and
maskedCrc(input.toOpenArray(read + 4, read + (dataLen - 1))) != crc:
return err(FrameError.crcMismatch)
let uncompressed = dataLen - 4 # dataLen includes CRC length
@ -263,7 +265,9 @@ func uncompressFramed*(
ok((read, written))
func decodeFramed*(input: openArray[byte], maxSize = int.high): seq[byte] =
func decodeFramed*(
input: openArray[byte], maxSize = int.high,
checkIntegrity = true): seq[byte] =
## Uncompress as many frames as possible from `input` and return the
## uncompressed output.
##
@ -275,13 +279,13 @@ func decodeFramed*(input: openArray[byte], maxSize = int.high): seq[byte] =
let uncompressed = uncompressedLenFramed(input).valueOr:
return
if uncompressed > maxSize.uint64 or uncompressed > int.high.uint64:
if uncompressed > maxSize.uint64:
return
# TODO https://github.com/nim-lang/Nim/issues/19357
result = newSeqUninitialized[byte](int uncompressed)
if uncompressFramed(input, result).isErr():
if uncompressFramed(input, result, checkIntegrity = checkIntegrity).isErr():
result = @[] # Empty return on error
template compress*(input: openArray[byte]): seq[byte] {.

View File

@ -196,7 +196,7 @@ func uncompressedLenFramed*(input: openArray[byte]): Opt[uint64] =
else: 0'u32 # Reserved skippable (for example framing format header)
if uncompressed > maxUncompressedFrameDataLen:
return # Uncomnpressed data has limits (for the known chunk types)
return # Uncompressed data has limits (for the known chunk types)
expected += uncompressed
read += dataLen

View File

@ -10,9 +10,10 @@ export
inputs, multisync, outputs, codec, exceptions
proc checkCrcAndAppend(
output: OutputStream, data: openArray[byte], crc: uint32): bool {.
output: OutputStream, data: openArray[byte], crc: uint32,
checkIntegrity: bool): bool {.
raises: [IOError].}=
if maskedCrc(data) == crc:
if not checkIntegrity or maskedCrc(data) == crc:
output.write(data)
return true
@ -85,7 +86,8 @@ proc compressFramed*(input: openArray[byte], output: OutputStream) {.
raises: [IOError].} =
compressFramed(unsafeMemoryInput(input), output)
proc uncompressFramed*(input: InputStream, output: OutputStream) {.
proc uncompressFramed*(
input: InputStream, output: OutputStream, checkIntegrity = true) {.
fsMultiSync, raises: [IOError, SnappyDecodingError].} =
if not input.readable(framingHeader.len):
raise newException(UnexpectedEofError, "Failed to read stream header")
@ -112,7 +114,8 @@ proc uncompressFramed*(input: InputStream, output: OutputStream) {.
uncompressed = uncompress(input.read(dataLen - 4), tmp).valueOr:
raise newException(MalformedSnappyData, "Failed to decompress content")
if not checkCrcAndAppend(Sync output, tmp.toOpenArray(0, uncompressed-1), crc):
if not checkCrcAndAppend(
Sync output, tmp.toOpenArray(0, uncompressed-1), crc, checkIntegrity):
raise newException(MalformedSnappyData, "Content CRC checksum failed")
elif id == chunkUncompressed:
@ -123,7 +126,8 @@ proc uncompressFramed*(input: InputStream, output: OutputStream) {.
raise newException(MalformedSnappyData, "Invalid frame length: " & $dataLen)
let crc = uint32.fromBytesLE(input.read(4))
if not checkCrcAndAppend(Sync output, input.read(dataLen - 4), crc):
if not checkCrcAndAppend(
Sync output, input.read(dataLen - 4), crc, checkIntegrity):
raise newException(MalformedSnappyData, "Content CRC checksum failed")
elif id < 0x80:
@ -142,6 +146,8 @@ proc uncompressFramed*(input: InputStream, output: OutputStream) {.
output.flush()
proc uncompressFramed*(input: openArray[byte], output: OutputStream) {.
proc uncompressFramed*(
input: openArray[byte], output: OutputStream, checkIntegrity = true) {.
raises: [IOError, SnappyDecodingError].} =
uncompressFramed(unsafeMemoryInput(input), output)
uncompressFramed(
unsafeMemoryInput(input), output, checkIntegrity = checkIntegrity)

View File

@ -49,7 +49,13 @@ proc readSource(sourceName: string): seq[byte] =
doAssert(size == f.readBytes(result, 0, size))
f.close()
proc streamsEncode(input: openArray[byte]): seq[byte] =
proc memEncode(input: openArray[byte]): seq[byte] {.noinline.} =
  ## In-memory (unframed) compression step used by the timed round trip:
  ## forwards `input` to `snappy.encode` and returns its output.
  ##
  ## NOTE(review): `noinline` presumably keeps this a real call so the timing
  ## is comparable with the other `noinline` wrappers - confirm.
  result = snappy.encode(input)
proc memDecode(input: openArray[byte]): seq[byte] {.noinline.} =
  ## In-memory (unframed) decompression step used by the timed round trip:
  ## forwards `input` to `snappy.decode` and returns its output.
  ##
  ## NOTE(review): `noinline` presumably keeps this a real call so the timing
  ## is comparable with the other `noinline` wrappers - confirm.
  result = snappy.decode(input)
proc streamsEncode(input: openArray[byte]): seq[byte] {.noinline.} =
let
ins = newStringStream(string.fromBytes(input))
outs = newStringStream()
@ -57,21 +63,27 @@ proc streamsEncode(input: openArray[byte]): seq[byte] =
outs.setPosition(0)
outs.readAll().toBytes() # This line is a hotspot due to missing RVO
proc faststreamsEncode(input: openArray[byte]): seq[byte] =
proc faststreamsEncode(input: openArray[byte]): seq[byte] {.noinline.} =
let
ins = unsafeMemoryInput(input)
outs = memoryOutput()
compress(ins, outs)
outs.getOutput() # This line is a hotspot due to missing RVO
proc faststreamsEncodeFramed(input: openArray[byte]): seq[byte] =
proc memEncodeFramed(input: openArray[byte]): seq[byte] {.noinline.} =
  ## In-memory framed compression step used by the timed framed round trip:
  ## forwards `input` to `snappy.encodeFramed` and returns its output.
  ##
  ## NOTE(review): `noinline` presumably keeps this a real call so the timing
  ## is comparable with the other `noinline` wrappers - confirm.
  result = snappy.encodeFramed(input)
proc memDecodeFramed(input: openArray[byte]): seq[byte] {.noinline.} =
  ## In-memory framed decompression step used by the timed framed round trip:
  ## forwards `input` to `snappy.decodeFramed` with its default arguments and
  ## returns its output.
  ##
  ## NOTE(review): `noinline` presumably keeps this a real call so the timing
  ## is comparable with the other `noinline` wrappers - confirm.
  result = snappy.decodeFramed(input)
proc faststreamsEncodeFramed(input: openArray[byte]): seq[byte] {.noinline.} =
let
ins = unsafeMemoryInput(input)
outs = memoryOutput()
compressFramed(ins, outs)
outs.getOutput() # This line is a hotspot due to missing RVO
proc faststreamsDecodeFramed(input: openArray[byte]): seq[byte] =
proc faststreamsDecodeFramed(input: openArray[byte]): seq[byte] {.noinline.} =
let
ins = unsafeMemoryInput(input)
outs = memoryOutput()
@ -87,9 +99,9 @@ proc timedRoundTrip(msg: string, source: openArray[byte], iterations = 100) =
for i in 0..<iterations:
timeit(timers.inMemory[0]):
let encodedWithSnappy = snappy.encode(source)
let encodedWithSnappy = memEncode(source)
timeit(timers.inMemory[1]):
let decodedWithSnappy = snappy.decode(encodedWithSnappy)
let decodedWithSnappy = memDecode(encodedWithSnappy)
timeit(timers.fastStreams[0]):
let encodedWithFastStreams = faststreamsEncode(source)
@ -122,9 +134,9 @@ proc timedRoundTripFramed(msg: string, source: openArray[byte], iterations = 100
for i in 0..<iterations:
timeit(timers.inMemory[0]):
let encodedWithSnappy = snappy.encodeFramed(source)
let encodedWithSnappy = memEncodeFramed(source)
timeit(timers.inMemory[1]):
let decodedWithSnappy = snappy.decodeFramed(encodedWithSnappy)
let decodedWithSnappy = memDecodeFramed(encodedWithSnappy)
timeit(timers.fastStreams[0]):
let encodedWithFastStreams = faststreamsEncodeFramed(source)
@ -163,6 +175,6 @@ roundTrip(dataDir & "geo.protodata")
roundTrip(dataDir & "kppkn.gtb")
roundTrip(dataDir & "Mark.Twain-Tom.Sawyer.txt")
# ncli_db --db:db dumpState 0x114a593d248af2ad05580299b803657d4b78a3b6578f47425cc396c9644e800e 2560000
if fileExists(dataDir & "state-2560000-114a593d-0d5e08e8.ssz"):
roundTrip(dataDir & "state-2560000-114a593d-0d5e08e8.ssz", 50)
# ncli_db --db:db rewindState 0x488b7150f092949f1dfc3137c4e2909a20fe9739d67a5185d75dbd0440c51edd 6800000
if fileExists(dataDir & "state-6800000-488b7150-d613b584.ssz"):
roundTrip(dataDir & "state-6800000-488b7150-d613b584.ssz", 50)

View File

@ -94,15 +94,15 @@ proc checkInvalidFramed(payload: openArray[byte], uncompressedLen: int) =
check uncompressedLenFramed(payload).isNone
proc checkValidFramed(payload: openArray[byte], expected: openArray[byte]) =
proc checkValidFramed(payload: openArray[byte], expected: openArray[byte], checkIntegrity = true) =
var tmp = newSeqUninitialized[byte](expected.len)
check:
decodeFramed(payload) == expected
uncompressFramed(payload, tmp).get() == (payload.len, expected.len)
decodeFramed(payload, checkIntegrity = checkIntegrity) == expected
uncompressFramed(payload, tmp, checkIntegrity = checkIntegrity).get() == (payload.len, expected.len)
tmp == expected
var output = memoryOutput()
uncompressFramed(unsafeMemoryInput(payload), output)
uncompressFramed(unsafeMemoryInput(payload), output, checkIntegrity = checkIntegrity)
check:
output.getOutput() == expected
@ -176,6 +176,25 @@ suite "framing":
checkValidFramed(framed, data)
checkValidFramed(framedCompressed, data)
test "checkIntegrity false":
# Hand-builds two framed streams - one with an uncompressed chunk, one with a
# compressed chunk - whose 4-byte CRC field is all zeros, then passes each to
# `checkValidFramed` with `checkIntegrity = false`, expecting `data` back.
# NOTE(review): presumably the zeroed CRC does not match the real checksum of
# `data`, so these frames would be rejected with integrity checking enabled -
# confirm.
let
# Zero-filled payload of `maxUncompressedFrameDataLen` bytes.
data = newSeq[byte](maxUncompressedFrameDataLen)
compressed = snappy.encode(data)
# Layout: stream header, chunk type byte, 3-byte little-endian length
# (payload length + 4 for the CRC field), zeroed CRC, raw payload.
framed =
@framingHeader & @[byte chunkUncompressed] &
@((data.len + 4).uint32.toBytesLE().toOpenArray(0, 2)) &
@([byte 0, 0, 0, 0]) &
data
# Same layout, but the chunk carries the snappy-compressed payload.
framedCompressed =
@framingHeader & @[byte chunkCompressed] &
@((compressed.len + 4).uint32.toBytesLE().toOpenArray(0, 2)) &
@([byte 0, 0, 0, 0]) &
compressed
# Both streams are expected to yield `data` when the CRC check is skipped.
checkValidFramed(framed, data, checkIntegrity = false)
checkValidFramed(framedCompressed, data, checkIntegrity = false)
test "invalid header":
checkInvalidFramed([byte 3, 2, 1, 0], 0)