import
  stew/[leb128, ranges/ptr_arith],
  faststreams/[inputs, outputs, buffers, multisync],
  snappy/types

export
  types

const
  # The two low bits of every chunk's first byte select the chunk type.
  tagLiteral* = 0x00
  tagCopy1* = 0x01
  tagCopy2* = 0x02
  tagCopy4* = 0x03

  inputMargin = 16 - 1

# load32 assembles the first 4 bytes of `b` into a uint32, least significant
# byte first.
func load32(b: openArray[byte]): uint32 {.inline.} =
  result = uint32(b[0]) or
    (uint32(b[1]) shl 8 ) or
    (uint32(b[2]) shl 16) or
    (uint32(b[3]) shl 24)

# load32 reads the 4 bytes b[i .. i+3] (`toOpenArray` bounds are inclusive).
func load32(b: openArray[byte], i: int): uint32 =
  result = load32(b.toOpenArray(i, i + 4 - 1))

# load64 assembles the first 8 bytes of `b` into a uint64, least significant
# byte first.
func load64(b: openArray[byte]): uint64 {.inline.} =
  result = uint64(b[0]) or
    (uint64(b[1]) shl 8 ) or
    (uint64(b[2]) shl 16) or
    (uint64(b[3]) shl 24) or
    (uint64(b[4]) shl 32) or
    (uint64(b[5]) shl 40) or
    (uint64(b[6]) shl 48) or
    (uint64(b[7]) shl 56)

# load64 reads the 8 bytes b[i .. i+7] (`toOpenArray` bounds are inclusive).
func load64(b: openArray[byte], i: int): uint64 =
  result = load64(b.toOpenArray(i, i + 8 - 1))

# emitLiteral writes a literal chunk.
#
# It assumes that:
#  1 <= len(lit) and len(lit) <= 65536
proc emitLiteral(s: OutputStream, lit: openarray[byte]) =
  # The format stores the literal length minus one.
  let n = lit.len - 1

  if n < 60:
    # Short literal: the length fits in the tag byte itself.
    s.write (byte(n) shl 2) or tagLiteral
  elif n < (1 shl 8):
    # Marker 60: one extra length byte follows the tag.
    s.write (60 shl 2) or tagLiteral
    s.write byte(n and 0xFF)
  else:
    # Marker 61: two extra length bytes follow, low byte first.
    s.write (61 shl 2) or tagLiteral
    s.write byte(n and 0xFF)
    s.write byte((n shr 8) and 0xFF)

  s.writeAndWait lit

# emitCopy writes a copy chunk.
#
# It assumes that:
#  1 <= offset and offset <= 65535
#  4 <= length and length <= 65535
proc emitCopy(s: OutputStream, offset, length: int) =
  var length = length

  # The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
  # threshold for this loop is a little higher (at 68 = 64 + 4), and the
  # length emitted down below is a little lower (at 60 = 64 - 4), because
  # it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
  # by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
  # a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
  # 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
  # tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
  # encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
  while length >= 68:
    # Emit a length 64 copy, encoded as 3 bytes.
    s.write (63 shl 2) or tagCopy2
    s.write byte(offset and 0xFF)
    s.write byte((offset shr 8) and 0xFF)
    dec(length, 64)

  if length > 64:
    # Emit a length 60 copy, encoded as 3 bytes.
    s.write (59 shl 2) or tagCopy2
    s.write byte(offset and 0xFF)
    s.write byte((offset shr 8) and 0xFF)
    dec(length, 60)

  if (length >= 12) or (offset >= 2048):
    # Emit the remaining copy, encoded as 3 bytes.
    s.write byte((((length-1) shl 2) or tagCopy2) and 0xFF)
    s.write byte(offset and 0xFF)
    s.write byte((offset shr 8) and 0xFF)
    return

  # Emit the remaining copy, encoded as 2 bytes: the tag byte carries the
  # high bits of the offset together with (length - 4).
  let tagByte = ((offset shr 8) shl 5) or ((length-4) shl 2) or tagCopy1
  s.write byte(tagByte and 0xFF)
  s.write byte(offset and 0xFF)

when false:
  # extendMatch returns the largest k such that k <= len(src) and that
  # src[i:i+k-j] and src[j:k] have the same contents.
  #
  # It assumes that:
  #  0 <= i and i < j and j <= len(src)
  func extendMatch(src: openArray[byte], i, j: int): int =
    var
      i = i
      j = j
    while j < src.len and src[i] == src[j]:
      inc i
      inc j
    result = j

# hash multiplies `u` by a fixed constant and keeps the bits above `shift`;
# encodeBlock uses the result (masked) as a hash table index.
func hash(u, shift: uint32): uint32 =
  result = (u * 0x1e35a7bd) shr shift

# encodeBlock encodes a non-empty src to the output stream. It assumes that
# the varint-encoded length of the decompressed bytes has already been
# written.
#
# It also assumes that:
#  minNonLiteralBlockSize <= len(src) and len(src) <= maxBlockSize
proc encodeBlock(output: OutputStream, src: openArray[byte]) =
  # Initialize the hash table. Its size ranges from 1shl8 to 1shl14 inclusive.
  # The table element type is uint16, as s < sLimit and sLimit < len(src)
  # and len(src) <= maxBlockSize and maxBlockSize == 65536.
  const
    maxTableSize = 1 shl 14
    # tableMask is redundant, but helps the compiler eliminate bounds
    # checks.
    tableMask = maxTableSize - 1

  var
    shift = 32 - 8
    tableSize = 1 shl 8

  while tableSize < maxTableSize and tableSize < src.len:
    tableSize = tableSize * 2
    dec shift

  # In Nim, all array elements are zero-initialized, so there is no advantage
  # to a smaller tableSize per se. However, it matches the C++ algorithm,
  # and in the asm versions of this code, we can get away with zeroing only
  # the first tableSize elements.
  var table: array[maxTableSize, uint16]

  # sLimit is when to stop looking for offset/length copies. The inputMargin
  # lets us use a fast path for emitLiteral in the main loop, while we are
  # looking for copies.
  let sLimit = src.len - inputMargin

  # nextEmit is where in src the next emitLiteral should start from.
  var nextEmit = 0

  # The encoded form must start with a literal, as there are no previous
  # bytes to copy, so we start looking for hash matches at s == 1.
  var s = 1
  var nextHash = hash(load32(src, s), shift.uint32)

  # Flushes whatever has not been emitted yet as one literal and leaves
  # encodeBlock (the `return` is expanded into the enclosing proc).
  template emitRemainder(): untyped =
    if nextEmit < src.len:
      emitLiteral(output, src.toOpenArray(nextEmit, src.high))
    return

  while true:
    # Copied from the C++ snappy implementation:
    #
    # Heuristic match skipping: If 32 bytes are scanned with no matches
    # found, start looking only at every other byte. If 32 more bytes are
    # scanned (or skipped), look at every third byte, etc.. When a match
    # is found, immediately go back to looking at every byte. This is a
    # small loss (~5% performance, ~0.1% density) for compressible data
    # due to more bookkeeping, but for non-compressible data (such as
    # JPEG) it's a huge win since the compressor quickly "realizes" the
    # data is incompressible and doesn't bother looking for matches
    # everywhere.
    #
    # The "skip" variable keeps track of how many bytes there are since
    # the last match; dividing it by 32 (ie. right-shifting by five) gives
    # the number of bytes to move ahead for each iteration.
    var skip = 32
    var nextS = s
    var candidate = 0

    while true:
      s = nextS
      let bytesBetweenHashLookups = skip shr 5
      nextS = s + bytesBetweenHashLookups
      inc(skip, bytesBetweenHashLookups)
      if nextS > sLimit:
        emitRemainder()
      candidate = int(table[nextHash and tableMask])
      table[nextHash and tableMask] = uint16(s)
      nextHash = hash(load32(src, nextS), shift.uint32)
      if load32(src, s) == load32(src, candidate):
        break

    # A 4-byte match has been found. We'll later see if more than 4 bytes
    # match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
    # them as literal bytes.
    output.emitLiteral src.toOpenArray(nextEmit, s - 1)

    # Call emitCopy, and then see if another emitCopy could be our next
    # move. Repeat until we find no match for the input immediately after
    # what was consumed by the last emitCopy call.
    #
    # If we exit this loop normally then we need to call emitLiteral next,
    # though we don't yet know how big the literal will be. We handle that
    # by proceeding to the next iteration of the main loop. We also can
    # exit this loop via emitRemainder if we get close to exhausting the
    # input.
    while true:
      # Invariant: we have a 4-byte match at s, and no need to emit any
      # literal bytes prior to s.
      let base = s

      # Extend the 4-byte match as long as possible.
      #
      # This is an inlined version of:
      #  s = extendMatch(src, candidate+4, s+4)
      inc(s, 4)
      var i = candidate + 4
      while s < src.len and src[i] == src[s]:
        inc i
        inc s

      output.emitCopy(base-candidate, s-base)
      nextEmit = s
      if s >= sLimit:
        emitRemainder()

      # We could immediately start working at s now, but to improve
      # compression we first update the hash table at s-1 and at s. If
      # another emitCopy is not our next move, also calculate nextHash
      # at s+1. At least on ARCH=amd64, these three hash calculations
      # are faster as one load64 call (with some shifts) instead of
      # three load32 calls.
      let x = load64(src, s-1)
      let prevHash = hash(uint32(x shr 0), shift.uint32)
      table[prevHash and tableMask] = uint16(s - 1)
      let currHash = hash(uint32(x shr 8), shift.uint32)
      candidate = int(table[currHash and tableMask])
      table[currHash and tableMask] = uint16(s)
      if uint32(x shr 8) != load32(src, candidate):
        nextHash = hash(uint32(x shr 16), shift.uint32)
        inc s
        break

const
  decodeErrCodeCorrupt = 1
  decodeErrCodeUnsupportedLiteralLength = 2

# decode expands the chunk stream in `src` (the bytes that follow the varint
# length header) into `dst`.
#
# It returns 0 when the chunks fill `dst` exactly, decodeErrCodeCorrupt when
# a chunk is truncated, refers outside the bytes produced so far or does not
# fit `dst`, and decodeErrCodeUnsupportedLiteralLength when a literal length
# is not positive after conversion to int.
func decode(dst: var openArray[byte], src: openArray[byte]): int =
  var
    d = 0       # write position in dst
    s = 0       # read position in src
    offset = 0
    length = 0

  while s < src.len:
    let tag = src[s] and 0x03
    case tag
    of tagLiteral:
      # The upper six bits hold (length - 1) directly, or a marker 60..63
      # meaning that 1..4 little-endian length bytes follow the tag.
      var x = int(src[s]) shr 2
      if x < 60:
        inc s
      elif x == 60:
        inc(s, 2)
        if s > src.len:
          return decodeErrCodeCorrupt
        x = int(src[s-1])
      elif x == 61:
        inc(s, 3)
        if s > src.len:
          return decodeErrCodeCorrupt
        x = int(src[s-2]) or (int(src[s-1]) shl 8)
      elif x == 62:
        inc(s, 4)
        if s > src.len:
          return decodeErrCodeCorrupt
        x = int(src[s-3]) or (int(src[s-2]) shl 8) or (int(src[s-1]) shl 16)
      elif x == 63:
        inc(s, 5)
        if s > src.len:
          return decodeErrCodeCorrupt
        x = int(src[s-4]) or (int(src[s-3]) shl 8) or
          (int(src[s-2]) shl 16) or (int(src[s-1]) shl 24)

      length = x + 1
      if length <= 0:
        return decodeErrCodeUnsupportedLiteralLength

      if (length > (dst.len-d)) or (length > (src.len-s)):
        return decodeErrCodeCorrupt

      copyMem(addr dst[d], unsafeAddr src[s], length)
      inc(d, length)
      inc(s, length)
      continue

    of tagCopy1:
      inc(s, 2)
      if s > src.len:
        return decodeErrCodeCorrupt
      length = 4 + ((int(src[s-2]) shr 2) and 0x07)
      offset = ((int(src[s-2]) and 0xe0) shl 3) or int(src[s-1])

    of tagCopy2:
      s += 3
      if s > src.len:
        return decodeErrCodeCorrupt
      length = 1 + (int(src[s-3]) shr 2)
      offset = int(src[s-2]) or (int(src[s-1]) shl 8)

    of tagCopy4:
      s += 5
      if s > src.len:
        return decodeErrCodeCorrupt
      length = 1 + (int(src[s-5]) shr 2)
      offset = int(src[s-4]) or (int(src[s-3]) shl 8) or
        (int(src[s-2]) shl 16) or (int(src[s-1]) shl 24)

    else: discard

    if offset <= 0 or d < offset or (length > (dst.len-d)):
      return decodeErrCodeCorrupt

    # Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
    # a bulk memory copy, this byte-by-byte copy always runs forwards, even
    # if the slices overlap. Conceptually, this is:
    #
    # d += forwardCopy(dst[d:d+length], dst[d-offset:])
    let stop = d + length
    while d != stop:
      dst[d] = dst[d-offset]
      inc d

  if d != dst.len:
    return decodeErrCodeCorrupt
  return 0

# minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
# could be encoded with a copy tag. This is the minimum with respect to the
# algorithm used by encodeBlock, not a minimum enforced by the file format.
#
# The encoded output must start with at least a 1 byte literal, as there are
# no previous bytes to copy. A minimal (1 byte) copy after that, generated
# from an emitCopy call in encodeBlock's main loop, would require at least
# another inputMargin bytes, for the reason above: we want any emitLiteral
# calls inside encodeBlock's main loop to use the fast path if possible, which
# requires being able to overrun by inputMargin bytes. Thus,
# minNonLiteralBlockSize equals 1 + 1 + inputMargin.
#
# The C++ code doesn't use this exact threshold, but it could, as discussed at
# https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
# The difference between Nim (2+inputMargin) and C++ (inputMargin) is purely an
# optimization. It should not affect the encoded form. This is tested by
# TestSameEncodingAsCppShortCopies.
const minNonLiteralBlockSize = 1 + 1 + inputMargin

# appendSnappyBytes writes the encoded form of src to the stream `s`: the
# varint-encoded length of src, followed by the chunks produced for each
# successive slice of at most maxBlockSize bytes.
#
# An empty src produces only the length header.
proc appendSnappyBytes*(s: OutputStream, src: openArray[byte]) =
  var
    lenU32 = checkInputLen(src.len)
    p = 0

  # The block starts with the varint-encoded length of the decompressed bytes.
  s.write lenU32.toBytes(Leb128).toOpenArray()

  # `toOpenArray` takes inclusive bounds, so a slice of k bytes starting at
  # `p` ends at index `p + k - 1`.
  while lenU32 > maxBlockSize.uint32:
    s.encodeBlock src.toOpenArray(p, p + maxBlockSize - 1)
    p += maxBlockSize
    lenU32 -= maxBlockSize.uint32

  # Nothing is left to encode for an empty input; emitLiteral requires at
  # least one byte.
  if lenU32 == 0:
    return

  # The `lenU32.int` expressions below cannot overflow because
  # `lenU32` is no greater than `maxBlockSize` here:
  if lenU32 < minNonLiteralBlockSize.uint32:
    s.emitLiteral src.toOpenArray(p, p + lenU32.int - 1)
  else:
    s.encodeBlock src.toOpenArray(p, p + lenU32.int - 1)

# snappyCompress reads all of `input` and writes its encoded form to
# `output`: the varint-encoded input length, one encoded block per
# maxBlockSize bytes, and finally whatever remains. `output` is closed when
# the proc exits, including when an exception propagates.
#
# The input stream must report a known length; otherwise the doAssert below
# fails.
proc snappyCompress*(input: InputStream, output: OutputStream) =
  try:
    let inputLen = input.len
    if inputLen.isSome:
      let lenU32 = checkInputLen(inputLen.get)
      output.ensureRunway maxCompressedLen(lenU32)
      output.write lenU32.toBytes(Leb128).toOpenArray()
    else:
      # TODO: This is a temporary limitation
      doAssert false, "snappy requires an input stream with a known length"

    while input.readable(maxBlockSize):
      encodeBlock(output, input.read(maxBlockSize))

    let remainingBytes = input.totalUnconsumedBytes
    if remainingBytes > 0:
      # Inputs too short to contain a copy are written as a single literal.
      if remainingBytes < minNonLiteralBlockSize:
        output.emitLiteral input.read(remainingBytes)
      else:
        output.encodeBlock input.read(remainingBytes)
  finally:
    close output

# Encode returns the encoded form of src.
func encode*(src: openarray[byte]): seq[byte] =
  ## Compresses `src` through an in-memory stream pair and returns the
  ## bytes collected by the output stream.
  # The streams used here live only in memory, so the side-effect tracking
  # is overridden to keep this routine a `func`.
  {.noSideEffect.}:
    let sink = memoryOutput()
    snappyCompress(unsafeMemoryInput(src), sink)
    sink.getOutput

func decode*(src: openArray[byte], maxSize = 0xffffffff'u32): seq[byte] =
  ## Decompresses `src` and returns the decoded bytes.
  ##
  ## An empty seq is returned when the length header cannot be read, when
  ## the declared length exceeds `maxSize`, when the declared length is zero,
  ## or when the payload fails to decode.
  let (declaredLen, headerLen) = uint32.fromBytes(src, Leb128)
  if headerLen <= 0:
    return
  if declaredLen > maxSize:
    return
  if declaredLen == 0:
    return

  when sizeof(uint) == 4:
    # A declared length above the positive `int` range cannot be allocated
    # on 32-bit targets.
    if declaredLen > 0x7fffffff'u32:
      return

  # `declaredLen.int` cannot overflow because of the extra check above
  result = newSeq[byte](declaredLen.int)
  if decode(result, src.toOpenArray(headerLen, src.len - 1)) != 0:
    result = @[]

proc snappyUncompress*(src: openArray[byte],
                       dst: var openArray[byte]): uint32 =
  ## Decompresses `src` into the caller-provided buffer `dst`.
  ##
  ## Returns the number of decoded bytes. Zero is returned when the length
  ## header cannot be read, when `dst` is smaller than the declared length,
  ## when the declared length is zero, or when the payload fails to decode.
  let (declaredLen, headerLen) = uint32.fromBytes(src, Leb128)
  if headerLen <= 0:
    return 0
  if declaredLen.BiggestUInt > dst.len.BiggestUInt:
    return 0
  if declaredLen == 0:
    return 0

  # `declaredLen.int` cannot overflow here, because we've already
  # checked that it's no larger than `dst.len`, which is an int.
  let status = decode(dst.toOpenArray(0, declaredLen.int - 1),
                      src.toOpenArray(headerLen, src.len - 1))
  if status != 0:
    return 0

  return declaredLen