0.3.11, faster inflate (+ simpler)

2020-11-26 17:13:56 -06:00 · 2020-11-26 17:13:56 -06:00 · a310230eec
parent 0f8a1d1510
commit a310230eec
4 changed files with 91 additions and 164 deletions
--- a/README.md
+++ b/README.md
@@ -131,12 +131,12 @@ Each file is uncompressed 1000 times:
 **https://github.com/guzba/zippy** results:
 File | Time
 --- | ---:
-alice29.txt | 0.7105s
-urls.10K | 3.2775s
-rfctest3.gold | 0.1618s
+alice29.txt | 0.5906s
+urls.10K | 2.9935s
+rfctest3.gold | 0.1405s
 randtest3.gold | 0.0373s
 paper-100k.pdf | 0.6453s
-geo.protodata | 0.2298s
+geo.protodata | 0.2070s

 https://github.com/nim-lang/zip results: (Requires zlib1.dll)
 File | Time
--- a/src/zippy/bitstreams.nim
+++ b/src/zippy/bitstreams.nim
@@ -1,18 +1,5 @@
 import zippyerror

-const
-  masks = [
-    0b00000000.uint8,
-    0b00000001,
-    0b00000011,
-    0b00000111,
-    0b00001111,
-    0b00011111,
-    0b00111111,
-    0b01111111,
-    0b11111111,
-  ]
-
 type
  BitStream* = object
    bytePos*, bitPos*: int
@@ -44,31 +31,25 @@ template checkBytePos*(b: BitStream) =
  if b.bytePos >= b.data.len:
    failEndOfBuffer()

-func read(b: var BitStream, bits: int): uint8 =
-  assert bits <= 8
-
+func readBits*(b: var BitStream, bits: int): uint16 =
  b.checkBytePos()

-  result = b.data[b.bytePos] shr b.bitPos
-
-  let bitsLeftInByte = 8 - b.bitPos
-  if bitsLeftInByte >= bits:
-    b.movePos(bits)
-    result = result and masks[bits]
-  else:
-    let bitsNeeded = bits - bitsLeftInByte
-    b.incPos()
-    b.checkBytePos()
-    result = result or
-      ((b.data[b.bytePos] and masks[bitsNeeded]) shl bitsLeftInByte)
-    inc(b.bitPos, bitsNeeded)
-
-func readBits*(b: var BitStream, bits: int): uint16 =
  assert bits <= 16

-  result = b.read(min(bits, 8)).uint16
-  if bits > 8:
-    result = result or (b.read(bits - 8).uint16 shl 8)
+  result = b.data[b.bytePos].uint16 shr b.bitPos
+  let numBits = 8 - b.bitPos
+
+  # Fill result up
+  if b.bytePos + 1 < b.data.len:
+    result = result or (b.data[b.bytePos + 1].uint16 shl numBits)
+  if b.bytePos + 2 < b.data.len:
+    result = result or (b.data[b.bytePos + 2].uint16 shl (numBits + 8))
+
+  # Mask out any bits past requested bit length
+  result = result and ((1 shl bits) - 1).uint16
+
+  b.bytePos += (bits + b.bitPos) shr 3
+  b.bitPos = (bits + b.bitPos) and 7

 func skipBits*(b: var BitStream, bits: int) =
  var bitsLeftToSkip = bits
@@ -79,17 +60,6 @@ func skipBits*(b: var BitStream, bits: int) =
      dec(bitsLeftToSkip, skipping)
      b.movePos(skipping)

-func peekBits*(b: var BitStream, bits: int): uint16 =
-  let
-    bytePos = b.bytePos
-    bitPos = b.bitPos
-
-  result = b.readBits(bits)
-
-  # Restore these values after reading
-  b.bytePos = bytePos
-  b.bitPos = bitPos
-
 func skipRemainingBitsInCurrentByte*(b: var BitStream) =
  if b.bitPos > 0:
    b.incPos()
--- a/src/zippy/inflate.nim
+++ b/src/zippy/inflate.nim
@@ -1,118 +1,71 @@
 import bitstreams, common, zippyerror

 const
-  huffmanChunkBits = 9
-  huffmanNumChunks = 1 shl huffmanChunkBits
-  huffmanCountMask = 15
-  huffmanValueShift = 4
+  fastBits = 9
+  fastMask = (1 shl 9) - 1

 type
  Huffman = object
-    minCodeLength, maxCodeLength: uint8
-    chunks: array[huffmanNumChunks, uint16]
-    links: seq[array[64, uint16]]
-    linkMask: uint16
+    firstCode, firstSymbol: array[16, uint16]
+    maxCodes: array[17, int]
+    lengths: array[288, uint8]
+    values: array[288, uint16]
+    fast: array[1 shl 9, uint16]

 when defined(release):
  {.push checks: off.}

+func reverse16Bits(n: int): int {.inline.} =
+  result = n
+  result = ((result and 0xAAAA) shr 1) or ((result and 0x5555) shl 1)
+  result = ((result and 0xCCCC) shr 2) or ((result and 0x3333) shl 2)
+  result = ((result and 0xF0F0) shr 4) or ((result and 0x0F0F) shl 4)
+  result = ((result and 0xFF00) shr 8) or ((result and 0x00FF) shl 8)
+
+func reverseBits(n, bits: int): int {.inline.} =
+  assert bits <= 16
+  reverse16Bits(n) shr (16 - bits)
+
 func initHuffman(lengths: seq[uint8], maxCodes: int): Huffman =
  ## See https://raw.githubusercontent.com/madler/zlib/master/doc/algorithm.txt

-  var
-    counts: array[maxCodeLength + 1, uint16]
-    numCodes: int
+  var sizes: array[17, int]
+  for i in 0 ..< lengths.len:
+    inc sizes[lengths[i]]
+  sizes[0] = 0

-  result.minCodeLength = uint8.high
-
-  for _, n in lengths:
-    if n == 0:
-      continue
-    inc counts[n]
-    inc numCodes
-    result.minCodeLength = min(n, result.minCodeLength)
-    result.maxCodeLength = max(n, result.maxCodeLength)
-
-  if result.maxCodeLength == 0 or
-    result.maxCodeLength > maxCodeLength or
-    numCodes > maxCodes:
-    failUncompress()
+  for i in 1 ..< 16:
+    if sizes[i] > (1 shl i):
+      failUncompress()

  var
-    code: uint16
-    nextCode: array[maxCodeLength + 1, uint16]
-  for i in result.minCodeLength .. result.maxCodeLength:
-    code = code shl 1
+    code, k: int
+    nextCode: array[16, int]
+  for i in 1 ..< 16:
    nextCode[i] = code
-    code += counts[i]
+    result.firstCode[i] = code.uint16
+    result.firstSymbol[i] = k.uint16
+    code = code + sizes[i]
+    if sizes[i] > 0 and code - 1 >= (1 shl i):
+      failUncompress()
+    result.maxCodes[i] = (code shl (16 - i))
+    code = code shl 1
+    k += sizes[i]

-  # if code != (1.uint16 shl result.maxCodeLength) and
-  #   not (code == 1 and result.maxCodeLength == 1):
-  #   debugEcho code, " ", result.maxCodeLength, " ", result.minCodeLength
-  #   failUncompress()
+  result.maxCodes[16] = 1 shl 16

-  if result.maxCodeLength > huffmanChunkBits:
-    let numLinks = 1.uint16 shl (result.maxCodeLength - huffmanChunkBits)
-    result.linkMask = numLinks - 1
-
-    let link = nextCode[huffmanChunkBits + 1] shr 1
-    result.links.setLen(huffmanNumChunks - link)
-    for i in link ..< huffmanNumChunks:
-      let
-        reverse = reverseUint16(i.uint16, huffmanChunkBits)
-        offset = i - link
-      when not defined(release):
-        if result.chunks[reverse] != 0:
-          raise newException(ZippyError, "Overwriting chunk")
-      result.chunks[reverse] = (
-        (offset shl huffmanValueShift) or huffmanChunkBits + 1
-      ).uint16
-      # result.links[offset].setLen(numLinks)
-
-  for i, n in lengths:
-    if n == 0:
-      continue
-
-    let
-      code = nextCode[n]
-      chunk = (i.uint16 shl huffmanValueShift) or n
-      reverse = reverseUint16(code, n)
-    inc nextCode[n]
-    if n <= huffmanChunkBits:
-      for offset in countup(reverse.int, result.chunks.high, 1 shl n):
-        when not defined(release):
-          if result.chunks[offset] != 0:
-            raise newException(ZippyError, "Overwriting chunk")
-        result.chunks[offset] = chunk
-    else:
-      let
-        j = reverse and (huffmanNumChunks - 1)
-        value = result.chunks[j] shr huffmanValueShift
-        reverseShifted = reverse shr huffmanChunkBits
-      when not defined(release):
-        if (result.chunks[j] and huffmanCountMask) != huffmanChunkBits + 1:
-          raise newException(ZippyError, "Not an indirect chunk")
-      for offset in countup(
-        reverseShifted.int,
-        result.links[value].high,
-        1 shl (n - huffmanChunkBits)
-      ):
-        when not defined(release):
-          if result.links[value][offset] != 0:
-            raise newException(ZippyError, "Overwriting chunk")
-        result.links[value][offset] = chunk
-
-  # when not defined(release):
-  #   for i, chunk in result.chunks:
-  #     if chunk == 0:
-  #       if code == 1 and i mod 2 == 1:
-  #         continue
-  #       raise newException(ZippyError, "Missing chunk")
-
-  #   for i in 0 ..< result.links.len:
-  #     for _, chunk in result.links[i]:
-  #       if chunk == 0:
-  #         raise newException(ZippyError, "Missing chunk")
+  for i, len in lengths:
+    if len > 0:
+      let symbolId = nextCode[len] - result.firstCode[len].int + result.firstSymbol[len].int
+      result.lengths[symbolId] = len
+      result.values[symbolId] = i.uint16
+      if len <= fastBits:
+        let fast = (len.uint16 shl 9) or i.uint16
+        var k = reverseBits(nextCode[len], len.int)
+        while k < (1 shl fastBits):
+          result.fast[k] = fast
+          k += (1 shl len)
+      inc nextCode[len]

 func decodeSymbol(b: var BitStream, h: Huffman): uint16 {.inline.} =
  ## See https://raw.githubusercontent.com/madler/zlib/master/doc/algorithm.txt
@@ -121,33 +74,37 @@ func decodeSymbol(b: var BitStream, h: Huffman): uint16 {.inline.} =
  b.checkBytePos()

  var
-    bits = b.data[b.bytePos].uint16 shr b.bitPos
+    bits = b.data[b.bytePos].int shr b.bitPos
    numBits = 8 - b.bitPos

  # Fill bits up since we know codes must be between 1 and 15 bits long
  if b.bytePos + 1 < b.data.len:
-    bits = bits or (b.data[b.bytePos + 1].uint16 shl numBits)
+    bits = bits or (b.data[b.bytePos + 1].int shl numBits)
  if b.bytePos + 2 < b.data.len:
-    bits = bits or (b.data[b.bytePos + 2].uint16 shl (numBits + 8))
+    bits = bits or (b.data[b.bytePos + 2].int shl (numBits + 8))

-  numBits = 15
+  let fast = h.fast[bits and fastMask]
+  var len: int
+  if fast > 0:
+    len = (fast.int shr 9)
+    result = fast and 511
+  else: # Slow path
+    let k = reverse16Bits(bits)
+    len = fastBits + 1
+    while len < h.maxCodes.len:
+      if k < h.maxCodes[len]:
+        break
+      inc len

-  var
-    chunk = h.chunks[bits and (huffmanNumChunks - 1)]
-    n = (chunk and huffmanCountMask).int
-  if n > huffmanChunkBits:
-    chunk = h.links[
-      chunk shr huffmanValueShift][(bits shr huffmanChunkBits) and h.linkMask
-    ]
-    n = (chunk and huffmanCountMask).int
+    if len == 16:
+      failUncompress()

-  if n == 0 or n > numBits:
-    failUncompress()
+    let symbolId =
+      (k shr (16 - len)) - h.firstCode[len].int + h.firstSymbol[len].int
+    result = h.values[symbolId]

-  inc(b.bytePos, (n + b.bitPos) shr 3)
-  b.bitPos = (n + b.bitPos) and 7
-
-  chunk shr huffmanValueShift
+  b.bytePos += (len + b.bitPos) shr 3
+  b.bitPos = (len + b.bitPos) and 7

 func inflateBlock(b: var BitStream, dst: var seq[uint8], fixedCodes: bool) =
  var literalHuffman, distanceHuffman: Huffman
--- a/zippy.nimble
+++ b/zippy.nimble
@@ -1,4 +1,4 @@
-version       = "0.3.10"
+version       = "0.3.11"
 author        = "Ryan Oldenburg"
 description   = "Pure Nim implementation of deflate, zlib, gzip and zip."
 license       = "MIT"