nimbus-eth1/tests/replay/gunzip.nim

# Nimbus
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Licensed under either of
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
#    http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
#    http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed except
# according to those terms.

import
  std/strutils,
  stew/results,
  zlib

# Size of the buffer `open()` reads the start of the file into in order
# to parse the gzip header
const lineBufStrLen = 512

# Capacity of the scratch buffer receiving the output of a single
# inflate step
const outBufSize = 2048

type
  GUnzip = object
    ## Descriptor holding the complete state for reading a gzipped file,
    ## either chunk by chunk (`nextChunk()`) or line by line (`nextLine()`).

    # zlib stream descriptor handed to `inflateInit2()`, `inflate()` and
    # `inflateEnd()`
    mz: ZStream

    # fields used in explode()

    # Input bytes not yet consumed by the inflater, they are prepended to
    # the data of the next explode() call
    inCache: string
    # Running total of input bytes consumed by the inflater
    inCount: uint
    # Scratch buffer receiving the output of a single inflate step
    outBuf: array[outBufSize,char]
    # Running total of unzipped bytes produced
    outCount: uint
    # Set once the inflater reported `Z_STREAM_END`
    outDoneOK: bool

    # fields used by nextChunk()

    # Handle of the gzipped input file
    gzIn: File
    # True while `gzIn` refers to an open file
    gzOpenOK: bool
    # File size less the 8 trailer bytes, i.e. the end of the compressed
    # data
    gzMax: int64
    # Number of leading file bytes fetched so far (never more than `gzMax`)
    gzCount: int64
    # File name stored in the gzip header (FNAME field), if any
    gzName: string

    # fields used by nextLine()

    # Lines split off the unzipped data, the last entry might be incomplete
    lnList: seq[string]
    # Index into `lnList` of the entry to be returned next
    lnInx: int

{.push raises: [].}

# ------------------------------------------------------------------------------
# Private deflate helpers:
# ------------------------------------------------------------------------------

proc explode(state: var GUnzip; data: openArray[char];
             start, dataLen: int): Result[string,ZError] =
  ## Inflate the `dataLen` bytes of `data` beginning at index `start`,
  ## preceded by any input left over from the previous call, and return the
  ## unzipped result.
  ##
  ## Input which the inflater did not consume is stored in `state.inCache`
  ## for the next call. The counters `state.inCount` and `state.outCount`
  ## are updated. When the inflater reports `Z_STREAM_END` the stream is
  ## shut down with `inflateEnd()` and `state.outDoneOK` is set. A zlib
  ## status other than `Z_OK` is passed back as error.
  var
    inBuf = state.inCache & data[start ..< start + dataLen].join
    outData = ""
    rc: ZError = Z_OK

  # Nothing to inflate. This also keeps `inBuf[0]` below from indexing
  # into an empty string.
  if inBuf.len == 0:
    return ok(outData)

  state.mz.next_in  = cast[ptr uint8](inBuf[0].addr)
  state.mz.total_in = 0
  state.mz.avail_in = inBuf.len.cuint

  while not state.outDoneOK and 0 < state.mz.avail_in:
    # Each round starts with an empty output buffer, so `total_out` is the
    # number of bytes produced by this round only
    state.mz.next_out = cast[ptr uint8](state.outBuf[0].addr)
    state.mz.avail_out = state.outBuf.len.cuint
    state.mz.total_out = 0

    # Save input state to compare with later on
    let availIn = state.mz.avail_in

    # Inflate current block next_in[] => next_out[]
    rc = state.mz.inflate(Z_SYNC_FLUSH)
    if rc == Z_STREAM_END:
      state.outDoneOK = true
      rc = state.mz.inflateEnd
    if rc != Z_OK:
      break

    # Append processed data
    if 0 < state.mz.total_out:
      outData &= toOpenArray(state.outBuf, 0, state.mz.total_out-1).join
      state.outCount += state.mz.total_out.uint

    # Stop unless state change
    if state.mz.avail_in == availIn and
       state.mz.avail_out == state.outBuf.len.cuint:
      break

  # Cache left-over for the next explode() call. Everything from the first
  # unconsumed byte onwards is kept, even if it is only a single byte.
  let consumed = state.mz.total_in.int
  state.inCount += state.mz.total_in.uint
  state.inCache =
    if consumed < inBuf.len:
      inBuf[consumed ..< inBuf.len]
    else:
      ""

  # Return code
  if rc != Z_OK:
    err(rc)
  else:
    ok(outData)

# ------------------------------------------------------------------------------
# Public
# ------------------------------------------------------------------------------

proc open*(state: var GUnzip; fileName: string):
                      Result[void,ZError] {.gcsafe, raises: [IOError].} =
  ## Open gzipped file with path `fileName` and prepare for inflating and
  ## extraction.
  ##
  ## Only a basic RFC 1952 header with an optional `FNAME` field is
  ## accepted, anything else trips an assertion. An `IOError` is raised if
  ## the file cannot be opened, read or positioned.

  # Clear descriptor, releasing the inflater of a session still in progress
  if state.gzOpenOK:
    if not state.outDoneOK:
      discard state.mz.inflateEnd()
    state.gzIn.close
  state.reset

  var
    strBuf = lineBufStrLen.newString
    start = 10                            # size of the fixed header part
    rc = state.mz.inflateInit2(Z_RAW_DEFLATE)
  doAssert rc == Z_OK

  state.gzIn = fileName.open(fmRead)
  state.gzOpenOK = true
  state.gzMax = state.gzIn.getFileSize
  state.gzCount = state.gzIn.readChars(toOpenArray(strBuf, 0, strBuf.len-1))

  # Parse GZIP header (RFC 1952)
  doAssert 18 < state.gzCount
  doAssert (strBuf[0].ord == 0x1f and     # magic number
            strBuf[1].ord == 0x8b and     # magic number
            strBuf[2].ord == 0x08)        # deflate
  doAssert (strBuf[3].ord and 0xf7) == 0  # unsupported flags
  if (strBuf[3].ord and 8) == 8:          # FNAME
    let endPos = strBuf.find(0.chr, start)
    # The terminating NUL must be part of the data actually read, otherwise
    # `start` would end up pointing to the wrong place
    doAssert start <= endPos and endPos < state.gzCount
    state.gzName = strBuf[start ..< endPos]
    start = endPos + 1

  # Cut off trailer (the last 8 bytes are not part of the deflate stream)
  state.gzMax -= 8

  # Rewind to the first byte of the deflate stream so that `nextChunk()`
  # reads all of the payload from the file. This way files which fit
  # completely into the header buffer are processed, too.
  state.gzIn.setFilePos(start.int64)
  state.gzCount = start.int64
  return ok()


proc close*(state: var GUnzip) =
  ## Close any open files and free resources. This is a no-op unless the
  ## descriptor is open.
  if state.gzOpenOK:
    # Release the inflater state unless this was done already when the end
    # of the compressed stream was reached
    if not state.outDoneOK:
      discard state.mz.inflateEnd()
    state.gzIn.close
    state.reset


proc nextChunk*(state: var GUnzip):
                Result[string,ZError] {.gcsafe, raises: [IOError].} =
  ## Fetch next unzipped data chunk, return an empty string if input
  ## is exhausted.
  ##
  ## If inflating fails, the descriptor is closed and the error is returned.
  ## An `IOError` is raised (after closing the descriptor) if the file turns
  ## out to be shorter than the size seen when it was opened.
  var strBuf = 4096.newString
  result = ok("")

  while state.gzCount < state.gzMax:
    var strLen = state.gzIn.readChars(toOpenArray(strBuf, 0, strBuf.len-1))
    if strLen <= 0:
      # Premature end of file: `gzCount` could never reach `gzMax`, so
      # bail out rather than looping without progress
      state.close
      raise newException(IOError, "gunzip: unexpected end of input file")

    # Do not feed the trailer to the inflater
    if state.gzMax < state.gzCount + strLen:
      strLen = (state.gzMax - state.gzCount).int
    state.gzCount += strLen

    result = state.explode(strBuf, 0, strLen)
    if result.isErr:
      state.close
      return
    # Keep reading until some unzipped data has been produced
    if result.value != "":
      return


proc nextChunkOk*(state: var GUnzip): bool =
  ## Tell whether compressed input is still pending, i.e. whether a call
  ## to `nextChunk()` might deliver another non-empty unzipped data chunk.
  result = state.gzMax > state.gzCount


proc nextLine*(state: var GUnzip):
             Result[string,ZError] {.gcsafe, raises: [IOError].} =
  ## Treat the file behind the `state` descriptor as gzipped text with
  ## lines separated by a newline character and deliver the next unzipped
  ## line.
  ##
  ## Once no further line can be served, the error `Z_STREAM_END` is
  ## returned. Errors from `nextChunk()` are passed on unchanged. Use
  ## `nextLineOk()` to find out beforehand whether another line is due.
  let lastInx = state.lnList.len - 1

  # Serve from the buffer. The final entry is held back as it might only
  # be the head of a line continued in the next chunk.
  if state.lnInx < lastInx:
    let line = state.lnList[state.lnInx]
    state.lnInx.inc
    return ok(line)

  # Buffer used up and no compressed input left
  if not state.nextChunkOk:
    return err(Z_STREAM_END)

  # Restart the buffer with the entry held back or, if there is none (as
  # on first use), with an empty line
  state.lnList =
    if state.lnInx == lastInx: @[state.lnList[state.lnInx]]
    else: @[""]

  # Pull chunks until there are at least two entries or the input runs dry.
  # The start of each chunk continues the entry carried over.
  while state.nextChunkOk and state.lnList.len < 2:
    let chunk = state.nextChunk
    if chunk.isErr:
      return chunk
    var pieces = chunk.value.split('\n')
    pieces[0] = state.lnList[0] & pieces[0]
    state.lnList = pieces

  result = ok(state.lnList[0])
  state.lnInx = 1


proc nextLineOk*(state: var GUnzip): bool =
  ## Tell whether `nextLine()` can be expected to deliver another line:
  ## either a buffered line other than the last entry is waiting or there
  ## is compressed input left to read.
  let buffered = state.lnInx + 1 < state.lnList.len
  result = buffered or state.nextChunkOk


iterator gunzipLines*(state: var GUnzip):
                            (int,string) {.gcsafe, raises: [IOError].} =
  ## Walk over the lines of the gzipped text file behind the `state`
  ## descriptor, yielding `(line-number,line-text)` pairs with the line
  ## numbers starting at 1. The walk ends when `nextLineOk()` reports no
  ## further line or when `nextLine()` returns an error.
  var lineNo = 0
  while state.nextLineOk:
    let fetched = state.nextLine
    if fetched.isOk:
      lineNo += 1
      yield (lineNo, fetched.value)
    else:
      break


iterator gunzipLines*(fileName: string):
                            (int,string) {.gcsafe, raises: [IOError].} =
  ## Convenience variant: open the gzipped text file `fileName`, yield its
  ## `(line-number,line-text)` pairs by means of the descriptor based
  ## `gunzipLines()` and close the file again when done.
  var state: GUnzip
  doAssert state.open(fileName).isOk
  try:
    for item in state.gunzipLines:
      yield item
  finally:
    # Runs however the iteration ends
    state.close

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------