From 7060c2ef7eac458493b011ebbbe26abc3b26af77 Mon Sep 17 00:00:00 2001 From: Dmitriy Ryajov Date: Mon, 21 Mar 2022 12:06:00 -0600 Subject: [PATCH] Initial implementation --- leopard.nim | 42 -- leopard/leopard.nim | 198 +++++++++ leopard/utils.nim | 4 + leopard/utils/allocs.nim | 82 ++++ leopard/utils/cpuinfo_x86.nim | 793 ++++++++++++++++++++++++++++++++++ leopard/wrapper.nim | 216 +++++---- tests/test_leopard.nim | 551 ----------------------- tests/testleopard.nim | 48 ++ 8 files changed, 1231 insertions(+), 703 deletions(-) create mode 100644 leopard/leopard.nim create mode 100644 leopard/utils.nim create mode 100644 leopard/utils/allocs.nim create mode 100644 leopard/utils/cpuinfo_x86.nim delete mode 100644 tests/test_leopard.nim create mode 100644 tests/testleopard.nim diff --git a/leopard.nim b/leopard.nim index 560bd9c..9a30d6e 100644 --- a/leopard.nim +++ b/leopard.nim @@ -82,48 +82,6 @@ func isValid*(code: ReedSolomonCode): bool = (code.data < MinSymbols) or (code.parity < MinSymbols) or (code.codeword > MaxTotalSymbols)) -when (NimMajor, NimMinor, NimPatch) < (1, 4, 0): - const - header = "" - - proc c_malloc(size: csize_t): pointer {.importc: "malloc", header: header.} - proc c_free(p: pointer) {.importc: "free", header: header.} - -proc SIMDSafeAllocate(size: int): pointer {.inline.} = - var - data = - when (NimMajor, NimMinor, NimPatch) < (1, 4, 0): - c_malloc(LEO_ALIGN_BYTES + size.uint) - else: - allocShared(LEO_ALIGN_BYTES + size.uint) - - doffset = cast[uint](data) mod LEO_ALIGN_BYTES - - data = offset(data, (LEO_ALIGN_BYTES + doffset).int) - - var - offsetPtr = cast[pointer](cast[uint](data) - 1) - - moveMem(offsetPtr, addr doffset, sizeof(doffset)) - data - -proc SIMDSafeFree(data: pointer) {.inline.} = - var - data = data - - if not data.isNil: - let - offset = cast[uint](data) - 1 - - if offset >= LEO_ALIGN_BYTES: return - - data = cast[pointer](cast[uint](data) - (LEO_ALIGN_BYTES - offset)) - - when (NimMajor, NimMinor, NimPatch) < (1, 4, 0): - c_free data - else: - deallocShared data - proc leoInit*() = if wrapper.leoInit() != 0: raise (ref LeopardDefect)(msg: "Leopard-RS failed to initialize") diff --git a/leopard/leopard.nim b/leopard/leopard.nim new file mode 100644 index 0000000..760bc7e --- /dev/null +++ b/leopard/leopard.nim @@ -0,0 +1,198 @@ +## Nim-Leopard +## Copyright (c) 2022 Status Research & Development GmbH +## Licensed under either of +## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE)) +## * MIT license ([LICENSE-MIT](LICENSE-MIT)) +## at your option. +## This file may not be copied, modified, or distributed except according to +## those terms. 
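+##
+## A minimal usage sketch of this module's high-level API. This is hedged:
+## the `init` constructors and the exact `decode` signature shown here are
+## assumptions inferred from the `Leo` fields and the `encode`/`decode`
+## bodies below, not quoted verbatim from this patch (`newSeqWith` comes
+## from `std/sequtils`):
+##
+##   var
+##     enc = LeoEncoder.init(bufSize = 64, buffers = 4, parity = 2).tryGet()
+##     dec = LeoDecoder.init(bufSize = 64, buffers = 4, parity = 2).tryGet()
+##     data = newSeqWith(4, newSeq[byte](64))      # K original buffers
+##     parity = newSeqWith(2, newSeq[byte](64))    # M recovery buffers
+##     recovered = newSeqWith(4, newSeq[byte](64))
+##
+##   enc.encode(data, parity).tryGet()             # fill `parity` from `data`
+##   data[1].setLen(0)                             # simulate a lost shard
+##   dec.decode(data, parity, recovered).tryGet()  # rebuild it into `recovered`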
+ +import pkg/upraises +push: {.upraises: [].} + +{.deadCodeElim: on.} + +import pkg/stew/results +import pkg/stew/byteutils + +import ./wrapper +import ./utils + +export wrapper, results + +const + BuffMultiples* = 64 + +type + LeoBufferPtr = ptr UncheckedArray[byte] + Leo = object of RootObj + bufSize*: int # size of the buffer in multiples of 64 + buffers*: int # total number of data buffers (K) + parity*: int # total number of parity buffers (M) + dataBufferPtr: seq[LeoBufferPtr] # buffer where data is copied before encoding + parityWorkCount: int # number of parity work buffers + parityBufferPtr: seq[LeoBufferPtr] # buffer where parity is copied before encoding + + LeoEncoder* = object of Leo + LeoDecoder* = object of Leo + decodeWorkCount: int # number of decoding work buffers + decodeBufferPtr: seq[LeoBufferPtr] # work buffer used for decoding + +proc encode*( + self: var LeoEncoder, + data, + parity: var openArray[seq[byte]]): Result[void, cstring] = + + # zero encode work buffer to avoid corrupting with previous run + for i in 0.. 0: + dataPtr[i] = self.dataBufferPtr[i] + copyMem(self.dataBufferPtr[i], addr data[i][0], self.bufSize) + else: + dataPtr[i] = nil + + # copy parity into aligned buffer + for i in 0.. 0: + parityPtr[i] = self.parityBufferPtr[i] + copyMem(self.parityBufferPtr[i], addr parity[i][0], self.bufSize) + else: + parityPtr[i] = nil + + let + res = leo_decode( + self.bufSize.cuint, + self.buffers.cuint, + self.parity.cuint, + self.decodeWorkCount.cuint, + cast[ptr pointer](addr dataPtr[0]), + cast[ptr pointer](addr self.parityBufferPtr[0]), + cast[ptr pointer](addr self.decodeBufferPtr[0])) + + if ord(res) != ord(LeopardSuccess): + return err(leoResultString(res.LeopardResult)) + + for i in 0..".} + # Beware of the arg order! 
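+  # (`_aligned_malloc`'s C prototype is `void* _aligned_malloc(size_t size, size_t alignment)`,
+  # i.e. size first, the reverse of posix_memalign/aligned_alloc, which is the
+  # mix-up the warning above refers to.)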
+ + proc alignedFree*[T](p: ptr T) + {.sideeffect, importc: "_aligned_free", header: "".} +elif defined(osx): + proc posix_memalign(mem: var pointer, alignment, size: csize_t) + {.sideeffect, importc, header:"".} + + proc alignedAlloc(alignment, size: csize_t): pointer {.inline.} = + posix_memalign(result, alignment, size) + + proc alignedFree*[T](p: ptr T) {.inline.} = + c_free(p) +elif defined(unix): + proc alignedAlloc(alignment, size: csize_t): pointer + {.sideeffect, importc: "aligned_alloc", header: "".} + + proc alignedFree*[T](p: ptr T) {.inline.} = + {.sideeffect, importc: "free_aligned", header: "".} + c_free(p) +else: + {.warning: "Falling back to manual pointer alignment, might end-up using more memory!".} + proc alignedAlloc*(size, align: Positive): pointer {.inline.} = + var + data = c_malloc(align + size) + + if not isNil(data): + var + doffset = cast[uint](data) mod align + + data = data.offset((align + doffset).int) + var + offsetPtr = cast[pointer](cast[uint](data) - 1'u) + moveMem(offsetPtr, addr doffset, sizeof(doffset)) + + return data + + proc freeAligned*[T](p: ptr T, align: Positive) {.inline.} = + var data = p + if not isNil(data): + let offset = cast[uint](data) - 1'u + if offset >= align: + return + + data = cast[pointer](cast[uint](data) - (align - offset)) + c_free(data) + +proc leoAlloc*(size: Positive): pointer {.inline.} = + alignedAlloc(LeoAlignBytes, size.csize_t) + +proc leoFree*[T](p: ptr T) = + alignedFree(p) diff --git a/leopard/utils/cpuinfo_x86.nim b/leopard/utils/cpuinfo_x86.nim new file mode 100644 index 0000000..ce31069 --- /dev/null +++ b/leopard/utils/cpuinfo_x86.nim @@ -0,0 +1,793 @@ +## Nim-Leopard +## Copyright (c) 2022 Status Research & Development GmbH +## Licensed under either of +## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE)) +## * MIT license ([LICENSE-MIT](LICENSE-MIT)) +## at your option. +## This file may not be copied, modified, or distributed except according to +## those terms. 
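+##
+## Runtime x86/amd64 CPU feature detection via the `CPUID` instruction,
+## vendored from the Nim PR credited below. Presumably the `has*()` procs are
+## what let the rest of this package pick SIMD-dependent parameters at
+## runtime, e.g. AVX2-sized (32-byte) versus SSE-sized (16-byte) buffer
+## alignment.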
+ +import pkg/upraises +push: {.upraises: [].} + +{.deadCodeElim: on.} + +# From awr1: https://github.com/nim-lang/Nim/pull/11816/files + +proc cpuidX86(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] {.used.}= + when defined(vcc): + # limited inline asm support in vcc, so intrinsics, here we go: + proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32) + {.cdecl, importc: "__cpuidex", header: "intrin.h".} + cpuidVcc(addr result.eax, eaxi, ecxi) + else: + var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32) + asm """ + cpuid + :"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`) + :"a"(`eaxi`), "c"(`ecxi`)""" + (eaxr, ebxr, ecxr, edxr) + +proc cpuNameX86(): string {.used.}= + var leaves {.global.} = cast[array[48, char]]([ + cpuidX86(eaxi = 0x80000002'i32, ecxi = 0), + cpuidX86(eaxi = 0x80000003'i32, ecxi = 0), + cpuidX86(eaxi = 0x80000004'i32, ecxi = 0)]) + result = $cast[cstring](addr leaves[0]) + +type + X86Feature {.pure.} = enum + HypervisorPresence, Hyperthreading, NoSMT, IntelVtx, Amdv, X87fpu, Mmx, + MmxExt, F3DNow, F3DNowEnhanced, Prefetch, Sse, Sse2, Sse3, Ssse3, Sse4a, + Sse41, Sse42, Avx, Avx2, Avx512f, Avx512dq, Avx512ifma, Avx512pf, + Avx512er, Avx512cd, Avx512bw, Avx512vl, Avx512vbmi, Avx512vbmi2, + Avx512vpopcntdq, Avx512vnni, Avx512vnniw4, Avx512fmaps4, Avx512bitalg, + Avx512bfloat16, Avx512vp2intersect, Rdrand, Rdseed, MovBigEndian, Popcnt, + Fma3, Fma4, Xop, Cas8B, Cas16B, Abm, Bmi1, Bmi2, TsxHle, TsxRtm, Adx, Sgx, + Gfni, Aes, Vaes, Vpclmulqdq, Pclmulqdq, NxBit, Float16c, Sha, Clflush, + ClflushOpt, Clwb, PrefetchWT1, Mpx + +let + leaf1 = cpuidX86(eaxi = 1, ecxi = 0) + leaf7 = cpuidX86(eaxi = 7, ecxi = 0) + leaf8 = cpuidX86(eaxi = 0x80000001'i32, ecxi = 0) + +# The reason why we don't just evaluate these directly in the `let` variable +# list is so that we can internally organize features by their input (leaf) +# and output registers. 
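+#
+# For example, AVX2 support is reported in bit 5 of EBX for leaf 7 (sub-leaf 0),
+# so the `Avx2` branch below reduces to `(leaf7.ebx and (1 shl 5)) != 0`.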
+proc testX86Feature(feature: X86Feature): bool = + proc test(input, bit: int): bool = + ((1 shl bit) and input) != 0 + + # see: https://en.wikipedia.org/wiki/CPUID#Calling_CPUID + # see: IntelĀ® Architecture Instruction Set Extensions and Future Features + # Programming Reference + result = case feature + # leaf 1, edx + of X87fpu: + leaf1.edx.test(0) + of Clflush: + leaf1.edx.test(19) + of Mmx: + leaf1.edx.test(23) + of Sse: + leaf1.edx.test(25) + of Sse2: + leaf1.edx.test(26) + of Hyperthreading: + leaf1.edx.test(28) + + # leaf 1, ecx + of Sse3: + leaf1.ecx.test(0) + of Pclmulqdq: + leaf1.ecx.test(1) + of IntelVtx: + leaf1.ecx.test(5) + of Ssse3: + leaf1.ecx.test(9) + of Fma3: + leaf1.ecx.test(12) + of Cas16B: + leaf1.ecx.test(13) + of Sse41: + leaf1.ecx.test(19) + of Sse42: + leaf1.ecx.test(20) + of MovBigEndian: + leaf1.ecx.test(22) + of Popcnt: + leaf1.ecx.test(23) + of Aes: + leaf1.ecx.test(25) + of Avx: + leaf1.ecx.test(28) + of Float16c: + leaf1.ecx.test(29) + of Rdrand: + leaf1.ecx.test(30) + of HypervisorPresence: + leaf1.ecx.test(31) + + # leaf 7, ecx + of PrefetchWT1: + leaf7.ecx.test(0) + of Avx512vbmi: + leaf7.ecx.test(1) + of Avx512vbmi2: + leaf7.ecx.test(6) + of Gfni: + leaf7.ecx.test(8) + of Vaes: + leaf7.ecx.test(9) + of Vpclmulqdq: + leaf7.ecx.test(10) + of Avx512vnni: + leaf7.ecx.test(11) + of Avx512bitalg: + leaf7.ecx.test(12) + of Avx512vpopcntdq: + leaf7.ecx.test(14) + + # lead 7, eax + of Avx512bfloat16: + leaf7.eax.test(5) + + # leaf 7, ebx + of Sgx: + leaf7.ebx.test(2) + of Bmi1: + leaf7.ebx.test(3) + of TsxHle: + leaf7.ebx.test(4) + of Avx2: + leaf7.ebx.test(5) + of Bmi2: + leaf7.ebx.test(8) + of TsxRtm: + leaf7.ebx.test(11) + of Mpx: + leaf7.ebx.test(14) + of Avx512f: + leaf7.ebx.test(16) + of Avx512dq: + leaf7.ebx.test(17) + of Rdseed: + leaf7.ebx.test(18) + of Adx: + leaf7.ebx.test(19) + of Avx512ifma: + leaf7.ebx.test(21) + of ClflushOpt: + leaf7.ebx.test(23) + of Clwb: + leaf7.ebx.test(24) + of Avx512pf: + leaf7.ebx.test(26) + of Avx512er: + leaf7.ebx.test(27) + of Avx512cd: + leaf7.ebx.test(28) + of Sha: + leaf7.ebx.test(29) + of Avx512bw: + leaf7.ebx.test(30) + of Avx512vl: + leaf7.ebx.test(31) + + # leaf 7, edx + of Avx512vnniw4: + leaf7.edx.test(2) + of Avx512fmaps4: + leaf7.edx.test(3) + of Avx512vp2intersect: + leaf7.edx.test(8) + + # leaf 8, edx + of NoSMT: + leaf8.edx.test(1) + of Cas8B: + leaf8.edx.test(8) + of NxBit: + leaf8.edx.test(20) + of MmxExt: + leaf8.edx.test(22) + of F3DNowEnhanced: + leaf8.edx.test(30) + of F3DNow: + leaf8.edx.test(31) + + # leaf 8, ecx + of Amdv: + leaf8.ecx.test(2) + of Abm: + leaf8.ecx.test(5) + of Sse4a: + leaf8.ecx.test(6) + of Prefetch: + leaf8.ecx.test(8) + of Xop: + leaf8.ecx.test(11) + of Fma4: + leaf8.ecx.test(16) + +let + isHypervisorPresentImpl = testX86Feature(HypervisorPresence) + hasSimultaneousMultithreadingImpl = + testX86Feature(Hyperthreading) or not testX86Feature(NoSMT) + hasIntelVtxImpl = testX86Feature(IntelVtx) + hasAmdvImpl = testX86Feature(Amdv) + hasX87fpuImpl = testX86Feature(X87fpu) + hasMmxImpl = testX86Feature(Mmx) + hasMmxExtImpl = testX86Feature(MmxExt) + has3DNowImpl = testX86Feature(F3DNow) + has3DNowEnhancedImpl = testX86Feature(F3DNowEnhanced) + hasPrefetchImpl = testX86Feature(Prefetch) or testX86Feature(F3DNow) + hasSseImpl = testX86Feature(Sse) + hasSse2Impl = testX86Feature(Sse2) + hasSse3Impl = testX86Feature(Sse3) + hasSsse3Impl = testX86Feature(Ssse3) + hasSse4aImpl = testX86Feature(Sse4a) + hasSse41Impl = testX86Feature(Sse41) + hasSse42Impl = testX86Feature(Sse42) + 
hasAvxImpl = testX86Feature(Avx) + hasAvx2Impl = testX86Feature(Avx2) + hasAvx512fImpl = testX86Feature(Avx512f) + hasAvx512dqImpl = testX86Feature(Avx512dq) + hasAvx512ifmaImpl = testX86Feature(Avx512ifma) + hasAvx512pfImpl = testX86Feature(Avx512pf) + hasAvx512erImpl = testX86Feature(Avx512er) + hasAvx512cdImpl = testX86Feature(Avx512cd) + hasAvx512bwImpl = testX86Feature(Avx512bw) + hasAvx512vlImpl = testX86Feature(Avx512vl) + hasAvx512vbmiImpl = testX86Feature(Avx512vbmi) + hasAvx512vbmi2Impl = testX86Feature(Avx512vbmi2) + hasAvx512vpopcntdqImpl = testX86Feature(Avx512vpopcntdq) + hasAvx512vnniImpl = testX86Feature(Avx512vnni) + hasAvx512vnniw4Impl = testX86Feature(Avx512vnniw4) + hasAvx512fmaps4Impl = testX86Feature(Avx512fmaps4) + hasAvx512bitalgImpl = testX86Feature(Avx512bitalg) + hasAvx512bfloat16Impl = testX86Feature(Avx512bfloat16) + hasAvx512vp2intersectImpl = testX86Feature(Avx512vp2intersect) + hasRdrandImpl = testX86Feature(Rdrand) + hasRdseedImpl = testX86Feature(Rdseed) + hasMovBigEndianImpl = testX86Feature(MovBigEndian) + hasPopcntImpl = testX86Feature(Popcnt) + hasFma3Impl = testX86Feature(Fma3) + hasFma4Impl = testX86Feature(Fma4) + hasXopImpl = testX86Feature(Xop) + hasCas8BImpl = testX86Feature(Cas8B) + hasCas16BImpl = testX86Feature(Cas16B) + hasAbmImpl = testX86Feature(Abm) + hasBmi1Impl = testX86Feature(Bmi1) + hasBmi2Impl = testX86Feature(Bmi2) + hasTsxHleImpl = testX86Feature(TsxHle) + hasTsxRtmImpl = testX86Feature(TsxRtm) + hasAdxImpl = testX86Feature(Adx) + hasSgxImpl = testX86Feature(Sgx) + hasGfniImpl = testX86Feature(Gfni) + hasAesImpl = testX86Feature(Aes) + hasVaesImpl = testX86Feature(Vaes) + hasVpclmulqdqImpl = testX86Feature(Vpclmulqdq) + hasPclmulqdqImpl = testX86Feature(Pclmulqdq) + hasNxBitImpl = testX86Feature(NxBit) + hasFloat16cImpl = testX86Feature(Float16c) + hasShaImpl = testX86Feature(Sha) + hasClflushImpl = testX86Feature(Clflush) + hasClflushOptImpl = testX86Feature(ClflushOpt) + hasClwbImpl = testX86Feature(Clwb) + hasPrefetchWT1Impl = testX86Feature(PrefetchWT1) + hasMpxImpl = testX86Feature(Mpx) + +# NOTE: We use procedures here (layered over the variables) to keep the API +# consistent and usable against possible future heterogeneous systems with ISA +# differences between cores (a possibility that has historical precedents, for +# instance, the PPU/SPU relationship found on the IBM Cell). If future systems +# do end up having disparate ISA features across multiple cores, expect there to +# be a "cpuCore" argument added to the feature procs. + +proc isHypervisorPresent*(): bool {.inline.} = + return isHypervisorPresentImpl + ## **(x86 Only)** + ## + ## Reports `true` if this application is running inside of a virtual machine + ## (this is by no means foolproof). + +proc hasSimultaneousMultithreading*(): bool {.inline.} = + return hasSimultaneousMultithreadingImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware is utilizing simultaneous multithreading + ## (branded as *"hyperthreads"* on Intel processors). + +proc hasIntelVtx*(): bool {.inline.} = + return hasIntelVtxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the Intel virtualization extensions (VT-x) are available. + +proc hasAmdv*(): bool {.inline.} = + return hasAmdvImpl + ## **(x86 Only)** + ## + ## Reports `true` if the AMD virtualization extensions (AMD-V) are available.
+ +proc hasX87fpu*(): bool {.inline.} = + return hasX87fpuImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use x87 floating-point instructions + ## (includes support for single, double, and 80-bit percision floats as per + ## IEEE 754-1985). + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. It should be noted that support of these + ## instructions is deprecated on 64-bit versions of Windows - see MSDN_. + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + +proc hasMmx*(): bool {.inline.} = + return hasMmxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use MMX SIMD instructions. + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. It should be noted that support of these + ## instructions is deprecated on 64-bit versions of Windows (see MSDN_ for + ## more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + +proc hasMmxExt*(): bool {.inline.} = + return hasMmxExtImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use "Extended MMX" SIMD instructions. + ## + ## It should be noted that support of these instructions is deprecated on + ## 64-bit versions of Windows (see MSDN_ for more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + +proc has3DNow*(): bool {.inline.} = + return has3DNowImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use 3DNow! SIMD instructions. + ## + ## It should be noted that support of these instructions is deprecated on + ## 64-bit versions of Windows (see MSDN_ for more info), and that the 3DNow! + ## instructions (with an exception made for the prefetch instructions, see the + ## `hasPrefetch` procedure) have been phased out of AMD processors since 2010 + ## (see `AMD Developer Central`_ for more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + ## .. _`AMD Developer Central`: https://web.archive.org/web/20131109151245/http://developer.amd.com/community/blog/2010/08/18/3dnow-deprecated/ + +proc has3DNowEnhanced*(): bool {.inline.} = + return has3DNowEnhancedImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use "Enhanced 3DNow!" SIMD instructions. + ## + ## It should be noted that support of these instructions is deprecated on + ## 64-bit versions of Windows (see MSDN_ for more info), and that the 3DNow! + ## instructions (with an exception made for the prefetch instructions, see the + ## `hasPrefetch` procedure) have been phased out of AMD processors since 2010 + ## (see `AMD Developer Central`_ for more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + ## .. 
_`AMD Developer Central`: https://web.archive.org/web/20131109151245/http://developer.amd.com/community/blog/2010/08/18/3dnow-deprecated/ + +proc hasPrefetch*(): bool {.inline.} = + return hasPrefetchImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the `PREFETCH` and `PREFETCHW` + ## instructions. These instructions originally included as part of 3DNow!, but + ## potentially indepdendent from the rest of it due to changes in contemporary + ## AMD processors (see above). + +proc hasSse*(): bool {.inline.} = + return hasSseImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions) + ## 1.0 instructions, which introduced 128-bit SIMD on x86 machines. + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. + +proc hasSse2*(): bool {.inline.} = + return hasSse2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions) + ## 2.0 instructions. + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. + +proc hasSse3*(): bool {.inline.} = + return hasSse3Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use SSE (Streaming SIMD Extensions) 3.0 + ## instructions. + +proc hasSsse3*(): bool {.inline.} = + return hasSsse3Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 3.0 instructions. + +proc hasSse4a*(): bool {.inline.} = + return hasSse4aImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 4a instructions. + +proc hasSse41*(): bool {.inline.} = + return hasSse41Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 4.1 instructions. + +proc hasSse42*(): bool {.inline.} = + return hasSse42Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 4.2 instructions. + +proc hasAvx*(): bool {.inline.} = + return hasAvxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 1.0 instructions, which introduced 256-bit SIMD on x86 machines along with + ## addded reencoded versions of prior 128-bit SSE instructions into the more + ## code-dense and non-backward compatible VEX (Vector Extensions) format. + +proc hasAvx2*(): bool {.inline.} = + return hasAvx2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) 2.0 + ## instructions. + +proc hasAvx512f*(): bool {.inline.} = + return hasAvx512fImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit F (Foundation) instructions. + +proc hasAvx512dq*(): bool {.inline.} = + return hasAvx512dqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit DQ (Doubleword + Quadword) instructions. + +proc hasAvx512ifma*(): bool {.inline.} = + return hasAvx512ifmaImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit IFMA (Integer Fused Multiply Accumulation) instructions. 
+ +proc hasAvx512pf*(): bool {.inline.} = + return hasAvx512pfImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit PF (Prefetch) instructions. + +proc hasAvx512er*(): bool {.inline.} = + return hasAvx512erImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit ER (Exponential and Reciprocal) instructions. + +proc hasAvx512cd*(): bool {.inline.} = + return hasAvx512cdImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit CD (Conflict Detection) instructions. + +proc hasAvx512bw*(): bool {.inline.} = + return hasAvx512bwImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit BW (Byte and Word) instructions. + +proc hasAvx512vl*(): bool {.inline.} = + return hasAvx512vlImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VL (Vector Length) instructions. + +proc hasAvx512vbmi*(): bool {.inline.} = + return hasAvx512vbmiImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VBMI (Vector Byte Manipulation) 1.0 instructions. + +proc hasAvx512vbmi2*(): bool {.inline.} = + return hasAvx512vbmi2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VBMI (Vector Byte Manipulation) 2.0 instructions. + +proc hasAvx512vpopcntdq*(): bool {.inline.} = + return hasAvx512vpopcntdqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the AVX (Advanced Vector Extensions) + ## 512-bit `VPOPCNTDQ` (population count, i.e. determine number of flipped + ## bits) instruction. + +proc hasAvx512vnni*(): bool {.inline.} = + return hasAvx512vnniImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VNNI (Vector Neural Network) instructions. + +proc hasAvx512vnniw4*(): bool {.inline.} = + return hasAvx512vnniw4Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit 4VNNIW (Vector Neural Network Word Variable Percision) + ## instructions. + +proc hasAvx512fmaps4*(): bool {.inline.} = + return hasAvx512fmaps4Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit 4FMAPS (Fused-Multiply-Accumulation Single-percision) instructions. + +proc hasAvx512bitalg*(): bool {.inline.} = + return hasAvx512bitalgImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit BITALG (Bit Algorithms) instructions. + +proc hasAvx512bfloat16*(): bool {.inline.} = + return hasAvx512bfloat16Impl + ## **(x86 Only)** + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit BFLOAT16 (8-bit exponent, 7-bit mantissa) instructions used by + ## Intel DL (Deep Learning) Boost. + +proc hasAvx512vp2intersect*(): bool {.inline.} = + return hasAvx512vp2intersectImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VP2INTERSECT (Compute Intersections between Dualwords + Quadwords) + ## instructions. 
+ +proc hasRdrand*(): bool {.inline.} = + return hasRdrandImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `RDRAND` instruction, + ## i.e. Intel on-CPU hardware random number generation. + +proc hasRdseed*(): bool {.inline.} = + return hasRdseedImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `RDSEED` instruction, + ## i.e. Intel on-CPU hardware random number generation (used for seeding other + ## PRNGs). + +proc hasMovBigEndian*(): bool {.inline.} = + return hasMovBigEndianImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `MOVBE` instruction for + ## endianness/byte-order switching. + +proc hasPopcnt*(): bool {.inline.} = + return hasPopcntImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `POPCNT` (population + ## count, i.e. determine number of flipped bits) instruction. + +proc hasFma3*(): bool {.inline.} = + return hasFma3Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the FMA3 (Fused Multiply + ## Accumulation 3-operand) SIMD instructions. + +proc hasFma4*(): bool {.inline.} = + return hasFma4Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the FMA4 (Fused Multiply + ## Accumulation 4-operand) SIMD instructions. + +proc hasXop*(): bool {.inline.} = + return hasXopImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the XOP (eXtended + ## Operations) SIMD instructions. These instructions are exclusive to the + ## Bulldozer AMD microarchitecture family (i.e. Bulldozer, Piledriver, + ## Steamroller, and Excavator) and were phased out with the release of the Zen + ## design. + +proc hasCas8B*(): bool {.inline.} = + return hasCas8BImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the (`LOCK`-able) + ## `CMPXCHG8B` 64-bit compare-and-swap instruction. + +proc hasCas16B*(): bool {.inline.} = + return hasCas16BImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the (`LOCK`-able) + ## `CMPXCHG16B` 128-bit compare-and-swap instruction. + +proc hasAbm*(): bool {.inline.} = + return hasAbmImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for ABM (Advanced Bit + ## Manipulation) insturctions (i.e. `POPCNT` and `LZCNT` for counting leading + ## zeroes). + +proc hasBmi1*(): bool {.inline.} = + return hasBmi1Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for BMI (Bit Manipulation) 1.0 + ## instructions. + +proc hasBmi2*(): bool {.inline.} = + return hasBmi2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for BMI (Bit Manipulation) 2.0 + ## instructions. + +proc hasTsxHle*(): bool {.inline.} = + return hasTsxHleImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for HLE (Hardware Lock Elision) + ## as part of Intel's TSX (Transactional Synchronization Extensions). + +proc hasTsxRtm*(): bool {.inline.} = + return hasTsxRtmImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for RTM (Restricted + ## Transactional Memory) as part of Intel's TSX (Transactional Synchronization + ## Extensions). + +proc hasAdx*(): bool {.inline.} = + return hasAdxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for ADX (Multi-percision + ## Add-Carry Extensions) insructions. 
+ +proc hasSgx*(): bool {.inline.} = + return hasSgxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for SGX (Software Guard + ## eXtensions) memory encryption technology. + +proc hasGfni*(): bool {.inline.} = + return hasGfniImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for GFNI (Galois Field Affine + ## Transformation) instructions. + +proc hasAes*(): bool {.inline.} = + return hasAesImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for AESNI (Advanced Encryption + ## Standard) instructions. + +proc hasVaes*(): bool {.inline.} = + return hasVaesImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for VAES (Vectorized Advanced + ## Encryption Standard) instructions. + +proc hasVpclmulqdq*(): bool {.inline.} = + return hasVpclmulqdqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for `VCLMULQDQ` (512 and 256-bit + ## Carryless Multiplication) instructions. + +proc hasPclmulqdq*(): bool {.inline.} = + return hasPclmulqdqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for `PCLMULQDQ` (128-bit + ## Carryless Multiplication) instructions. + +proc hasNxBit*(): bool {.inline.} = + return hasNxBitImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for NX-bit (No-eXecute) + ## technology for marking pages of memory as non-executable. + +proc hasFloat16c*(): bool {.inline.} = + return hasFloat16cImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for F16C instructions, used for + ## converting 16-bit "half-percision" floating-point values to and from + ## single-percision floating-point values. + +proc hasSha*(): bool {.inline.} = + return hasShaImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for SHA (Secure Hash Algorithm) + ## instructions. + +proc hasClflush*(): bool {.inline.} = + return hasClflushImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `CLFLUSH` (Cache-line + ## Flush) instruction. + +proc hasClflushOpt*(): bool {.inline.} = + return hasClflushOptImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `CLFLUSHOPT` (Cache-line + ## Flush Optimized) instruction. + +proc hasClwb*(): bool {.inline.} = + return hasClwbImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `CLWB` (Cache-line Write + ## Back) instruction. + +proc hasPrefetchWT1*(): bool {.inline.} = + return hasPrefetchWT1Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `PREFECTHWT1` + ## instruction. + +proc hasMpx*(): bool {.inline.} = + return hasMpxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for MPX (Memory Protection + ## eXtensions). 
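+
+# A hedged usage sketch (not part of the detection API): the likely consumer
+# of these procs in this package is SIMD buffer alignment, since Leopard-RS
+# wants 32-byte aligned buffers when AVX2 is available and 16-byte alignment
+# otherwise, so a caller might do something like the following (the name
+# `alignBytes` is only for illustration):
+#
+#   import ./utils/cpuinfo_x86
+#
+#   let alignBytes = if hasAvx2(): 32 else: 16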
diff --git a/leopard/wrapper.nim b/leopard/wrapper.nim index ea554ac..699a0a9 100644 --- a/leopard/wrapper.nim +++ b/leopard/wrapper.nim @@ -57,10 +57,9 @@ ## Conference on File and Storage Technologies, San Jose, 2013 -import upraises +import pkg/upraises push: {.upraises: [].} - ## ----------------------------------------------------------------------------- ## Build configuration @@ -150,144 +149,141 @@ static: {.pragma: leo, cdecl, header: LeopardHeader.} - -## ----------------------------------------------------------------------------- -## Library version - -var LEO_VERSION* {.header: LeopardHeader, importc.}: int - - -## ----------------------------------------------------------------------------- -## Platform/Architecture - -# maybe should detect AVX2 and set to 32 if detected, 16 otherwise: -# https://github.com/catid/leopard/blob/master/LeopardCommon.h#L247-L253 -# https://github.com/mratsim/Arraymancer/blob/master/src/arraymancer/laser/cpuinfo_x86.nim#L220 -const LEO_ALIGN_BYTES* = 16 - - -## ----------------------------------------------------------------------------- -## Initialization API - -## leoInit() -## -## Perform static initialization for the library, verifying that the platform -## is supported. -## -## Returns 0 on success and other values on failure. - proc leoInit*(): cint {.leo, importcpp: "leo_init".} +## ------------------------------------------------------------------------------ +## Shared Constants / Datatypes +## Results -## ----------------------------------------------------------------------------- -## Shared Constants / Datatypes - -## Results +# TODO: For some reason it's only possibly to use the enum with `ord` type LeopardResult* = enum - LeopardCallInitialize = -7.cint ## Call leoInit() first - LeopardPlatform = -6.cint ## Platform is unsupported - LeopardInvalidInput = -5.cint ## A function parameter was invalid - LeopardInvalidCounts = -4.cint ## Invalid counts provided - LeopardInvalidSize = -3.cint ## Buffer size must be multiple of 64 bytes - LeopardTooMuchData = -2.cint ## Buffer counts are too high - LeopardNeedMoreData = -1.cint ## Not enough recovery data received - LeopardSuccess = 0.cint ## Operation succeeded - -## Convert Leopard result to string -func leoResultString*(res: LeopardResult): cstring - {.leo, importc: "leo_result_string".} + LeopardCallInitialize = -7, ## Call leo_init() first + LeopardPlatform = -6, ## Platform is unsupported + LeopardInvalidInput = -5, ## A function parameter was invalid + LeopardInvalidCounts = -4, ## Invalid counts provided + LeopardInvalidSize = -3, ## Buffer size must be a multiple of 64 bytes + LeopardTooMuchData = -2, ## Buffer counts are too high + LeopardNeedMoreData = -1, ## Not enough recovery data received + LeopardSuccess = 0 ## Operation succeeded -## ----------------------------------------------------------------------------- -## Encoder API +## Convert Leopard result to string -## leoEncodeWorkCount() +proc leoResultString*(result: LeopardResult): cstring {.leo, importc: "leo_result_string".} +## ------------------------------------------------------------------------------ +## Encoder API ## -## Calculate the number of work data buffers to provide to leoEncode(). +## leo_encode_work_count() ## -## The sum of originalCount + recoveryCount must not exceed 65536. +## Calculate the number of work_data buffers to provide to leo_encode(). +## +## The sum of original_count + recovery_count must not exceed 65536. +## +## Returns the work_count value to pass into leo_encode(). 
+## Returns 0 on invalid input. ## -## Returns the workCount value to pass into leoEncode(). -## Returns 0 on invalid input. -func leoEncodeWorkCount*(originalCount, recoveryCount: cuint): cuint +proc leoEncodeWorkCount*(originalCount: cuint; recoveryCount: cuint): cuint {.leo, importc: "leo_encode_work_count".} - -## leoEncode() ## -## Generate recovery data. +## leo_encode() ## -## bufferBytes: Number of bytes in each data buffer. -## originalCount: Number of original data buffers provided. -## recoveryCount: Number of desired recovery data buffers. -## workCount: Number of work data buffers, from leoEncodeWorkCount(). -## originalData: Array of pointers to original data buffers. -## workData: Array of pointers to work data buffers. +## Generate recovery data. ## -## The sum of originalCount + recoveryCount must not exceed 65536. -## The recoveryCount <= originalCount. +## original_count: Number of original_data[] buffers provided. +## recovery_count: Number of desired recovery data buffers. +## buffer_bytes: Number of bytes in each data buffer. +## original_data: Array of pointers to original data buffers. +## work_count: Number of work_data[] buffers, from leo_encode_work_count(). +## work_data: Array of pointers to work data buffers. ## -## The value of bufferBytes must be a multiple of 64. -## Each buffer should have the same number of bytes. -## Even the last piece must be rounded up to the block size. +## The sum of original_count + recovery_count must not exceed 65536. +## The recovery_count <= original_count. +## +## The buffer_bytes must be a multiple of 64. +## Each buffer should have the same number of bytes. +## Even the last piece must be rounded up to the block size. +## +## Let buffer_bytes = The number of bytes in each buffer: +## +## original_count = static_cast( +## ((uint64_t)total_bytes + buffer_bytes - 1) / buffer_bytes); +## +## Or if the number of pieces is known: +## +## buffer_bytes = static_cast( +## ((uint64_t)total_bytes + original_count - 1) / original_count); +## +## Returns Leopard_Success on success. +## The first set of recovery_count buffers in work_data will be the result. +## Returns other values on errors. ## -## Returns LeopardSuccess on success. -## The first set of recoveryCount buffers in workData will be the result. -## Returns other values on errors. 
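+##
+## In Nim terms (a hedged restatement of the C expressions above, using
+## ceiling division):
+##
+##   originalCount = cuint((totalBytes + bufferBytes - 1) div bufferBytes)
+##
+## or, when the number of pieces is fixed instead:
+##
+##   bufferBytes = (totalBytes + originalCount.uint64 - 1) div originalCount.uint64
+##   # ...then rounded up to the next multiple of 64
+##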
proc leoEncode*( - bufferBytes: uint64, ## Number of bytes in each data buffer - originalCount: cuint, ## Number of originalData[] buffer pointers - recoveryCount: cuint, ## Number of recovery data buffer pointers - ## (readable post-call from start of workData[]) - workCount: cuint, ## Number of workData[] buffer pointers - originalData: pointer, ## Array of pointers to original data buffers - workData: pointer, ## Array of pointers to work data buffers -): LeopardResult {.leo, importc: "leo_encode".} + bufferBytes: uint64; + originalCount: cuint; + recoveryCount: cuint; + workCount: cuint; + originalData: ptr pointer; + workData: ptr pointer): LeopardResult {.leo, importc: "leo_encode".} + ## Number of bytes in each data buffer + ## Number of original_data[] buffer pointers + ## Number of recovery_data[] buffer pointers + ## Number of work_data[] buffer pointers, from leo_encode_work_count() + ## Array of pointers to original data buffers + ## - -## ----------------------------------------------------------------------------- -## Decoder API - -## leoDecodeWorkCount() +## Array of work buffers +## ------------------------------------------------------------------------------ +## Decoder API ## -## Calculate the number of work data buffers to provide to leoDecode(). +## leo_decode_work_count() ## -## The sum of originalCount + recoveryCount must not exceed 65536. +## Calculate the number of work_data buffers to provide to leo_decode(). +## +## The sum of original_count + recovery_count must not exceed 65536. +## +## Returns the work_count value to pass into leo_encode(). +## Returns 0 on invalid input. ## -## Returns the workCount value to pass into leoDecode(). -## Returns 0 on invalid input. -func leoDecodeWorkCount*(originalCount, recoveryCount: cuint): cuint +proc leoDecodeWorkCount*(originalCount: cuint; recoveryCount: cuint): cuint {.leo, importc: "leo_decode_work_count".} - -## leoDecode() ## -## Decode original data from recovery data. +## leo_decode() ## -## bufferBytes: Number of bytes in each data buffer. -## originalCount: Number of original data buffers provided. -## recoveryCount: Number of recovery data buffers provided. -## workCount: Number of work data buffers, from leoDecodeWorkCount(). -## originalData: Array of pointers to original data buffers. -## recoveryData: Array of pointers to recovery data buffers. -## workData: Array of pointers to work data buffers. +## Decode original data from recovery data. ## -## Lost original/recovery data should be set to NULL. +## buffer_bytes: Number of bytes in each data buffer. +## original_count: Number of original_data[] buffers provided. +## original_data: Array of pointers to original data buffers. +## recovery_count: Number of recovery_data[] buffers provided. +## recovery_data: Array of pointers to recovery data buffers. +## work_count: Number of work_data[] buffers, from leo_decode_work_count(). +## work_data: Array of pointers to recovery data buffers. ## -## The sum of recoveryCount + the number of non-NULL original data must be at -## least originalCount in order to perform recovery. +## Lost original/recovery data should be set to NULL. +## +## The sum of recovery_count + the number of non-NULL original data must be at +## least original_count in order to perform recovery. +## +## Returns Leopard_Success on success. +## Returns other values on errors. ## -## Returns LeopardSuccess on success. -## Returns other values on errors. 
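+##
+## From Nim, "set to NULL" means storing `nil` at the matching index of the
+## pointer array passed as `originalData`, e.g. (a sketch mirroring what
+## `leopard/leopard.nim` does with its aligned copies):
+##
+##   dataPtr[i] =
+##     if data[i].len > 0: cast[pointer](addr data[i][0])
+##     else: nil                     # this shard was lost
+##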
proc leoDecode*( - bufferBytes: uint64, ## Number of bytes in each data buffer - originalCount: cuint, ## Number of originalData[] buffer pointers - recoveryCount: cuint, ## Number of recoveryData[] buffer pointers - workCount: cuint, ## Number of workData[] buffer pointers - originalData: pointer, ## Array of pointers to original data buffers - recoveryData: pointer, ## Array of pointers to recovery data buffers - workData: pointer, ## Array of pointers to work data buffers -): LeopardResult {.leo, importc: "leo_decode".} + bufferBytes: uint64; + originalCount: cuint; + recoveryCount: cuint; + workCount: cuint; + originalData: ptr pointer; + recoveryData: ptr pointer; + workData: ptr pointer): LeopardResult {.leo, importc: "leo_decode".} + ## Number of bytes in each data buffer + ## Number of original_data[] buffer pointers + ## Number of recovery_data[] buffer pointers + ## Number of buffer pointers in work_data[] + ## Array of original data buffers + ## Array of recovery data buffers +## Array of work data buffers diff --git a/tests/test_leopard.nim b/tests/test_leopard.nim deleted file mode 100644 index 50335eb..0000000 --- a/tests/test_leopard.nim +++ /dev/null @@ -1,551 +0,0 @@ -import std/random - -import pkg/leopard -import pkg/unittest2 - -randomize() - -proc genData(outerLen, innerLen: uint): Data = - var - data = newSeqOfCap[seq[byte]](outerLen) - - for i in 0..