High level wrapper (#3)

* initial implementation and tests * [wip] refactor checking of RS code validity * [wip] point GHA badge link to main branch instead of initial_impl * [wip] delete leftover echo at bottom of test_leopard.nim * [wip] add basic usage info to README * [wip] more basic info added to README re: requirements, installation, usage * [wip] add config.nims with --tlsEmulation:off to check if it helps with perf on Windows * [wip] use `object` instead of `object of CatchableError` for LeopardError workaround for edge case encountered in context of nimbus-build-system project * [wip] clarify wording in README re: stability * [wip] can use `object of CatchableError` for LeopardError with workaround * Initial implementation * make Leo a case object * initial test * cleanup * remove echo * use `func` where possible * comments, misc * make construction more convenient * add more tests * more tests * unused warnings * remove sideeffects pragma * fix importc pragma on unix * fix windows build * fix ci * better warning * adding more comprehensive tests * moar tests * add TODO for usage * Update leopard/leopard.nim Co-authored-by: Michael Bradley <michaelsbradleyjr@gmail.com> * Update leopard/wrapper.nim Co-authored-by: Michael Bradley <michaelsbradleyjr@gmail.com> * add tests to reuse same encoder/decoder * check that parity and data buffers are < 65536 * test that data+parity isn't > 65536 Co-authored-by: Michael Bradley, Jr <michaelsbradleyjr@gmail.com>
2026-02-16 19:53:12 +00:00 · 2022-03-28 18:42:45 -06:00 · 2022-03-28 18:42:45 -06:00 · 41cd86df5b
commit 41cd86df5b
parent 4d89e44e0d
17 changed files with 2133 additions and 5 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -0,0 +1,5 @@
+[*]
+indent_style = space
+insert_final_newline = true
+indent_size = 2
+trim_trailing_whitespace = true
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1 @@
+* text=auto eol=lf
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -0,0 +1,154 @@
+name: Tests
+
+on: [pull_request, push]
+
+jobs:
+  tests:
+    env:
+      NPROC: 2
+    strategy:
+      fail-fast: false
+      matrix:
+        cache_nonce: [ 1 ]
+        nim_version: [ 1.2.18, 1.4.8, 1.6.4 ]
+        platform:
+          - {
+            icon: 🐧,
+            label: Linux,
+            os: ubuntu,
+            shell: bash --noprofile --norc -eo pipefail
+          }
+          - {
+            icon: 🍎,
+            label: macOS,
+            os: macos,
+            shell: bash --noprofile --norc -eo pipefail
+          }
+          - {
+            icon: 🏁,
+            label: Windows,
+            os: windows,
+            shell: msys2
+          }
+    name: ${{ matrix.platform.icon }} ${{ matrix.platform.label }} - Nim v${{ matrix.nim_version }}
+    runs-on: ${{ matrix.platform.os }}-latest
+    defaults:
+      run:
+        shell: ${{ matrix.platform.shell }} {0}
+
+    steps:
+      # - name: Install tools and libraries via APT (Linux)
+      #   if: matrix.platform.os == 'ubuntu'
+      #   run: |
+      #     sudo apt update
+      #     sudo apt install -y \
+      #       ...
+
+      - name: Install tools and libraries via Homebrew (macOS)
+        if: matrix.platform.os == 'macos'
+        run: |
+          brew update
+          brew install \
+            findutils \
+            libomp
+
+      - name: Install tools and libraries via MSYS2 (Windows)
+        if: matrix.platform.os == 'windows'
+        uses: msys2/setup-msys2@v2
+        with:
+          msystem: UCRT64
+          install: >
+            base-devel
+            git
+            mingw-w64-ucrt-x86_64-cmake
+            mingw-w64-ucrt-x86_64-toolchain
+
+      - name: Checkout sources from GitHub
+        uses: actions/checkout@v2
+        with:
+          submodules: true
+
+      - name: Calculate cache member paths
+        id: calc-paths
+        run: |
+          if [[ ${{ matrix.platform.os }} = windows ]]; then
+            echo "::set-output name=bash_env::$(cygpath -m "${HOME}")/.bash_env"
+            echo "::set-output name=choosenim::$(cygpath -m "${USERPROFILE}")/.choosenim"
+            echo "::set-output name=nimble::$(cygpath -m "${HOME}")/.nimble"
+          else
+            echo "::set-output name=bash_env::${HOME}/.bash_env"
+            echo "::set-output name=choosenim::${HOME}/.choosenim"
+            echo "::set-output name=nimble::${HOME}/.nimble"
+          fi
+
+      - name: Restore choosenim and Nim tooling from cache
+        id: choosenim-nim-tooling-cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ${{ steps.calc-paths.outputs.bash_env }}
+            ${{ steps.calc-paths.outputs.choosenim }}
+            ${{ steps.calc-paths.outputs.nimble }}/bin
+          key: ${{ matrix.platform.os }}-nim_version:${{ matrix.nim_version }}-cache_nonce:${{ matrix.cache_nonce }}
+
+      - name: Install choosenim and Nim tooling
+        if: steps.choosenim-nim-tooling-cache.outputs.cache-hit != 'true'
+        run: |
+          mkdir -p "${HOME}/Downloads"
+          cd "${HOME}/Downloads"
+          curl https://nim-lang.org/choosenim/init.sh -sSf -O
+          chmod +x init.sh
+          if [[ ${{ matrix.platform.os }} = windows ]]; then
+            mkdir -p "$(cygpath "${USERPROFILE}")/.nimble/bin"
+          fi
+          CHOOSENIM_CHOOSE_VERSION=${{ matrix.nim_version }} ./init.sh -y
+          if [[ ${{ matrix.platform.os }} = windows ]]; then
+            mv "$(cygpath "${USERPROFILE}")/.nimble" "${HOME}/"
+            # intention is to rely only on libs provided by the OS and MSYS2 env
+            rm -rf "${HOME}/.nimble/bin/"*.dll
+            rm -rf "${HOME}/.nimble/bin/"*.pem
+          fi
+          echo 'export NIMBLE_DIR="${HOME}/.nimble"' >> "${HOME}/.bash_env"
+          echo 'export PATH="${NIMBLE_DIR}/bin:${PATH}"' >> "${HOME}/.bash_env"
+
+      - name: Install project dependencies
+        run: |
+          source "${HOME}/.bash_env"
+          cd "${NIMBLE_DIR}/bin"
+          # delete broken symlinks, which can arise because e.g. the cache
+          # restored a symlink that points to an executable within
+          # ../pkgs/foo-1.2.3/ but the project's .nimble file has been updated
+          # to install foo-#head; in the case of a broken symlink, nimble's
+          # auto-overwrite fails
+          if [[ ${{ matrix.platform.os }} = macos ]]; then
+            gfind . -xtype l -delete
+          else
+            find . -xtype l -delete
+          fi
+          cd -
+          nimble --accept install
+
+      - name: Build and run tests
+        run: |
+          source "${HOME}/.bash_env"
+          if [[ ${{ matrix.platform.os }} = windows ]]; then
+            touch tests/testleopard.exe
+          else
+            touch tests/testleopard
+          fi
+          if [[ ${{ matrix.platform.os }} = macos ]]; then
+            export PATH="$(brew --prefix)/opt/llvm/bin:${PATH}"
+            export LDFLAGS="-L$(brew --prefix)/opt/libomp/lib -L$(brew --prefix)/opt/llvm/lib -Wl,-rpath,$(brew --prefix)/opt/llvm/lib"
+            nimble test -d:verbose -d:release -d:LeopardCmakeFlags="-DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=$(brew --prefix)/opt/llvm/bin/clang -DCMAKE_CXX_COMPILER=$(brew --prefix)/opt/llvm/bin/clang++" -d:LeopardExtraCompilerlags="-fopenmp" -d:LeopardExtraLinkerFlags="-fopenmp -L$(brew --prefix)/opt/libomp/lib"
+          else
+            nimble test -d:verbose -d:release
+          fi
+          if [[ ${{ matrix.platform.os }} = macos ]]; then
+            echo
+            echo otool -L tests/testleopard
+            otool -L tests/testleopard
+          else
+            echo
+            echo ldd tests/testleopard
+            ldd tests/testleopard
+          fi
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,13 @@
+*
+!*/
+!*.*
+*.a
+*.dll
+*.dylib
+*.exe
+*.so
+.DS_Store
+.idea
+.vscode
+leopard.nims
+TODO
--- a/.gitmodules
+++ b/.gitmodules
@ -1,5 +1,5 @@
 [submodule "vendor/leopard"]
 	path = vendor/leopard
-	url = https://github.com/catid/leopard.git
+	url = https://github.com/status-im/leopard.git
 	ignore = untracked
 	branch = master
--- a/README.md
+++ b/README.md
@ -3,20 +3,92 @@
 [![License: Apache](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Stability: experimental](https://img.shields.io/badge/stability-experimental-orange.svg)](https://github.com/status-im/nim-leopard#stability)
+[![Tests (GitHub Actions)](https://github.com/status-im/nim-leopard/workflows/Tests/badge.svg?branch=main)](https://github.com/status-im/nim-leopard/actions?query=workflow%3ATests+branch%3Amain)

 Nim wrapper for [Leopard-RS](https://github.com/catid/leopard): a fast library for [Reed-Solomon](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) erasure correction coding.

+## Requirements
+
+* Same as Leopard-RS' requirements, e.g. CMake 3.7 or newer.
+* Nim 1.2 or newer.
+
+
+## Installation
+
+With [Nimble](https://github.com/nim-lang/nimble)
+```text
+$ nimble install leopard
+```
+In a project's `.nimble` file
+```nim
+requires "leopard >= 0.0.1 & < 0.0.2"
+```
+In a [nimbus-build-system](https://github.com/status-im/nimbus-build-system) project
+```text
+$ git submodule add https://github.com/status-im/nim-leopard.git vendor/nim-leopard
+$ make update
+```
+
+### Submodule
+
+#### Init
+
+[status-im/leopard](https://github.com/status-im/leopard), a fork of [catid/leopard](https://github.com/catid/leopard) (Leopard-RS), is a submodule of nim-leopard.
+
+When nim-leopard is installed with `nimble install leopard`, or as a dependency in a Nimble project, or vendored in a nimbus-build-system project, submodule init is handled automatically.
+
+If the nim-leopard repo is cloned directly, then before running `nimble develop` or `nimble install` in the root of the clone, it's necessary to init the submodule
+```text
+$ git submodule update --init
+```
+
+#### Build
+
+The submodule is automatically built (in the `nimcache` dir) and statically linked during compilation of any Nim module that has `import leopard` or `import leopard/wrapper`.
+
+If the `nimcache` dir is set to a custom value, it must be an absolute path.
+
+For the build to work on Windows, `nimble` or `nim c` must be run from a Bash shell, e.g. Git Bash or an MSYS2 shell, and all needed tools (e.g. `cmake` and `make`) must be available in and suitable for that environment.
+
+##### OpenMP
+
+Leopard-RS' `CMakeLists.txt` checks for [OpenMP](https://en.wikipedia.org/wiki/OpenMP) support. If it is available then it is enabled in the build of `libleopard.a`.
+
+Build toolchains commonly installed on Linux and Windows come with support for OpenMP.
+
+The clang/++ compiler in Apple's Xcode does not support OpenMP, but the one installed with `brew install llvm` does support it, though it's also necessary to `brew install libomp`.
+
+So, on macOS, when running `nimble test` of nim-leopard or compiling a project that imports nim-leopard:
+* If libomp is not installed and Apple's clang is used, no extra flags need to be passed to the Nim compiler. OpenMP support will not be enabled in `libleopard.a`.
+* If libomp is installed and Apple's clang is used, this flag should be passed to `nim c`
+  ```text
+  -d:LeopardCmakeFlags="-DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=off"
+  ```
+* If the intent is to use brew-installed clang + libomp, the shell environment should be modified
+  ```text
+  $ export PATH="$(brew --prefix)/opt/llvm/bin:${PATH}"
+  $ export LDFLAGS="-L$(brew --prefix)/opt/libomp/lib -L$(brew --prefix)/opt/llvm/lib -Wl,-rpath,$(brew --prefix)/opt/llvm/lib"
+  ```
+  and these flags should be passed to `nim c`
+  ```text
+  -d:LeopardCmakeFlags="-DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=$(brew --prefix)/opt/llvm/bin/clang -DCMAKE_CXX_COMPILER=$(brew --prefix)/opt/llvm/bin/clang++" -d:LeopardExtraCompilerFlags="-fopenmp" -d:LeopardExtraLinkerFlags="-fopenmp -L$(brew --prefix)/opt/libomp/lib"
+  ```
+
 ## Usage

 TODO

+### OpenMP
+
+When OpenMP is enabled, whether or not parallel processing kicks in depends on the symbol and byte counts. On a local machine with an Intel processor, `RS(256,239)` with `symbolBytes == 64` seems to be the lower bound for triggering parallel processing.
+
 ## Versioning

-nim-leopard generally follows the upstream master branch.
+nim-leopard generally follows the upstream `master` branch such that changes there will result in a version bump for this package.

 ## Stability

-The API provided by this package is currently marked as experimental. Until it is marked as stable, it may be subject to breaking changes across any version bump.
+This package is currently marked as experimental. Until it is marked as stable, it may be subject to breaking changes across any version bump.

 ## License

--- a/config.nims
+++ b/config.nims
@ -0,0 +1,2 @@
+--threads:on
+--tlsEmulation:off
--- a/leopard.nim
+++ b/leopard.nim
@ -0,0 +1,12 @@
+## Nim-Leopard
+## Copyright (c) 2022 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+import ./leopard/leopard
+
+export leopard
--- a/leopard.nimble
+++ b/leopard.nimble
@ -5,7 +5,9 @@ version       = "0.0.1"
 author        = "Status Research & Development GmbH"
 description   = "A wrapper for Leopard-RS"
 license       = "Apache License 2.0 or MIT"
+installDirs   = @["vendor"]

 requires "nim >= 1.2.0",
-         "stew#head",
-         "unittest2"
+         "stew",
+         "unittest2",
+         "upraises >= 0.1.0 & < 0.2.0"
--- a/leopard/leopard.nim
+++ b/leopard/leopard.nim
@ -0,0 +1,266 @@
+## Nim-Leopard
+## Copyright (c) 2022 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+import pkg/upraises
+push: {.upraises: [].}
+
+{.deadCodeElim: on.}
+
+import pkg/stew/results
+
+import ./wrapper
+import ./utils
+
+export wrapper, results
+
+const
+  # every coder's `bufSize` must be a multiple of this many bytes
+  BuffMultiples* = 64
+
+type
+  # untyped view onto one aligned buffer obtained from `leoAlloc`
+  LeoBufferPtr* = ptr UncheckedArray[byte]
+
+  # selects which variant fields of `Leo` exist
+  LeoCoderKind* {.pure.} = enum
+    Encoder,
+    Decoder
+
+  # state shared by the encoder and the decoder; the buffer pointers are
+  # allocated by `init` and released by `free`
+  Leo* = object of RootObj
+    bufSize*: int                         # size of each buffer in bytes, a multiple of 64
+    buffers*: int                         # total number of data buffers (K)
+    parity*: int                          # total number of parity buffers (M)
+    dataBufferPtr: seq[LeoBufferPtr]      # aligned buffers the data is copied into before encoding/decoding
+    workBufferCount: int                  # number of parity work buffers (from `leoEncodeWorkCount`)
+    workBufferPtr: seq[LeoBufferPtr]      # aligned buffers where parity is written during encoding or copied to before decoding
+    case kind: LeoCoderKind
+    of LeoCoderKind.Decoder:
+      decodeBufferCount: int              # number of decoding work buffers (from `leoDecodeWorkCount`)
+      decodeBufferPtr: seq[LeoBufferPtr]  # aligned work buffers used for decoding; recovered data is read from here
+    of LeoCoderKind.Encoder:
+      discard
+
+  # concrete coder types accepted by `encode` and `decode` respectively
+  LeoEncoder* = object of Leo
+  LeoDecoder* = object of Leo
+
+func encode*(
+  self: var LeoEncoder,
+  data,
+  parity: var openArray[seq[byte]]): Result[void, cstring] =
+  ## Encode the buffers in `data` into `bufSize` sized `parity` buffers.
+  ##
+  ## `data`   - `self.buffers` original data buffers, each holding at least
+  ##            `bufSize` bytes; only the first `bufSize` bytes are read
+  ## `parity` - `self.parity` caller-allocated buffers, each holding at
+  ##            least `bufSize` bytes; the first `bufSize` bytes are
+  ##            overwritten on success
+  ##
+  ## Returns an error, leaving `parity` untouched, when the buffer counts or
+  ## sizes do not fit the encoder, when the encoder has no buffers (never
+  ## initialized or already freed), or when Leopard reports a failure.
+  ##
+
+  if data.len != self.buffers:
+    return err("Number of data buffers should match!")
+
+  if parity.len != self.parity:
+    return err("Number of parity buffers should match!")
+
+  # the internal buffers are indexed below, so they have to exist
+  if self.dataBufferPtr.len == 0 or self.workBufferPtr.len == 0:
+    return err("Encoder is not initialized or was freed!")
+
+  # every `copyMem` below moves exactly `bufSize` bytes, so a shorter buffer
+  # would be read or written out of bounds
+  for i in 0..<data.len:
+    if data[i].len < self.bufSize:
+      return err("Data buffers should be at least bufSize bytes!")
+
+  for i in 0..<parity.len:
+    if parity[i].len < self.bufSize:
+      return err("Parity buffers should be at least bufSize bytes!")
+
+  # zero encode work buffer to avoid corrupting with previous run
+  for i in 0..<self.workBufferCount:
+    zeroMem(self.workBufferPtr[i], self.bufSize)
+
+  # copy data into aligned buffer
+  for i in 0..<data.len:
+    copyMem(self.dataBufferPtr[i], addr data[i][0], self.bufSize)
+
+  let
+    res = leoEncode(
+      self.bufSize.culonglong,
+      self.buffers.cuint,
+      self.parity.cuint,
+      self.workBufferCount.cuint,
+      cast[ptr pointer](addr self.dataBufferPtr[0]),
+      cast[ptr pointer](addr self.workBufferPtr[0]))
+
+  if ord(res) != ord(LeopardSuccess):
+    return err(leoResultString(res.LeopardResult))
+
+  # the parity produced by Leopard sits in the first work buffers
+  for i in 0..<parity.len:
+    copyMem(addr parity[i][0], self.workBufferPtr[i], self.bufSize)
+
+  return ok()
+
+func decode*(
+  self: var LeoDecoder,
+  data,
+  parity,
+  recovered: var openArray[seq[byte]]): Result[void, cstring] =
+  ## Recover the erased `data` buffers from the surviving `data` and
+  ## `parity` buffers. An erased (missing) buffer is marked by an empty `seq`.
+  ##
+  ## `data`       - `self.buffers` entries; each present one holds at least
+  ##                `bufSize` bytes
+  ## `parity`     - `self.parity` entries; each present one holds at least
+  ##                `bufSize` bytes
+  ## `recovered`  - `self.buffers` entries; `recovered[i]` must hold at least
+  ##                `bufSize` bytes wherever `data[i]` is erased, and only
+  ##                those entries are written
+  ##
+  ## Returns an error when buffer counts or sizes do not fit the decoder,
+  ## when the decoder has no buffers (never initialized or already freed),
+  ## or when Leopard reports a failure.
+  ##
+
+  if data.len != self.buffers:
+    return err("Number of data buffers should match!")
+
+  if parity.len != self.parity:
+    return err("Number of parity buffers should match!")
+
+  if recovered.len != self.buffers:
+    return err("Number of recovered buffers should match buffers!")
+
+  # the decode fields only exist for `Decoder` and all internal buffers are
+  # indexed below, so check the kind first and then that the buffers exist
+  if self.kind != LeoCoderKind.Decoder or
+     self.dataBufferPtr.len == 0 or
+     self.workBufferPtr.len == 0 or
+     self.decodeBufferPtr.len == 0:
+    return err("Decoder is not initialized or was freed!")
+
+  # every `copyMem` below moves exactly `bufSize` bytes, so reject buffers
+  # that are too short before anything is read or written
+  for i in 0..<data.len:
+    if data[i].len == 0:
+      if recovered[i].len < self.bufSize:
+        return err("Recovered buffers should be at least bufSize bytes!")
+    elif data[i].len < self.bufSize:
+      return err("Data buffers should be at least bufSize bytes!")
+
+  for i in 0..<parity.len:
+    if parity[i].len > 0 and parity[i].len < self.bufSize:
+      return err("Parity buffers should be at least bufSize bytes!")
+
+  # clean out work and data buffers
+  for i in 0..<self.workBufferCount:
+    zeroMem(self.workBufferPtr[i], self.bufSize)
+
+  for i in 0..<self.decodeBufferCount:
+    zeroMem(self.decodeBufferPtr[i], self.bufSize)
+
+  for i in 0..<data.len:
+    zeroMem(self.dataBufferPtr[i], self.bufSize)
+
+  # this is needed because erasures are nil pointers
+  var
+    dataPtr = newSeq[LeoBufferPtr](data.len)
+    parityPtr = newSeq[LeoBufferPtr](self.workBufferCount)
+
+  # copy data into aligned buffer
+  for i in 0..<data.len:
+    if data[i].len > 0:
+      copyMem(self.dataBufferPtr[i], addr data[i][0], self.bufSize)
+      dataPtr[i] = self.dataBufferPtr[i]
+    else:
+      dataPtr[i] = nil
+
+  # copy parity into aligned buffer
+  for i in 0..<self.workBufferCount:
+    if i < parity.len and parity[i].len > 0:
+      copyMem(self.workBufferPtr[i], addr parity[i][0], self.bufSize)
+      parityPtr[i] = self.workBufferPtr[i]
+    else:
+      parityPtr[i] = nil
+
+  let
+    res = leoDecode(
+      self.bufSize.culonglong,
+      self.buffers.cuint,
+      self.parity.cuint,
+      self.decodeBufferCount.cuint,
+      cast[ptr pointer](addr dataPtr[0]),
+      cast[ptr pointer](addr parityPtr[0]),
+      cast[ptr pointer](addr self.decodeBufferPtr[0]))
+
+  if ord(res) != ord(LeopardSuccess):
+    return err(leoResultString(res.LeopardResult))
+
+  # only the erased entries are handed back; they are read from the decode
+  # work buffer with the same index
+  for i, p in dataPtr:
+    if p.isNil:
+      copyMem(addr recovered[i][0], self.decodeBufferPtr[i], self.bufSize)
+
+  ok()
+
+func free*(self: var Leo) =
+  ## Release every aligned buffer owned by `self` and empty the pointer
+  ## lists. Pointers are set to nil as they are freed, and the lists are
+  ## emptied, so calling `free` again finds nothing left to release.
+
+  template release(bufs: untyped) =
+    # free each non-nil buffer, clear its slot, then drop the slots
+    for slot in bufs.mitems:
+      if not slot.isNil:
+        slot.leoFree()
+        slot = nil
+    bufs.setLen(0)
+
+  release(self.workBufferPtr)
+  release(self.dataBufferPtr)
+
+  # the decode buffers only exist in the `Decoder` variant
+  if self.kind == LeoCoderKind.Decoder:
+    release(self.decodeBufferPtr)
+
+# TODO: The destructor doesn't behave as
+# I'd expect it, it's called many more times
+# than it should. This is however, most
+# likely my misinterpretation of how it should
+# work.
+# proc `=destroy`*(self: var Leo) =
+#   self.free()
+
+proc init[TT: Leo](
+  T: type TT,
+  bufSize,
+  buffers,
+  parity: int,
+  kind: LeoCoderKind): Result[T, cstring] =
+  ## Shared constructor behind `LeoEncoder.init` and `LeoDecoder.init`.
+  ##
+  ## `bufSize` - size of every buffer in bytes, a positive multiple of 64
+  ## `buffers` - number of data buffers (K), at least 1
+  ## `parity`  - number of parity buffers (M), between 1 and `buffers`
+  ## `kind`    - which coder variant to build
+  ##
+  ## Returns the coder with all of its aligned buffers allocated, or an
+  ## error when an argument is out of range, when Leopard fails to
+  ## initialise, or when a buffer cannot be allocated.
+
+  if bufSize <= 0 or bufSize mod BuffMultiples != 0:
+    return err("bufSize should be multiples of 64 bytes!")
+
+  # zero or negative counts would otherwise fail the `cuint` conversions
+  # below with a defect instead of an error result
+  if buffers <= 0:
+    return err("number of data buffers should be greater than zero!")
+
+  if parity <= 0:
+    return err("number of parity buffers should be greater than zero!")
+
+  if parity > buffers:
+    return err("number of parity buffers cannot exceed number of data buffers!")
+
+  if (buffers + parity) > 65536:
+    return err("number of parity and data buffers cannot exceed 65536!")
+
+  # Initialise the Leopard library before any other call into it. Unlike a
+  # `once` block, the flag is only set after `leoInit` succeeded, so a failed
+  # attempt is retried (and reported again) by the next call instead of being
+  # silently skipped.
+  # NOTE(review): as before, this is not synchronised across threads and the
+  # flag exists once per generic instantiation — confirm that repeated
+  # `leoInit` calls are harmless.
+  var leoReady {.global.} = false
+  if not leoReady:
+    let res = leoInit()
+    if res.ord != LeopardSuccess.ord:
+      return err(leoResultString(res.LeopardResult))
+    leoReady = true
+
+  var
+    self = T(
+      kind: kind,
+      bufSize: bufSize,
+      buffers: buffers,
+      parity: parity)
+
+  template allocBuffers(dest: untyped, count: int) =
+    # fill `dest` with `count` aligned buffers; on failure release whatever
+    # was allocated so far and bail out of `init`
+    for _ in 0..<count:
+      let buf = cast[LeoBufferPtr](self.bufSize.leoAlloc())
+      if buf.isNil:
+        self.free()
+        return err("failed to allocate aligned buffer!")
+      dest.add(buf)
+
+  self.workBufferCount = leoEncodeWorkCount(
+    buffers.cuint,
+    parity.cuint).int
+
+  # initialize encode work buffers
+  allocBuffers(self.workBufferPtr, self.workBufferCount)
+
+  # initialize data buffers
+  allocBuffers(self.dataBufferPtr, self.buffers)
+
+  if self.kind == LeoCoderKind.Decoder:
+    self.decodeBufferCount = leoDecodeWorkCount(
+      buffers.cuint,
+      parity.cuint).int
+
+    # initialize decode work buffers
+    allocBuffers(self.decodeBufferPtr, self.decodeBufferCount)
+
+  ok(self)
+
+proc init*(
+  T: type LeoEncoder,
+  bufSize,
+  buffers,
+  parity: int): Result[LeoEncoder, cstring] =
+  ## Create an encoder for `buffers` data buffers and `parity` parity
+  ## buffers of `bufSize` bytes each. `bufSize` has to be a multiple of 64,
+  ## `parity` may not exceed `buffers`, and together they may not exceed
+  ## 65536; otherwise an error is returned.
+  init(LeoEncoder, bufSize, buffers, parity, LeoCoderKind.Encoder)
+
+proc init*(
+  T: type LeoDecoder,
+  bufSize,
+  buffers,
+  parity: int): Result[LeoDecoder, cstring] =
+  ## Create a decoder for `buffers` data buffers and `parity` parity
+  ## buffers of `bufSize` bytes each. `bufSize` has to be a multiple of 64,
+  ## `parity` may not exceed `buffers`, and together they may not exceed
+  ## 65536; otherwise an error is returned.
+  init(LeoDecoder, bufSize, buffers, parity, LeoCoderKind.Decoder)
--- a/leopard/utils.nim
+++ b/leopard/utils.nim
@ -0,0 +1,4 @@
+import ./utils/allocs
+import ./utils/cpuinfo_x86
+
+export cpuinfo_x86, allocs
--- a/leopard/utils/allocs.nim
+++ b/leopard/utils/allocs.nim
@ -0,0 +1,80 @@
+## Nim-Leopard
+## Copyright (c) 2022 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+import pkg/upraises
+push: {.upraises: [].}
+
+{.deadCodeElim: on.}
+
+import system/ansi_c
+
+import ./cpuinfo_x86
+
+## inspired by https://github.com/mratsim/weave/blob/master/weave/memory/allocs.nim
+
+let
+  # alignment, in bytes, that `leoAlloc` requests for every buffer:
+  # 32 when `hasAvx2()` reports AVX2 support, 16 otherwise
+  LeoAlignBytes* = if hasAvx2(): 32'u else: 16'u
+
+# Platform specific aligned allocation. Every branch provides the same pair:
+#   alignedAlloc(alignment, size) - nil on failure
+#   alignedFree(p)                - releases a pointer from `alignedAlloc`
+when defined(windows):
+  proc alignedAllocWindows(size, alignment: csize_t): pointer
+    {.importc: "_aligned_malloc", header: "<malloc.h>".}
+    # Beware of the arg order!
+
+  proc alignedAlloc(alignment, size: csize_t): pointer =
+    # adapt `_aligned_malloc(size, alignment)` to the (alignment, size)
+    # order shared by the other platforms
+    alignedAllocWindows(size, alignment)
+
+  proc alignedFree*[T](p: ptr T)
+    {.importc: "_aligned_free", header: "<malloc.h>".}
+elif defined(osx):
+  proc posix_memalign(mem: var pointer, alignment, size: csize_t): cint
+    {.importc, header:"<stdlib.h>".}
+
+  proc alignedAlloc(alignment, size: csize_t): pointer {.inline.} =
+    # `posix_memalign` reports failure through its return code and does not
+    # promise anything about the out-parameter then, so normalise to nil
+    if posix_memalign(result, alignment, size) != 0:
+      result = nil
+
+  proc alignedFree*[T](p: ptr T) {.inline.} =
+    c_free(p)
+elif defined(unix):
+  proc alignedAlloc(alignment, size: csize_t): pointer
+    {.importc: "aligned_alloc", header: "<stdlib.h>".}
+
+  proc alignedFree*[T](p: ptr T) {.inline.} =
+    c_free(p)
+else:
+  {.warning: "Falling back to manual pointer alignment, this is highly inefficient!".}
+  proc alignedAlloc(alignment, size: csize_t): pointer {.inline.} =
+    # Over-allocate with `c_malloc`, round up to the next multiple of
+    # `alignment` (which must be a power of two) and remember the pointer
+    # returned by `c_malloc` in the slot right before the aligned address,
+    # so that `alignedFree` can hand it back to `c_free`.
+    let
+      header = csize_t(sizeof(pointer))
+      raw = c_malloc(size + alignment + header)
+
+    if not isNil(raw):
+      let
+        mask = uint(alignment) - 1'u
+        aligned = (cast[uint](raw) + uint(header) + mask) and not mask
+
+      cast[ptr pointer](aligned - uint(header))[] = raw
+      result = cast[pointer](aligned)
+
+  proc alignedFree*[T](p: ptr T) {.inline.} =
+    # recover the pointer stored by `alignedAlloc` and free that one
+    if not isNil(p):
+      c_free(cast[ptr pointer](cast[uint](p) - uint(sizeof(pointer)))[])
+
+proc leoAlloc*(size: Positive): pointer {.inline.} =
+  ## Allocate `size` bytes aligned to `LeoAlignBytes`. The pointer from the
+  ## platform allocator is returned unchecked.
+  let bytes = csize_t(size)
+  result = alignedAlloc(LeoAlignBytes, bytes)
+
+proc leoFree*[T](p: ptr T) =
+  ## Release a buffer through the platform's `alignedFree`, the counterpart
+  ## of the allocator used by `leoAlloc`.
+  p.alignedFree()
--- a/leopard/utils/cpuinfo_x86.nim
+++ b/leopard/utils/cpuinfo_x86.nim
@ -0,0 +1,793 @@
+## Nim-Leopard
+## Copyright (c) 2022 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+import pkg/upraises
+push: {.upraises: [].}
+
+{.deadCodeElim: on.}
+
+# From awr1: https://github.com/nim-lang/Nim/pull/11816/files
+
+proc cpuidX86(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] {.used.}=
+  ## Execute the x86 `cpuid` instruction with `eaxi` loaded into EAX and
+  ## `ecxi` loaded into ECX, and return the four registers it writes.
+  when defined(vcc):
+    # limited inline asm support in vcc, so intrinsics, here we go:
+    proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32)
+      {.cdecl, importc: "__cpuidex", header: "intrin.h".}
+    # the intrinsic receives the address of the first tuple field
+    # NOTE(review): presumably it fills all four fields in eax, ebx, ecx,
+    # edx order — confirm against the tuple layout
+    cpuidVcc(addr result.eax, eaxi, ecxi)
+  else:
+    var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
+    # extended asm: the first constraint list binds the output registers
+    # a/b/c/d to the locals, the second feeds `eaxi`/`ecxi` in through a/c
+    asm """
+      cpuid
+      :"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`)
+      :"a"(`eaxi`), "c"(`ecxi`)"""
+    (eaxr, ebxr, ecxr, edxr)
+
+proc cpuNameX86(): string {.used.}=
+  ## Return the text stored in the registers of CPUID leaves 0x80000002 to
+  ## 0x80000004 (presumably the processor brand string).
+  # three leaves of four 32-bit registers each are reinterpreted as 48 chars
+  var leaves {.global.} = cast[array[48, char]]([
+    cpuidX86(eaxi = 0x80000002'i32, ecxi = 0),
+    cpuidX86(eaxi = 0x80000003'i32, ecxi = 0),
+    cpuidX86(eaxi = 0x80000004'i32, ecxi = 0)])
+  # NOTE(review): the `cstring` conversion stops at the first NUL byte and
+  # assumes one occurs within the 48 bytes — confirm
+  result = $cast[cstring](addr leaves[0])
+
+type
+  # features that `testX86Feature` can look up in the cached CPUID leaves;
+  # the enum is private to this module
+  X86Feature {.pure.} = enum
+    HypervisorPresence, Hyperthreading, NoSMT, IntelVtx, Amdv, X87fpu, Mmx,
+    MmxExt, F3DNow, F3DNowEnhanced, Prefetch, Sse, Sse2, Sse3, Ssse3, Sse4a,
+    Sse41, Sse42, Avx, Avx2, Avx512f, Avx512dq, Avx512ifma, Avx512pf,
+    Avx512er, Avx512cd, Avx512bw, Avx512vl, Avx512vbmi, Avx512vbmi2,
+    Avx512vpopcntdq, Avx512vnni, Avx512vnniw4, Avx512fmaps4, Avx512bitalg,
+    Avx512bfloat16, Avx512vp2intersect, Rdrand, Rdseed, MovBigEndian, Popcnt,
+    Fma3, Fma4, Xop, Cas8B, Cas16B, Abm, Bmi1, Bmi2, TsxHle, TsxRtm, Adx, Sgx,
+    Gfni, Aes, Vaes, Vpclmulqdq, Pclmulqdq, NxBit, Float16c, Sha, Clflush,
+    ClflushOpt, Clwb, PrefetchWT1, Mpx
+
+let
+  # CPUID results queried once, when the module is initialised, and then
+  # shared by every feature test; each is read with ECX = 0
+  leaf1 = cpuidX86(eaxi = 1, ecxi = 0)
+  leaf7 = cpuidX86(eaxi = 7, ecxi = 0)
+  # "leaf8" is the extended leaf 0x80000001
+  leaf8 = cpuidX86(eaxi = 0x80000001'i32, ecxi = 0)
+
+# The reason why we don't just evaluate these directly in the `let` variable
+# list is so that we can internally organize features by their input (leaf)
+# and output registers.
+proc testX86Feature(feature: X86Feature): bool =
+  ## Report whether the CPUID bit that stands for `feature` is set.
+  ## Branches are grouped by the leaf and output register that carry the bit.
+  proc test(input, bit: int): bool =
+    # true when bit number `bit` of `input` is set
+    ((1 shl bit) and input) != 0
+
+  # see: https://en.wikipedia.org/wiki/CPUID#Calling_CPUID
+  # see: Intel® Architecture Instruction Set Extensions and Future Features
+  #      Programming Reference
+  result = case feature
+    # leaf 1, edx
+    of X87fpu: leaf1.edx.test(0)
+    of Clflush: leaf1.edx.test(19)
+    of Mmx: leaf1.edx.test(23)
+    of Sse: leaf1.edx.test(25)
+    of Sse2: leaf1.edx.test(26)
+    of Hyperthreading: leaf1.edx.test(28)
+
+    # leaf 1, ecx
+    of Sse3: leaf1.ecx.test(0)
+    of Pclmulqdq: leaf1.ecx.test(1)
+    of IntelVtx: leaf1.ecx.test(5)
+    of Ssse3: leaf1.ecx.test(9)
+    of Fma3: leaf1.ecx.test(12)
+    of Cas16B: leaf1.ecx.test(13)
+    of Sse41: leaf1.ecx.test(19)
+    of Sse42: leaf1.ecx.test(20)
+    of MovBigEndian: leaf1.ecx.test(22)
+    of Popcnt: leaf1.ecx.test(23)
+    of Aes: leaf1.ecx.test(25)
+    of Avx: leaf1.ecx.test(28)
+    of Float16c: leaf1.ecx.test(29)
+    of Rdrand: leaf1.ecx.test(30)
+    of HypervisorPresence: leaf1.ecx.test(31)
+
+    # leaf 7, ecx
+    of PrefetchWT1: leaf7.ecx.test(0)
+    of Avx512vbmi: leaf7.ecx.test(1)
+    of Avx512vbmi2: leaf7.ecx.test(6)
+    of Gfni: leaf7.ecx.test(8)
+    of Vaes: leaf7.ecx.test(9)
+    of Vpclmulqdq: leaf7.ecx.test(10)
+    of Avx512vnni: leaf7.ecx.test(11)
+    of Avx512bitalg: leaf7.ecx.test(12)
+    of Avx512vpopcntdq: leaf7.ecx.test(14)
+
+    # leaf 7 sub-leaf 1, eax
+    of Avx512bfloat16:
+      # EAX of sub-leaf 0 holds the highest supported sub-leaf index rather
+      # than feature bits; AVX512_BF16 is bit 5 of EAX in sub-leaf 1, which
+      # may only be queried when that sub-leaf exists
+      leaf7.eax >= 1 and cpuidX86(eaxi = 7, ecxi = 1).eax.test(5)
+
+    # leaf 7, ebx
+    of Sgx: leaf7.ebx.test(2)
+    of Bmi1: leaf7.ebx.test(3)
+    of TsxHle: leaf7.ebx.test(4)
+    of Avx2: leaf7.ebx.test(5)
+    of Bmi2: leaf7.ebx.test(8)
+    of TsxRtm: leaf7.ebx.test(11)
+    of Mpx: leaf7.ebx.test(14)
+    of Avx512f: leaf7.ebx.test(16)
+    of Avx512dq: leaf7.ebx.test(17)
+    of Rdseed: leaf7.ebx.test(18)
+    of Adx: leaf7.ebx.test(19)
+    of Avx512ifma: leaf7.ebx.test(21)
+    of ClflushOpt: leaf7.ebx.test(23)
+    of Clwb: leaf7.ebx.test(24)
+    of Avx512pf: leaf7.ebx.test(26)
+    of Avx512er: leaf7.ebx.test(27)
+    of Avx512cd: leaf7.ebx.test(28)
+    of Sha: leaf7.ebx.test(29)
+    of Avx512bw: leaf7.ebx.test(30)
+    of Avx512vl: leaf7.ebx.test(31)
+
+    # leaf 7, edx
+    of Avx512vnniw4: leaf7.edx.test(2)
+    of Avx512fmaps4: leaf7.edx.test(3)
+    of Avx512vp2intersect: leaf7.edx.test(8)
+
+    # leaf 8, edx
+    of NoSMT: leaf8.edx.test(1)
+    of Cas8B: leaf8.edx.test(8)
+    of NxBit: leaf8.edx.test(20)
+    of MmxExt: leaf8.edx.test(22)
+    of F3DNowEnhanced: leaf8.edx.test(30)
+    of F3DNow: leaf8.edx.test(31)
+
+    # leaf 8, ecx
+    of Amdv: leaf8.ecx.test(2)
+    of Abm: leaf8.ecx.test(5)
+    of Sse4a: leaf8.ecx.test(6)
+    of Prefetch: leaf8.ecx.test(8)
+    of Xop: leaf8.ecx.test(11)
+    of Fma4: leaf8.ecx.test(16)
+
+# Feature flags evaluated once, when the module is initialised. Each `*Impl`
+# value caches the `testX86Feature` result for the feature named in it.
+let
+  isHypervisorPresentImpl = testX86Feature(HypervisorPresence)
+  hasSimultaneousMultithreadingImpl =
+    testX86Feature(Hyperthreading) or not testX86Feature(NoSMT)
+  hasIntelVtxImpl = testX86Feature(IntelVtx)
+  hasAmdvImpl = testX86Feature(Amdv)
+  hasX87fpuImpl = testX86Feature(X87fpu)
+  hasMmxImpl = testX86Feature(Mmx)
+  hasMmxExtImpl = testX86Feature(MmxExt)
+  has3DNowImpl = testX86Feature(F3DNow)
+  has3DNowEnhancedImpl = testX86Feature(F3DNowEnhanced)
+  hasPrefetchImpl = testX86Feature(Prefetch) or testX86Feature(F3DNow)
+  hasSseImpl = testX86Feature(Sse)
+  hasSse2Impl = testX86Feature(Sse2)
+  hasSse3Impl = testX86Feature(Sse3)
+  hasSsse3Impl = testX86Feature(Ssse3)
+  hasSse4aImpl = testX86Feature(Sse4a)
+  hasSse41Impl = testX86Feature(Sse41)
+  hasSse42Impl = testX86Feature(Sse42)
+  hasAvxImpl = testX86Feature(Avx)
+  hasAvx2Impl = testX86Feature(Avx2)
+  hasAvx512fImpl = testX86Feature(Avx512f)
+  hasAvx512dqImpl = testX86Feature(Avx512dq)
+  hasAvx512ifmaImpl = testX86Feature(Avx512ifma)
+  hasAvx512pfImpl = testX86Feature(Avx512pf)
+  hasAvx512erImpl = testX86Feature(Avx512er)
+  hasAvx512cdImpl = testX86Feature(Avx512cd)
+  hasAvx512bwImpl = testX86Feature(Avx512bw)
+  hasAvx512vlImpl = testX86Feature(Avx512vl)
+  hasAvx512vbmiImpl = testX86Feature(Avx512vbmi)
+  hasAvx512vbmi2Impl = testX86Feature(Avx512vbmi2)
+  hasAvx512vpopcntdqImpl = testX86Feature(Avx512vpopcntdq)
+  hasAvx512vnniImpl = testX86Feature(Avx512vnni)
+  hasAvx512vnniw4Impl = testX86Feature(Avx512vnniw4)
+  hasAvx512fmaps4Impl = testX86Feature(Avx512fmaps4)
+  hasAvx512bitalgImpl = testX86Feature(Avx512bitalg)
+  hasAvx512bfloat16Impl = testX86Feature(Avx512bfloat16)
+  hasAvx512vp2intersectImpl = testX86Feature(Avx512vp2intersect)
+  hasRdrandImpl = testX86Feature(Rdrand)
+  hasRdseedImpl = testX86Feature(Rdseed)
+  hasMovBigEndianImpl = testX86Feature(MovBigEndian)
+  hasPopcntImpl = testX86Feature(Popcnt)
+  hasFma3Impl = testX86Feature(Fma3)
+  hasFma4Impl = testX86Feature(Fma4)
+  hasXopImpl = testX86Feature(Xop)
+  hasCas8BImpl = testX86Feature(Cas8B)
+  hasCas16BImpl = testX86Feature(Cas16B)
+  hasAbmImpl = testX86Feature(Abm)
+  hasBmi1Impl = testX86Feature(Bmi1)
+  hasBmi2Impl = testX86Feature(Bmi2)
+  hasTsxHleImpl = testX86Feature(TsxHle)
+  hasTsxRtmImpl = testX86Feature(TsxRtm)
+  hasAdxImpl = testX86Feature(Adx)
+  hasSgxImpl = testX86Feature(Sgx)
+  hasGfniImpl = testX86Feature(Gfni)
+  hasAesImpl = testX86Feature(Aes)
+  hasVaesImpl = testX86Feature(Vaes)
+  hasVpclmulqdqImpl = testX86Feature(Vpclmulqdq)
+  hasPclmulqdqImpl = testX86Feature(Pclmulqdq)
+  hasNxBitImpl = testX86Feature(NxBit)
+  hasFloat16cImpl = testX86Feature(Float16c)
+  hasShaImpl = testX86Feature(Sha)
+  hasClflushImpl = testX86Feature(Clflush)
+  hasClflushOptImpl = testX86Feature(ClflushOpt)
+  hasClwbImpl = testX86Feature(Clwb)
+  hasPrefetchWT1Impl = testX86Feature(PrefetchWT1)
+  hasMpxImpl = testX86Feature(Mpx)
+
+# NOTE: We use procedures here (layered over the variables) to keep the API
+# consistent and usable against possible future heterogeneous systems with ISA
+# differences between cores (a possibility that has historical precedents, for
+# instance, the PPU/SPU relationship found on the IBM Cell). If future systems
+# do end up having disparate ISA features across multiple cores, expect there to
+# be a "cpuCore" argument added to the feature procs.
+
+proc isHypervisorPresent*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if this application appears to be running inside of a
+  ## virtual machine (the check is by no means foolproof).
+  isHypervisorPresentImpl
+
+proc hasSimultaneousMultithreading*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware is utilizing simultaneous multithreading
+  ## (branded as *"hyperthreads"* on Intel processors).
+  hasSimultaneousMultithreadingImpl
+
+proc hasIntelVtx*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the Intel virtualization extensions (VT-x) are available.
+  hasIntelVtxImpl
+
+proc hasAmdv*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the AMD virtualization extensions (AMD-V) are available.
+  hasAmdvImpl
+
+proc hasX87fpu*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use x87 floating-point instructions
+  ## (includes support for single, double, and 80-bit precision floats as per
+  ## IEEE 754-1985).
+  ##
+  ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be
+  ## `true` on 64-bit x86 processors. It should be noted that support of these
+  ## instructions is deprecated on 64-bit versions of Windows - see MSDN_.
+  ##
+  ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms
+  hasX87fpuImpl
+
+proc hasMmx*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use MMX SIMD instructions.
+  ##
+  ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be
+  ## `true` on 64-bit x86 processors. It should be noted that support of these
+  ## instructions is deprecated on 64-bit versions of Windows (see MSDN_ for
+  ## more info).
+  ##
+  ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms
+  hasMmxImpl
+
+proc hasMmxExt*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use "Extended MMX" SIMD instructions.
+  ##
+  ## It should be noted that support of these instructions is deprecated on
+  ## 64-bit versions of Windows (see MSDN_ for more info).
+  ##
+  ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms
+  hasMmxExtImpl
+
+proc has3DNow*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use 3DNow! SIMD instructions.
+  ##
+  ## It should be noted that support of these instructions is deprecated on
+  ## 64-bit versions of Windows (see MSDN_ for more info), and that the 3DNow!
+  ## instructions (with an exception made for the prefetch instructions, see the
+  ## `hasPrefetch` procedure) have been phased out of AMD processors since 2010
+  ## (see `AMD Developer Central`_ for more info).
+  ##
+  ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms
+  ## .. _`AMD Developer Central`: https://web.archive.org/web/20131109151245/http://developer.amd.com/community/blog/2010/08/18/3dnow-deprecated/
+  has3DNowImpl
+
+proc has3DNowEnhanced*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use "Enhanced 3DNow!" SIMD instructions.
+  ##
+  ## It should be noted that support of these instructions is deprecated on
+  ## 64-bit versions of Windows (see MSDN_ for more info), and that the 3DNow!
+  ## instructions (with an exception made for the prefetch instructions, see the
+  ## `hasPrefetch` procedure) have been phased out of AMD processors since 2010
+  ## (see `AMD Developer Central`_ for more info).
+  ##
+  ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms
+  ## .. _`AMD Developer Central`: https://web.archive.org/web/20131109151245/http://developer.amd.com/community/blog/2010/08/18/3dnow-deprecated/
+  has3DNowEnhancedImpl
+
+proc hasPrefetch*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the `PREFETCH` and `PREFETCHW`
+  ## instructions. These instructions were originally included as part of
+  ## 3DNow!, but are potentially independent from the rest of it due to changes
+  ## in contemporary AMD processors (see the `has3DNow` procedure).
+  hasPrefetchImpl
+
+proc hasSse*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions)
+  ## 1.0 instructions, which introduced 128-bit SIMD on x86 machines.
+  ##
+  ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be
+  ## `true` on 64-bit x86 processors.
+  hasSseImpl
+
+proc hasSse2*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions)
+  ## 2.0 instructions.
+  ##
+  ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be
+  ## `true` on 64-bit x86 processors.
+  hasSse2Impl
+
+proc hasSse3*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions)
+  ## 3.0 instructions.
+  hasSse3Impl
+
+proc hasSsse3*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the Supplemental SSE (Streaming SIMD
+  ## Extensions) 3.0 instructions.
+  hasSsse3Impl
+
+proc hasSse4a*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions)
+  ## 4a instructions.
+  hasSse4aImpl
+
+proc hasSse41*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions)
+  ## 4.1 instructions.
+  hasSse41Impl
+
+proc hasSse42*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions)
+  ## 4.2 instructions.
+  hasSse42Impl
+
+proc hasAvx*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 1.0 instructions, which introduced 256-bit SIMD on x86 machines along with
+  ## added re-encoded versions of prior 128-bit SSE instructions in the more
+  ## code-dense and non-backward compatible VEX (Vector Extensions) format.
+  hasAvxImpl
+
+proc hasAvx2*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) 2.0
+  ## instructions.
+  hasAvx2Impl
+
+proc hasAvx512f*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit F (Foundation) instructions.
+  hasAvx512fImpl
+
+proc hasAvx512dq*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit DQ (Doubleword + Quadword) instructions.
+  hasAvx512dqImpl
+
+proc hasAvx512ifma*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit IFMA (Integer Fused Multiply Accumulation) instructions.
+  hasAvx512ifmaImpl
+
+proc hasAvx512pf*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit PF (Prefetch) instructions.
+  hasAvx512pfImpl
+
+proc hasAvx512er*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit ER (Exponential and Reciprocal) instructions.
+  hasAvx512erImpl
+
+proc hasAvx512cd*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit CD (Conflict Detection) instructions.
+  hasAvx512cdImpl
+
+proc hasAvx512bw*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit BW (Byte and Word) instructions.
+  hasAvx512bwImpl
+
+proc hasAvx512vl*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit VL (Vector Length) instructions.
+  hasAvx512vlImpl
+
+proc hasAvx512vbmi*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit VBMI (Vector Byte Manipulation) 1.0 instructions.
+  hasAvx512vbmiImpl
+
+proc hasAvx512vbmi2*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit VBMI (Vector Byte Manipulation) 2.0 instructions.
+  hasAvx512vbmi2Impl
+
+proc hasAvx512vpopcntdq*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use the AVX (Advanced Vector Extensions)
+  ## 512-bit `VPOPCNTDQ` (population count, i.e. determine number of flipped
+  ## bits) instruction.
+  hasAvx512vpopcntdqImpl
+
+proc hasAvx512vnni*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit VNNI (Vector Neural Network) instructions.
+  hasAvx512vnniImpl
+
+proc hasAvx512vnniw4*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit 4VNNIW (Vector Neural Network Word Variable Precision)
+  ## instructions.
+  hasAvx512vnniw4Impl
+
+proc hasAvx512fmaps4*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit 4FMAPS (Fused-Multiply-Accumulation Single-precision) instructions.
+  hasAvx512fmaps4Impl
+
+proc hasAvx512bitalg*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit BITALG (Bit Algorithms) instructions.
+  hasAvx512bitalgImpl
+
+proc hasAvx512bfloat16*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit BFLOAT16 (8-bit exponent, 7-bit mantissa) instructions used by
+  ## Intel DL (Deep Learning) Boost.
+  hasAvx512bfloat16Impl
+
+proc hasAvx512vp2intersect*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions)
+  ## 512-bit VP2INTERSECT (Compute Intersections between Dualwords + Quadwords)
+  ## instructions.
+  hasAvx512vp2intersectImpl
+
+proc hasRdrand*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `RDRAND` instruction,
+  ## i.e. Intel on-CPU hardware random number generation.
+  hasRdrandImpl
+
+proc hasRdseed*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `RDSEED` instruction,
+  ## i.e. Intel on-CPU hardware random number generation (used for seeding other
+  ## PRNGs).
+  hasRdseedImpl
+
+proc hasMovBigEndian*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `MOVBE` instruction for
+  ## endianness/byte-order switching.
+  hasMovBigEndianImpl
+
+proc hasPopcnt*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `POPCNT` (population
+  ## count, i.e. determine number of flipped bits) instruction.
+  hasPopcntImpl
+
+proc hasFma3*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the FMA3 (Fused Multiply
+  ## Accumulation 3-operand) SIMD instructions.
+  hasFma3Impl
+
+proc hasFma4*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the FMA4 (Fused Multiply
+  ## Accumulation 4-operand) SIMD instructions.
+  hasFma4Impl
+
+proc hasXop*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the XOP (eXtended
+  ## Operations) SIMD instructions. These instructions are exclusive to the
+  ## Bulldozer AMD microarchitecture family (i.e. Bulldozer, Piledriver,
+  ## Steamroller, and Excavator) and were phased out with the release of the Zen
+  ## design.
+  hasXopImpl
+
+proc hasCas8B*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the (`LOCK`-able)
+  ## `CMPXCHG8B` 64-bit compare-and-swap instruction.
+  hasCas8BImpl
+
+proc hasCas16B*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the (`LOCK`-able)
+  ## `CMPXCHG16B` 128-bit compare-and-swap instruction.
+  hasCas16BImpl
+
+proc hasAbm*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for ABM (Advanced Bit
+  ## Manipulation) instructions (i.e. `POPCNT` and `LZCNT` for counting leading
+  ## zeroes).
+  hasAbmImpl
+
+proc hasBmi1*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for BMI (Bit Manipulation) 1.0
+  ## instructions.
+  hasBmi1Impl
+
+proc hasBmi2*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for BMI (Bit Manipulation) 2.0
+  ## instructions.
+  hasBmi2Impl
+
+proc hasTsxHle*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for HLE (Hardware Lock Elision)
+  ## as part of Intel's TSX (Transactional Synchronization Extensions).
+  hasTsxHleImpl
+
+proc hasTsxRtm*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for RTM (Restricted
+  ## Transactional Memory) as part of Intel's TSX (Transactional Synchronization
+  ## Extensions).
+  hasTsxRtmImpl
+
+proc hasAdx*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for ADX (Multi-precision
+  ## Add-Carry Extensions) instructions.
+  hasAdxImpl
+
+proc hasSgx*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for SGX (Software Guard
+  ## eXtensions) memory encryption technology.
+  hasSgxImpl
+
+proc hasGfni*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for GFNI (Galois Field Affine
+  ## Transformation) instructions.
+  hasGfniImpl
+
+proc hasAes*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for AESNI (Advanced Encryption
+  ## Standard) instructions.
+  hasAesImpl
+
+proc hasVaes*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for VAES (Vectorized Advanced
+  ## Encryption Standard) instructions.
+  hasVaesImpl
+
+proc hasVpclmulqdq*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for `VPCLMULQDQ` (512 and
+  ## 256-bit Carryless Multiplication) instructions.
+  hasVpclmulqdqImpl
+
+proc hasPclmulqdq*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for `PCLMULQDQ` (128-bit
+  ## Carryless Multiplication) instructions.
+  hasPclmulqdqImpl
+
+proc hasNxBit*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for NX-bit (No-eXecute)
+  ## technology for marking pages of memory as non-executable.
+  hasNxBitImpl
+
+proc hasFloat16c*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for F16C instructions, used for
+  ## converting 16-bit "half-precision" floating-point values to and from
+  ## single-precision floating-point values.
+  hasFloat16cImpl
+
+proc hasSha*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for SHA (Secure Hash Algorithm)
+  ## instructions.
+  hasShaImpl
+
+proc hasClflush*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `CLFLUSH` (Cache-line
+  ## Flush) instruction.
+  hasClflushImpl
+
+proc hasClflushOpt*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `CLFLUSHOPT` (Cache-line
+  ## Flush Optimized) instruction.
+  hasClflushOptImpl
+
+proc hasClwb*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `CLWB` (Cache-line Write
+  ## Back) instruction.
+  hasClwbImpl
+
+proc hasPrefetchWT1*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for the `PREFETCHWT1`
+  ## instruction.
+  hasPrefetchWT1Impl
+
+proc hasMpx*(): bool {.inline.} =
+  ## **(x86 Only)**
+  ##
+  ## Reports `true` if the hardware has support for MPX (Memory Protection
+  ## eXtensions).
+  hasMpxImpl
--- a/leopard/wrapper.nim
+++ b/leopard/wrapper.nim
@ -0,0 +1,289 @@
+## Copyright (c) 2017 Christopher A. Taylor.  All rights reserved.
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions are met:
+##
+## * Redistributions of source code must retain the above copyright notice,
+##   this list of conditions and the following disclaimer.
+## * Redistributions in binary form must reproduce the above copyright notice,
+##   this list of conditions and the following disclaimer in the documentation
+##   and/or other materials provided with the distribution.
+## * Neither the name of Leopard-RS nor the names of its contributors may be
+##   used to endorse or promote products derived from this software without
+##   specific prior written permission.
+##
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+## AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+## ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+## INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+## CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+## ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+## POSSIBILITY OF SUCH DAMAGE.
+
+
+## Leopard-RS
+## MDS Reed-Solomon Erasure Correction Codes for Large Data in C
+##
+## Algorithms are described in LeopardCommon.h
+##
+##
+## Inspired by discussion with:
+##
+## Sian-Jhen Lin <sjhenglin@gmail.com> : Author of {1} {3}, basis for Leopard
+## Bulat Ziganshin <bulat.ziganshin@gmail.com> : Author of FastECC
+## Yutaka Sawada <tenfon@outlook.jp> : Author of MultiPar
+##
+##
+## References:
+##
+## {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
+## "Novel Polynomial Basis with Fast Fourier Transform
+## and Its Application to Reed-Solomon Erasure Codes"
+## IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
+##
+## {2} D. G. Cantor, "On arithmetical algorithms over finite fields",
+## Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
+##
+## {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
+## Dispersal Algorithm for High Code Rate System over Fermat Fields,"
+## IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
+##
+## {4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
+## arithmetic using Intel SIMD instructions."  In: FAST-2013: 11th Usenix
+## Conference on File and Storage Technologies, San Jose, 2013
+
+
+import pkg/upraises
+push: {.upraises: [].}
+
+## -----------------------------------------------------------------------------
+## Build configuration
+
+import std/compilesettings
+import std/os
+import std/strutils
+
+# Build settings for the vendored Leopard-RS library. Everything marked
+# `strdefine` is a default that can be replaced at compile time with
+# `-d:Name=value`.
+const
+  # Arguments handed to `cmake`. macOS turns OpenMP off; Windows selects the
+  # "MSYS Makefiles" generator.
+  LeopardCmakeFlags {.strdefine.} =
+    when defined(macosx):
+      "-DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=off"
+    elif defined(windows):
+      "-G\"MSYS Makefiles\" -DCMAKE_BUILD_TYPE=Release"
+    else:
+      "-DCMAKE_BUILD_TYPE=Release"
+
+  # Leopard-RS source tree: `vendor/leopard`, two directories above this file.
+  LeopardDir {.strdefine.} =
+    joinPath(currentSourcePath.parentDir.parentDir, "vendor", "leopard")
+
+  # CMake build directory, kept inside the nimcache directory.
+  buildDir = joinPath(querySetting(nimcacheDir), "vendor_leopard")
+
+  # Header named in the `header` pragma of every binding.
+  LeopardHeader {.strdefine.} = "leopard.h"
+
+  # Static library expected in the build directory; its absence triggers the
+  # compile-time build.
+  LeopardLib {.strdefine.} = joinPath(buildDir, "liblibleopard.a")
+
+  # C compiler flags: the include path, plus `-fopenmp` everywhere except
+  # macOS.
+  LeopardCompilerFlags {.strdefine.} =
+    when defined(macosx):
+      "-I" & LeopardDir
+    else:
+      "-I" & LeopardDir & " -fopenmp"
+
+  # Linker flags: the static library, plus `-fopenmp` everywhere except macOS.
+  LeopardLinkerFlags {.strdefine.} =
+    when defined(macosx):
+      LeopardLib
+    else:
+      LeopardLib & " -fopenmp"
+
+  # Extra user-supplied flags appended to the ones above (empty by default).
+  LeopardExtraCompilerFlags {.strdefine.} = ""
+
+  LeopardExtraLinkerFlags {.strdefine.} = ""
+
+# Compile-time build step: unless `LeopardLib` already exists, run CMake and
+# make for the vendored sources inside `buildDir`. Define `LeopardRebuild` to
+# delete the build directory first and force a fresh build.
+# NOTE(review): paths are spliced into the shell commands unquoted, so
+# directories containing spaces would presumably break the build - confirm.
+static:
+  if defined(windows):
+    # On Windows every command goes through bash and paths are translated with
+    # `cygpath`.
+    func pathUnix2Win(path: string): string =
+      gorge("cygpath -w " & path.strip).strip
+
+    func pathWin2Unix(path: string): string =
+      gorge("cygpath " & path.strip).strip
+
+    # Runs the space-joined arguments via `bash -c` and returns the output.
+    proc bash(cmd: varargs[string]): string =
+      gorge(gorge("which bash").pathUnix2Win & " -c '" & cmd.join(" ") & "'")
+
+    # Same as `bash`, but also returns the exit code.
+    proc bashEx(cmd: varargs[string]): tuple[output: string, exitCode: int] =
+      gorgeEx(gorge("which bash").pathUnix2Win & " -c '" & cmd.join(" ") & "'")
+
+    let
+      buildDirUnix = buildDir.pathWin2Unix
+      leopardDirUnix = LeopardDir.pathWin2Unix
+    if defined(LeopardRebuild): discard bash("rm -rf", buildDirUnix)
+    # A failing `ls` means the library has not been built yet.
+    if (bashEx("ls", LeopardLib.pathWin2Unix)).exitCode != 0:
+      discard bash("mkdir -p", buildDirUnix)
+      let cmd =
+        @["cd", buildDirUnix, "&& cmake", leopardDirUnix, LeopardCmakeFlags,
+          "&& make"]
+      echo "\nBuilding Leopard-RS: " & cmd.join(" ")
+      let (output, exitCode) = bashEx cmd
+      echo output
+      if exitCode != 0:
+        # Remove the partial build so the next compilation starts from scratch.
+        discard bash("rm -rf", buildDirUnix)
+        raise (ref Defect)(msg: "Failed to build Leopard-RS")
+  else:
+    if defined(LeopardRebuild): discard gorge "rm -rf " & buildDir
+    # A failing `ls` means the library has not been built yet.
+    if gorgeEx("ls " & LeopardLib).exitCode != 0:
+      discard gorge "mkdir -p " & buildDir
+      let cmd =
+        "cd " & buildDir & " && cmake " & LeopardDir & " " & LeopardCmakeFlags &
+        " && make"
+      echo "\nBuilding Leopard-RS: " & cmd
+      let (output, exitCode) = gorgeEx cmd
+      echo output
+      if exitCode != 0:
+        # Remove the partial build so the next compilation starts from scratch.
+        discard gorge "rm -rf " & buildDir
+        raise (ref Defect)(msg: "Failed to build Leopard-RS")
+
+# Hand the include/link flags (plus any user-supplied extras) to the C compiler
+# and the linker.
+{.passC: LeopardCompilerFlags & " " & LeopardExtraCompilerFlags.}
+{.passL: LeopardLinkerFlags & " " & LeopardExtraLinkerFlags.}
+
+# Shorthand pragma shared by every binding in this module: cdecl calling
+# convention, declaration taken from `LeopardHeader`.
+{.pragma: leo, cdecl, header: LeopardHeader.}
+
+proc leoInit*(): cint {.leo, importcpp: "leo_init".}
+  ## Binding for `leo_init()`, which the `LeopardCallInitialize` result code
+  ## says has to be called first.
+  ##
+  ## NOTE(review): this is the only binding using `importcpp`; all others use
+  ## `importc`. Presumably intentional - confirm against `leopard.h`.
+
+## ------------------------------------------------------------------------------
+##  Shared Constants / Datatypes
+##  Results
+
+# TODO: For some reason it's only possible to use the enum with `ord`
+type
+  LeopardResult* = enum
+    ## Result codes of the Leopard-RS API: `LeopardSuccess` is 0 and every
+    ## error code is negative.
+    LeopardCallInitialize = -7, ##  Call leo_init() first
+    LeopardPlatform = -6,       ##  Platform is unsupported
+    LeopardInvalidInput = -5,   ##  A function parameter was invalid
+    LeopardInvalidCounts = -4,  ##  Invalid counts provided
+    LeopardInvalidSize = -3,    ##  Buffer size must be a multiple of 64 bytes
+    LeopardTooMuchData = -2,    ##  Buffer counts are too high
+    LeopardNeedMoreData = -1,   ##  Not enough recovery data received
+    LeopardSuccess = 0          ##  Operation succeeded
+
+
+##  Convert Leopard result to string
+
+proc leoResultString*(result: LeopardResult): cstring {.leo, importc: "leo_result_string".}
+  ## Binding for `leo_result_string()`: converts a Leopard result code into its
+  ## string form.
+## ------------------------------------------------------------------------------
+##  Encoder API
+##
+##     leo_encode_work_count()
+##
+##     Calculate the number of work_data buffers to provide to leo_encode().
+##
+##     The sum of original_count + recovery_count must not exceed 65536.
+##
+##     Returns the work_count value to pass into leo_encode().
+##     Returns 0 on invalid input.
+##
+
+proc leoEncodeWorkCount*(originalCount: cuint; recoveryCount: cuint): cuint
+  {.leo, importc: "leo_encode_work_count".}
+  ## Binding for `leo_encode_work_count()`: returns the number of work buffers
+  ## to pass to `leoEncode`, or 0 on invalid input.
+##
+##     leo_encode()
+##
+##     Generate recovery data.
+##
+##     original_count: Number of original_data[] buffers provided.
+##     recovery_count: Number of desired recovery data buffers.
+##     buffer_bytes:   Number of bytes in each data buffer.
+##     original_data:  Array of pointers to original data buffers.
+##     work_count:     Number of work_data[] buffers, from leo_encode_work_count().
+##     work_data:      Array of pointers to work data buffers.
+##
+##     The sum of original_count + recovery_count must not exceed 65536.
+##     The recovery_count <= original_count.
+##
+##     The buffer_bytes must be a multiple of 64.
+##     Each buffer should have the same number of bytes.
+##     Even the last piece must be rounded up to the block size.
+##
+##     Let buffer_bytes = The number of bytes in each buffer:
+##
+##         original_count = static_cast<unsigned>(
+##             ((uint64_t)total_bytes + buffer_bytes - 1) / buffer_bytes);
+##
+##     Or if the number of pieces is known:
+##
+##         buffer_bytes = static_cast<unsigned>(
+##             ((uint64_t)total_bytes + original_count - 1) / original_count);
+##
+##     Returns Leopard_Success on success.
+##  The first set of recovery_count buffers in work_data will be the result.
+##     Returns other values on errors.
+##
+
+proc leoEncode*(
+  bufferBytes: uint64;
+  originalCount: cuint;
+  recoveryCount: cuint;
+  workCount: cuint;
+  originalData: ptr pointer;
+  workData: ptr pointer): LeopardResult {.leo, importc: "leo_encode".}
+  ## Binding for `leo_encode()`: generates recovery data.
+  ##
+  ## * `bufferBytes`: Number of bytes in each data buffer
+  ## * `originalCount`: Number of original_data[] buffer pointers
+  ## * `recoveryCount`: Number of recovery_data[] buffer pointers
+  ## * `workCount`: Number of work_data[] buffer pointers, from
+  ##   leo_encode_work_count()
+  ## * `originalData`: Array of pointers to original data buffers
+  ## * `workData`: Array of work buffers
+
+## ------------------------------------------------------------------------------
+##  Decoder API
+##
+##     leo_decode_work_count()
+##
+##     Calculate the number of work_data buffers to provide to leo_decode().
+##
+##     The sum of original_count + recovery_count must not exceed 65536.
+##
+##     Returns the work_count value to pass into leo_decode().
+##     Returns 0 on invalid input.
+##
+
+proc leoDecodeWorkCount*(originalCount: cuint; recoveryCount: cuint): cuint
+  {.leo, importc: "leo_decode_work_count".}
+  ## Binding for `leo_decode_work_count()`: returns the number of work buffers
+  ## to pass to `leoDecode`, or 0 on invalid input.
+##
+##     leo_decode()
+##
+##     Decode original data from recovery data.
+##
+##     buffer_bytes:   Number of bytes in each data buffer.
+##     original_count: Number of original_data[] buffers provided.
+##     original_data:  Array of pointers to original data buffers.
+##     recovery_count: Number of recovery_data[] buffers provided.
+##     recovery_data:  Array of pointers to recovery data buffers.
+##     work_count:     Number of work_data[] buffers, from leo_decode_work_count().
+##     work_data:      Array of pointers to work data buffers.
+##
+##     Lost original/recovery data should be set to NULL.
+##
+##     The sum of recovery_count + the number of non-NULL original data must be at
+##     least original_count in order to perform recovery.
+##
+##     Returns Leopard_Success on success.
+##     Returns other values on errors.
+##
+
+proc leoDecode*(
+  bufferBytes: uint64;
+  originalCount: cuint;
+  recoveryCount: cuint;
+  workCount: cuint;
+  originalData: ptr pointer;
+  recoveryData: ptr pointer;
+  workData: ptr pointer): LeopardResult {.leo, importc: "leo_decode".}
+  ## Binding for `leo_decode()`: decodes original data from recovery data.
+  ##
+  ## * `bufferBytes`: Number of bytes in each data buffer
+  ## * `originalCount`: Number of original_data[] buffer pointers
+  ## * `recoveryCount`: Number of recovery_data[] buffer pointers
+  ## * `workCount`: Number of buffer pointers in work_data[]
+  ## * `originalData`: Array of original data buffers
+  ## * `recoveryData`: Array of recovery data buffers
+  ## * `workData`: Array of work data buffers
--- a/tests/helpers.nim
+++ b/tests/helpers.nim
@ -0,0 +1,106 @@
+import std/random
+
+import pkg/stew/results
+import ../leopard
+
+proc randomCRCPacket*(data: var openArray[byte]) =
+  ## Fills `data` in place with a random, self-checking packet that
+  ## `checkCRCPacket` can validate later.
+  ##
+  ## * Packets shorter than 16 bytes hold one random byte value repeated in
+  ##   every position.
+  ## * Longer packets are laid out as: bytes 0..3 = packet length,
+  ##   bytes 4..7 = rolling checksum, bytes 8.. = random payload covered by
+  ##   the checksum. Both header fields are `uint32` values written with
+  ##   `copyMem`, i.e. in the byte order of the running machine.
+  ##
+  ## An empty `data` is left untouched.
+  if data.len == 0:
+    return
+
+  if data.len < 16:
+    # Draw from the full byte range instead of `0..data.len`, so no
+    # out-of-range value ever has to be squeezed into a byte.
+    let fill = rand(high(byte).int).byte
+    for b in data.mitems:
+      b = fill
+    return
+
+  let size = data.len.uint32
+  copyMem(addr data[0], unsafeAddr size, sizeof(size))
+
+  var crc = size
+  for i in 8..<data.len:
+    let v = rand(high(byte).int).byte
+    data[i] = v
+    # Rotate left by 3 bits, then mix in the payload byte. The two shifted
+    # halves never share a set bit, so they must be combined with `or`;
+    # `and` would always give 0 and reduce the checksum to the last byte.
+    crc = (crc shl 3) or (crc shr (32 - 3))
+    crc += v.uint32
+
+  copyMem(addr data[4], unsafeAddr crc, sizeof(crc))
+
+proc checkCRCPacket*(data: openArray[byte]): bool =
+  ## Returns `true` when `data` is a packet as written by `randomCRCPacket`
+  ## and `false` otherwise.
+  ##
+  ## * Packets shorter than 16 bytes are valid when every byte equals the
+  ##   first one (an empty packet is trivially valid).
+  ## * Longer packets are valid when the length stored in bytes 0..3 equals
+  ##   `data.len` and the checksum stored in bytes 4..7 equals the rolling
+  ##   checksum recomputed over bytes 8.. .
+  ##
+  ## Every mismatch is reported through the return value; nothing is raised.
+  if data.len < 16:
+    for i in 1..<data.len:
+      if data[i] != data[0]:
+        return false
+    return true
+
+  var packSize: uint32
+  copyMem(addr packSize, unsafeAddr data[0], sizeof(packSize))
+  if packSize != data.len.uint32:
+    return false
+
+  # Recompute the checksum over the payload only (offset 8 onwards), exactly
+  # mirroring the loop in `randomCRCPacket`.
+  var crc = data.len.uint32
+  for i in 8..<data.len:
+    crc = (crc shl 3) or (crc shr (32 - 3))
+    crc += data[i].uint32
+
+  var packCrc: uint32
+  copyMem(addr packCrc, unsafeAddr data[4], sizeof(packCrc))
+
+  return packCrc == crc
+
+proc dropRandomIdx*(bufs: var openArray[seq[byte]], dropCount: int) =
+  ## Simulates loss by truncating `dropCount` distinct, randomly chosen
+  ## entries of `bufs` to zero length.
+  ##
+  ## A `dropCount` of zero or less leaves `bufs` unchanged. Asking for more
+  ## drops than there are buffers is a programming error and fails an
+  ## assertion (the previous rejection-sampling loop would spin forever).
+  doAssert dropCount <= bufs.len,
+    "cannot drop more buffers than are available"
+
+  if dropCount <= 0:
+    return
+
+  # Shuffle the index list once and take a prefix: every index is picked at
+  # most once, without retrying on duplicates.
+  var order = newSeq[int](bufs.len)
+  for i in 0..<order.len:
+    order[i] = i
+  shuffle(order)
+
+  for k in 0..<dropCount:
+    bufs[order[k]].setLen(0)
+
+proc testPackets*(
+  buffers,
+  parity,
+  bufSize,
+  dataLosses: int,
+  parityLosses: int,
+  encoder: var LeoEncoder,
+  decoder: var LeoDecoder): Result[void, cstring] =
+  ## Runs one encode / lose / decode round trip and verifies the result.
+  ##
+  ## * `buffers`      - number of data buffers to generate
+  ## * `parity`       - number of parity buffers to generate
+  ## * `bufSize`      - size in bytes of every buffer
+  ## * `dataLosses`   - how many data buffers to drop before decoding
+  ## * `parityLosses` - how many parity buffers to drop before decoding
+  ## * `encoder`, `decoder` - codec instances used for the round trip
+  ##
+  ## Every data buffer is filled by `randomCRCPacket`; after decoding, each
+  ## dropped data buffer's recovered counterpart is validated with
+  ## `checkCRCPacket`. Returns `ok()` when all of them pass and an error
+  ## otherwise. Encode/decode failures surface through `tryGet()`.
+
+  var
+    dataBuf = newSeqOfCap[seq[byte]](buffers)
+    parityBuf = newSeqOfCap[seq[byte]](parity)
+    recoveredBuf = newSeqOfCap[seq[byte]](buffers)
+
+  for _ in 0..<buffers:
+    var
+      dataSeq = newSeq[byte](bufSize)
+
+    randomCRCPacket(dataSeq)
+    dataBuf.add(dataSeq)
+
+    recoveredBuf.add(newSeq[byte](bufSize))
+
+  for _ in 0..<parity:
+    parityBuf.add(newSeq[byte](bufSize))
+
+  encoder.encode(dataBuf, parityBuf).tryGet()
+
+  if dataLosses > 0:
+    dropRandomIdx(dataBuf, dataLosses)
+
+  if parityLosses > 0:
+    dropRandomIdx(parityBuf, parityLosses)
+
+  decoder.decode(dataBuf, parityBuf, recoveredBuf).tryGet()
+
+  for i in 0..<dataBuf.len:
+    # Only buffers that were dropped (now empty) have to be recovered.
+    if dataBuf[i].len == 0 and not checkCRCPacket(recoveredBuf[i]):
+      # A cstring built from a temporary string would dangle as soon as this
+      # proc returns, so the error carries a string literal and the packet
+      # index is reported separately.
+      stderr.writeLine("Check failed for packet " & $i)
+      return err("Check failed for packet".cstring)
+
+  ok()
--- a/tests/test_leopard.nim
+++ b/tests/test_leopard.nim
--- a/tests/testleopard.nim
+++ b/tests/testleopard.nim
@ -0,0 +1,329 @@
+import std/random
+import std/sets
+
+import pkg/unittest2
+import pkg/stew/results
+
+import ../leopard
+import ./helpers
+
+# Seed the default std/random generator used by the test helpers. No fixed
+# seed is passed, so generated packets and dropped indices vary between runs.
+randomize()
+
+# Argument validation of LeoEncoder/LeoDecoder construction and of the
+# encode/decode entry points. Each test has a unique name, and every codec
+# that was successfully created is released again, as in the other suites.
+suite "Leopard Parametrization":
+  test "Should not allow invalid buffer multiples":
+    check:
+      LeoEncoder.init(63, 4, 2).error == "bufSize should be multiples of 64 bytes!"
+      LeoEncoder.init(65, 4, 2).error == "bufSize should be multiples of 64 bytes!"
+
+  test "Should not allow invalid data/parity buffer counts":
+    check:
+      LeoEncoder.init(64, 1, 2).error ==
+      "number of parity buffers cannot exceed number of data buffers!"
+
+  test "Should not allow data + parity to exceed 65536":
+    check:
+      LeoEncoder.init(64, 65536 + 1, 0).error ==
+      "number of parity and data buffers cannot exceed 65536!"
+
+      LeoEncoder.init(64, 32768 + 1, 32768).error ==
+      "number of parity and data buffers cannot exceed 65536!"
+
+  test "Should not allow encoding with invalid data buffer counts":
+    var
+      leo = LeoEncoder.init(64, 4, 2).tryGet()
+      data = newSeq[seq[byte]](3)
+      parity = newSeq[seq[byte]](2)
+
+    try:
+      check:
+        leo.encode(data, parity).error == "Number of data buffers should match!"
+    finally:
+      leo.free()
+
+  test "Should not allow encoding with invalid parity buffer counts":
+    var
+      leo = LeoEncoder.init(64, 4, 2).tryGet()
+      data = newSeq[seq[byte]](4)
+      parity = newSeq[seq[byte]](3)
+
+    try:
+      check:
+        leo.encode(data, parity).error == "Number of parity buffers should match!"
+    finally:
+      leo.free()
+
+  test "Should not allow decoding with invalid data buffer counts":
+    var
+      leo = LeoDecoder.init(64, 4, 2).tryGet()
+      data = newSeq[seq[byte]](3)
+      parity = newSeq[seq[byte]](2)
+      recovered = newSeq[seq[byte]](3)
+
+    try:
+      check:
+        leo.decode(data, parity, recovered).error == "Number of data buffers should match!"
+    finally:
+      leo.free()
+
+  test "Should not allow decoding with invalid parity buffer counts":
+    var
+      leo = LeoDecoder.init(64, 4, 2).tryGet()
+      data = newSeq[seq[byte]](4)
+      parity = newSeq[seq[byte]](1)
+      recovered = newSeq[seq[byte]](3)
+
+    try:
+      check:
+        leo.decode(data, parity, recovered).error == "Number of parity buffers should match!"
+    finally:
+      leo.free()
+
+  test "Should not allow decoding with invalid recovered buffer counts":
+    var
+      leo = LeoDecoder.init(64, 4, 2).tryGet()
+      data = newSeq[seq[byte]](4)
+      parity = newSeq[seq[byte]](2)
+      recovered = newSeq[seq[byte]](3)
+
+    try:
+      check:
+        leo.decode(data, parity, recovered).error == "Number of recovered buffers should match buffers!"
+    finally:
+      leo.free()
+
+
+# Small hand-checkable round trips: four 64-byte data buffers carrying a
+# short text payload, protected by two parity buffers.
+suite "Leopard simple Encode/Decode":
+  const
+    TestString = "Hello World!"
+    DataCount = 4
+    ParityCount = 2
+    BufferSize = 64
+
+  var
+    encoder: LeoEncoder
+    decoder: LeoDecoder
+    data: seq[seq[byte]]
+    parity: seq[seq[byte]]
+    recovered: seq[seq[byte]]
+
+  setup:
+    # Fresh codecs and empty buffer tables for every test.
+    encoder = LeoEncoder.init(BufferSize, DataCount, ParityCount).tryGet()
+    decoder = LeoDecoder.init(BufferSize, DataCount, ParityCount).tryGet()
+    data = newSeq[seq[byte]](DataCount)
+    parity = newSeq[seq[byte]](ParityCount)
+    recovered = newSeq[seq[byte]](DataCount)
+
+  teardown:
+    encoder.free()
+    decoder.free()
+
+  test "Test 2 data loses out of 4 possible":
+    # Give every data buffer a distinct, recognisable payload.
+    for idx in 0..<DataCount:
+      var payload = TestString & " " & $idx
+
+      data[idx] = newSeq[byte](BufferSize)
+      recovered[idx] = newSeq[byte](BufferSize)
+      copyMem(addr data[idx][0], addr payload[0], payload.len)
+
+    for slot in parity.mitems:
+      slot = newSeq[byte](BufferSize)
+
+    encoder.encode(data, parity).tryGet()
+
+    # Remember the originals, then mark the first two data buffers as lost.
+    let
+      expectedFirst = data[0]
+      expectedSecond = data[1]
+
+    data[0].setLen(0)
+    data[1].setLen(0)
+
+    decoder.decode(data, parity, recovered).tryGet()
+
+    check:
+      recovered[0] == expectedFirst
+      recovered[1] == expectedSecond
+
+  test "Test 1 data and 1 parity loss out of 4 possible":
+    # Give every data buffer a distinct, recognisable payload.
+    for idx in 0..<DataCount:
+      var payload = TestString & " " & $idx
+
+      data[idx] = newSeq[byte](BufferSize)
+      recovered[idx] = newSeq[byte](BufferSize)
+      copyMem(addr data[idx][0], addr payload[0], payload.len)
+
+    for slot in parity.mitems:
+      slot = newSeq[byte](BufferSize)
+
+    encoder.encode(data, parity).tryGet()
+
+    # Lose one data buffer and one parity buffer.
+    let expectedFirst = data[0]
+
+    data[0].setLen(0)
+    parity[0].setLen(0)
+
+    decoder.decode(data, parity, recovered).tryGet()
+
+    check:
+      recovered[0] == expectedFirst
+
+
+# Large randomized round trips. Every scenario builds a fresh encoder and
+# decoder, runs `testPackets` with the given loss counts and releases both
+# codecs afterwards, whether or not the round trip succeeded.
+suite "Leopard Encode/Decode":
+  proc roundTrip(bufSize, buffers, parity, dataLosses, parityLosses: int) =
+    ## One encode / drop / decode / verify cycle with freshly created codecs.
+    ## Raises (through `tryGet`) when construction or the round trip fails.
+    var
+      encoder: LeoEncoder
+      decoder: LeoDecoder
+
+    try:
+      encoder = LeoEncoder.init(bufSize, buffers, parity).tryGet()
+      decoder = LeoDecoder.init(bufSize, buffers, parity).tryGet()
+      testPackets(
+        buffers, parity, bufSize,
+        dataLosses, parityLosses,
+        encoder, decoder).tryGet()
+    finally:
+      encoder.free()
+      decoder.free()
+
+  test "bufSize = 4096, K = 800, M = 200 - drop data = 200 data":
+    roundTrip(bufSize = 4096, buffers = 800, parity = 200,
+              dataLosses = 200, parityLosses = 0)
+
+  test "bufSize = 4096, K = 800, M = 200 - drop parity = 200":
+    # Previously the parity loss count was passed in the data-loss position,
+    # so this scenario dropped data buffers instead of parity buffers.
+    roundTrip(bufSize = 4096, buffers = 800, parity = 200,
+              dataLosses = 0, parityLosses = 200)
+
+  test "bufSize = 4096, K = 800, M = 200 - drop data = 100, drop parity = 100":
+    roundTrip(bufSize = 4096, buffers = 800, parity = 200,
+              dataLosses = 100, parityLosses = 100)
+
+  test "bufSize = 4096, K = 8000, M = 2000 - drop data = 2000":
+    roundTrip(bufSize = 4096, buffers = 8000, parity = 2000,
+              dataLosses = 2000, parityLosses = 0)
+
+  test "bufSize = 4096, K = 8000, M = 2000 - drop parity = 2000":
+    roundTrip(bufSize = 4096, buffers = 8000, parity = 2000,
+              dataLosses = 0, parityLosses = 2000)
+
+  test "bufSize = 4096, K = 8000, M = 2000 - drop data = 1000, parity = 1000":
+    roundTrip(bufSize = 4096, buffers = 8000, parity = 2000,
+              dataLosses = 1000, parityLosses = 1000)
+
+  test "bufSize = 4096, K = 8000, M = 8000 - drop data = 8000":
+    roundTrip(bufSize = 4096, buffers = 8000, parity = 8000,
+              dataLosses = 8000, parityLosses = 0)
+
+  test "bufSize = 4096, K = 8000, M = 8000 - drop parity = 8000":
+    roundTrip(bufSize = 4096, buffers = 8000, parity = 8000,
+              dataLosses = 0, parityLosses = 8000)
+
+  test "bufSize = 4096, K = 8000, M = 8000 - drop data = 4000, parity = 4000":
+    roundTrip(bufSize = 4096, buffers = 8000, parity = 8000,
+              dataLosses = 4000, parityLosses = 4000)
+
+
+# Reuses a single encoder/decoder pair across a series of round trips with
+# a growing number of lost data and parity buffers (0, 40, ..., 400 each).
+suite "Leopard use same encoder/decoder multiple times":
+  var
+    encoder: LeoEncoder
+    decoder: LeoDecoder
+
+  try:
+    encoder = LeoEncoder.init(4096, 800, 800).tryGet()
+    decoder = LeoDecoder.init(4096, 800, 800).tryGet()
+
+    for step in 0..10:
+      let lost = 40 * step
+      test "Encode/Decode using same encoder/decoder - lost data = " & $lost & " lost parity = " & $lost:
+        testPackets(800, 800, 4096, lost, lost, encoder, decoder).tryGet()
+  finally:
+    # Both codecs are released once, after the last round trip.
+    encoder.free()
+    decoder.free()