Add a threadpool (#213)

* Implement a threadpool

* int and SomeUnsignedInt ...

* Type conversion for windows SynchronizationBarrier

* Use the latest MacOS 11, Big Sur API (jan 2021) for MacOS futexes, Github action offers MacOS 12 and can test them

* bench need posix timer not available on windows and darwin futex

* Windows: nimble exec empty line is an error, Mac: use defined(osx) instead of defined(macos)

* file rename

* okay, that's the last one hopefully

* deactivate stealHalf for now
This commit is contained in:
Mamy Ratsimbazafy 2023-01-24 02:32:28 +01:00 committed by GitHub
parent 188f3e710c
commit 2931913b67
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
48 changed files with 4647 additions and 106 deletions

View File

@ -43,7 +43,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
# ----------------------------------------------------------
("tests/math/t_primitives.nim", false),
("tests/math/t_primitives_extended_precision.nim", false),
# Big ints
# ----------------------------------------------------------
("tests/math/t_io_bigints.nim", false),
@ -53,7 +53,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_bigints_mod_vs_gmp.nim", true),
("tests/math/t_bigints_mul_vs_gmp.nim", true),
("tests/math/t_bigints_mul_high_words_vs_gmp.nim", true),
# Field
# ----------------------------------------------------------
("tests/math/t_io_fields", false),
@ -64,11 +64,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_finite_fields_powinv.nim", false),
("tests/math/t_finite_fields_vs_gmp.nim", true),
# ("tests/math/t_fp_cubic_root.nim", false),
# Double-precision finite fields
# ----------------------------------------------------------
("tests/math/t_finite_fields_double_precision.nim", false),
# Towers of extension fields
# ----------------------------------------------------------
# ("tests/math/t_fp2.nim", false),
@ -90,7 +90,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_fp6_frobenius.nim", false),
("tests/math/t_fp12_frobenius.nim", false),
# Elliptic curve arithmetic
# Elliptic curve arithmetic
# ----------------------------------------------------------
("tests/math/t_ec_conversion.nim", false),
@ -111,7 +111,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_ec_twedwards_prj_add_double", false),
("tests/math/t_ec_twedwards_prj_mul_sanity", false),
("tests/math/t_ec_twedwards_prj_mul_distri", false),
# Elliptic curve arithmetic G2
# ----------------------------------------------------------
@ -172,7 +172,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_ec_sage_bls12_381.nim", false),
("tests/math/t_ec_sage_pallas.nim", false),
("tests/math/t_ec_sage_vesta.nim", false),
# Edge cases highlighted by past bugs
# ----------------------------------------------------------
("tests/math/t_ec_shortw_prj_edge_cases.nim", false),
@ -233,6 +233,18 @@ const testDescNvidia: seq[string] = @[
"tests/gpu/t_nvidia_fp.nim",
]
const testDescThreadpool: seq[string] = @[
"constantine/platforms/threadpool/examples/e01_simple_tasks.nim",
"constantine/platforms/threadpool/examples/e02_parallel_pi.nim",
# "constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim", # Need timing not implemented on Windows
"constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim",
"constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim",
"constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim",
# "constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim",
"constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim",
# "constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim", # Need timing not implemented on Windows
]
const benchDesc = [
"bench_fp",
"bench_fp_double_precision",
@ -301,7 +313,7 @@ proc clearParallelBuild() =
if fileExists(buildParallel):
rmFile(buildParallel)
template setupCommand(): untyped {.dirty.} =
template setupCommand(): untyped {.dirty.} =
var lang = "c"
if existsEnv"TEST_LANG":
lang = getEnv"TEST_LANG"
@ -367,7 +379,7 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
if not dirExists "build":
mkDir "build"
echo "Found " & $testDesc.len & " tests to run."
for td in testDesc:
if not(td.useGMP and not requireGMP):
var flags = ""
@ -379,17 +391,25 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
flags &= " -d:debugConstantine"
if td.path notin skipSanitizers:
flags &= sanitizers
cmdFile.testBatch(flags, td.path)
proc addTestSetNvidia(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescNvidia.len & " tests to run."
for path in testDescNvidia:
cmdFile.testBatch(flags = "", path)
proc addTestSetThreadpool(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescThreadpool.len & " tests to run."
for path in testDescThreadpool:
cmdFile.testBatch(flags = "--threads:on --linetrace:on", path)
proc addBenchSet(cmdFile: var string, useAsm = true) =
if not dirExists "build":
mkDir "build"
@ -491,13 +511,13 @@ task test_bindings, "Test C bindings":
# Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in an POSIX compatible shell
exec "gcc -Ibindings/generated -Lbindings/generated -o build/testsuite/t_libctt_bls12_381_dl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -lconstantine_bls12_381"
exec "./build/testsuite/t_libctt_bls12_381_dl.exe"
echo "--> Testing statically linked library"
when not defined(windows):
# Beware MacOS annoying linker with regards to static libraries
# The following standard way cannot be used on MacOS
# exec "gcc -Ibindings/generated -Lbindings/generated -o build/t_libctt_bls12_381_sl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
exec "gcc -Ibindings/generated -o build/testsuite/t_libctt_bls12_381_sl tests/bindings/t_libctt_bls12_381.c bindings/generated/libconstantine_bls12_381.a -lgmp"
exec "./build/testsuite/t_libctt_bls12_381_sl"
else:
@ -509,32 +529,40 @@ task test, "Run all tests":
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_no_asm, "Run all tests (no assembly)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_no_gmp, "Run tests that don't require GMP":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel, "Run all tests in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
@ -547,6 +575,13 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
clearParallelBuild()
@ -558,6 +593,13 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
clearParallelBuild()
@ -569,6 +611,13 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
clearParallelBuild()
@ -580,11 +629,26 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)"
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_threadpool, "Run all tests for the builtin threadpool":
var cmdFile: string
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_nvidia, "Run all tests for Nvidia GPUs":
var cmdFile: string
cmdFile.addTestSetNvidia()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
# Finite field 𝔽p
# ------------------------------------------

View File

@ -356,7 +356,7 @@ func div10*(a: var Limbs): SecretWord =
## Divide `a` by 10 in-place and return the remainder
result = Zero
const clz = WordBitWidth - 1 - log2_vartime(10)
const clz = WordBitWidth - 1 - log2_vartime(10'u32)
const norm10 = SecretWord(10) shl clz
for i in countdown(a.len-1, 0):

View File

@ -61,7 +61,7 @@ func invModBitwidth(a: BaseType): BaseType =
# which grows in O(log(log(a)))
debug: doAssert (a and 1) == 1, "a must be odd"
const k = log2_vartime(a.sizeof() * 8)
const k = log2_vartime(a.sizeof().uint32 * 8)
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
result *= 2 - a * result # x' = x(2 - ax)

View File

@ -282,7 +282,7 @@ func invModBitwidth*[T: SomeUnsignedInt](a: T): T =
# which grows in O(log(log(a)))
checkOdd(a)
let k = log2_vartime(T.sizeof() * 8)
let k = log2_vartime(T.sizeof().uint32 * 8)
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
result *= 2 - a * result # x' = x(2 - ax)

View File

@ -36,7 +36,7 @@ debug:
result[0] = '0'
result[1] = 'x'
var a = a
for j in countdown(result.len-1, 2):
for j in countdown(result.len-1, 0):
result[j] = hexChars.secretLookup(a and SecretWord 0xF)
a = a shr 4

View File

@ -291,8 +291,8 @@ func sum_batch_vartime*[F; G: static Subgroup](
const maxStride = maxChunkSize div sizeof(ECP_ShortW_Aff[F, G])
let n = min(maxStride, points.len)
let accumulators = alloca(ECP_ShortW_Aff[F, G], n)
let lambdas = alloca(tuple[num, den: F], n)
let accumulators = allocStackArray(ECP_ShortW_Aff[F, G], n)
let lambdas = allocStackArray(tuple[num, den: F], n)
for i in countup(0, points.len-1, maxStride):
let n = min(maxStride, points.len - i)

View File

@ -6,9 +6,23 @@ This folder holds:
to have the compiler enforce proper usage
- extended precision multiplication and division primitives
- assembly or builtin int128 primitives
- intrinsics
- an assembler
- Runtime CPU features detection
- SIMD intrinsics
- assemblers for x86 and LLVM IR
- a code generator for Nvidia GPU from LLVM IR
- runtime CPU features detection
- a threadpool
## Runtimes
Constantine strongly avoids runtime dependencies so that it can be used even where garbage collection and dynamic memory allocation
are not allowed. That also avoids secrets remaining in heap memory.
At runtime, Constantine may:
- detect the CPU features at the start of the application (in Nim) or after calling `ctt_myprotocol_init_NimMain()` for the C (or any other language) bindings.
It also offers the following opt-in features, which use dynamic allocation:
- a threadpool, only for explicitly tagged parallel primitives.
- use LLVM and Cuda, and configure code to run computation on GPUs.
## Security

View File

@ -6,6 +6,9 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises:[].} # No exceptions for crypto
{.push checks:off.} # No int->size_t exceptions
# ############################################################
#
# Allocators
@ -20,13 +23,107 @@
#
# stack allocation is strongly preferred where necessary.
# Bindings
# ----------------------------------------------------------------------------------
# We wrap them with int instead of size_t / csize_t
when defined(windows):
proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
proc alloca(size: int): pointer {.header: "<alloca.h>".}
template alloca*(T: typedesc): ptr T =
proc malloc(size: int): pointer {.sideeffect, header: "<stdlib.h>".}
proc free(p: pointer) {.sideeffect, header: "<stdlib.h>".}
when defined(windows):
proc aligned_alloc_windows(size, alignment: int): pointer {.sideeffect,importc:"_aligned_malloc", header:"<malloc.h>".}
# Beware of the arg order!
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
aligned_alloc_windows(size, alignment)
proc aligned_free(p: pointer){.sideeffect,importc:"_aligned_free", header:"<malloc.h>".}
elif defined(osx):
proc posix_memalign(mem: var pointer, alignment, size: int){.sideeffect,importc, header:"<stdlib.h>".}
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
posix_memalign(result, alignment, size)
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
else:
proc aligned_alloc(alignment, size: int): pointer {.sideeffect,importc, header:"<stdlib.h>".}
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
# Helpers
# ----------------------------------------------------------------------------------
proc isPowerOfTwo(n: int): bool {.inline.} =
(n and (n - 1)) == 0 and (n != 0)
func roundNextMultipleOf(x: int, n: static int): int {.inline.} =
## Round the input to the next multiple of "n"
when n.isPowerOfTwo():
# n is a power of 2. (If compiler cannot prove that x>0 it does not make the optim)
result = (x + n - 1) and not(n - 1)
else:
result = ((x + n - 1) div n) * n
# Stack allocation
# ----------------------------------------------------------------------------------
template allocStack*(T: typedesc): ptr T =
cast[ptr T](alloca(sizeof(T)))
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
template allocStackUnchecked*(T: typedesc, size: int): ptr T =
## Stack allocation for types containing a variable-sized UncheckedArray field
cast[ptr T](alloca(size))
template allocStackArray*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
# Heap allocation
# ----------------------------------------------------------------------------------
proc allocHeap*(T: typedesc): ptr T {.inline.} =
cast[type result](malloc(sizeof(T)))
proc allocHeapUnchecked*(T: typedesc, size: int): ptr T {.inline.} =
## Heap allocation for types containing a variable-sized UncheckedArray field
cast[type result](malloc(size))
proc allocHeapArray*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
cast[type result](malloc(len*sizeof(T)))
proc freeHeap*(p: pointer) {.inline.} =
free(p)
proc allocHeapAligned*(T: typedesc, alignment: static Natural): ptr T {.inline.} =
# aligned_alloc requires allocating in multiple of the alignment.
const
size = sizeof(T)
requiredMem = size.roundNextMultipleOf(alignment)
cast[ptr T](aligned_alloc(alignment, requiredMem))
proc allocHeapUncheckedAligned*(T: typedesc, size: int, alignment: static Natural): ptr T {.inline.} =
## Aligned heap allocation for types containing a variable-sized UncheckedArray field
## or an importc type with missing size information
# aligned_alloc requires allocating in multiple of the alignment.
let requiredMem = size.roundNextMultipleOf(alignment)
cast[ptr T](aligned_alloc(alignment, requiredMem))
proc allocHeapArrayAligned*(T: typedesc, len: int, alignment: static Natural): ptr UncheckedArray[T] {.inline.} =
# aligned_alloc requires allocating in multiple of the alignment.
let
size = sizeof(T) * len
requiredMem = size.roundNextMultipleOf(alignment)
cast[ptr UncheckedArray[T]](aligned_alloc(alignment, requiredMem))
proc allocHeapAlignedPtr*(T: typedesc[ptr], alignment: static Natural): T {.inline.} =
allocHeapAligned(typeof(default(T)[]), alignment)
proc allocHeapUncheckedAlignedPtr*(T: typedesc[ptr], size: int, alignment: static Natural): T {.inline.} =
## Aligned heap allocation for types containing a variable-sized UncheckedArray field
## or an importc type with missing size information
allocHeapUncheckedAligned(typeof(default(T)[]), size, alignment)
proc freeHeapAligned*(p: pointer) {.inline.} =
aligned_free(p)

View File

@ -32,7 +32,10 @@ import ./compilers/bitops
#
# See: https://www.chessprogramming.org/BitScan
# https://www.chessprogramming.org/General_Setwise_Operations
# https://www.chessprogramming.org/De_Bruijn_Sequence_Generator
# and https://graphics.stanford.edu/%7Eseander/bithacks.html
# and Hacker's Delight 2nd Edition, Henry S Warren, Jr.
# and https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator
# for compendiums of bit manipulation
func clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} =
@ -43,77 +46,106 @@ func clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} =
## Returns ``v``, with the bit at position ``bit`` set to 0
v.clearMask(1.T shl bit)
func log2impl_vartime(x: uint32): uint32 =
func log2_impl_vartime(n: uint32): uint32 =
## Find the log base 2 of a 32-bit or less integer.
## using De Bruijn multiplication
## Works at compile-time.
## ⚠️ not constant-time, table accesses are not uniform.
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
var v = x
v = v or v shr 1 # first round down to one less than a power of 2
v = v or v shr 2
v = v or v shr 4
v = v or v shr 8
v = v or v shr 16
lookup[(v * 0x07C4ACDD'u32) shr 27]
const lookup: array[32, uint8] = [
uint8 0, 9, 1, 10, 13, 21, 2, 29,
11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7,
19, 27, 23, 6, 26, 5, 4, 31]
# Isolate MSB
var n = n
n = n or n shr 1 # first round down to one less than a power of 2
n = n or n shr 2
n = n or n shr 4
n = n or n shr 8
n = n or n shr 16
uint32 lookup[(n * 0x07C4ACDD'u32) shr 27]
func log2impl_vartime(x: uint64): uint64 {.inline.} =
func log2_impl_vartime(n: uint64): uint64 {.inline.} =
## Find the log base 2 of a 32-bit or less integer.
## using De Bruijn multiplication
## Works at compile-time.
## ⚠️ not constant-time, table accesses are not uniform.
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
var v = x
v = v or v shr 1 # first round down to one less than a power of 2
v = v or v shr 2
v = v or v shr 4
v = v or v shr 8
v = v or v shr 16
v = v or v shr 32
lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
# https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers
const lookup: array[64, uint8] = [
uint8 0, 58, 1, 59, 47, 53, 2, 60,
39, 48, 27, 54, 33, 42, 3, 61,
51, 37, 40, 49, 18, 28, 20, 55,
30, 34, 11, 43, 14, 22, 4, 62,
57, 46, 52, 38, 26, 32, 41, 50,
36, 17, 19, 29, 10, 13, 21, 56,
45, 25, 31, 35, 16, 9, 12, 44,
24, 15, 8, 23, 7, 6, 5, 63]
# Isolate MSB
var n = n
n = n or n shr 1 # first round down to one less than a power of 2
n = n or n shr 2
n = n or n shr 4
n = n or n shr 8
n = n or n shr 16
n = n or n shr 32
uint64 lookup[(n * 0x03F6EAF2CD271461'u64) shr 58]
func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} =
## Find the log base 2 of an integer
##
## ⚠ With GCC and Clang compilers on x86, if n is zero, result is undefined.
when nimvm:
when sizeof(T) == sizeof(uint64):
T(log2impl_vartime(uint64(n)))
when sizeof(T) == 8:
T(log2_impl_vartime(uint64(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2impl_vartime(uint32(n)))
T(log2_impl_vartime(uint32(n)))
else:
when sizeof(T) == sizeof(uint64):
T(log2_c_compiler_vartime(uint64(n)))
log2_c_compiler_vartime(n)
func ctz_impl_vartime(n: uint32): uint32 =
## Find the number of trailing zero bits
## Requires n != 0
# https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator
const lookup: array[32, uint8] = [
uint8 0, 1, 16, 2, 29, 17, 3, 22,
30, 20, 18, 11, 13, 4, 7, 23,
31, 15, 28, 21, 19, 10, 12, 6,
14, 27, 9, 5, 26, 8, 25, 24]
let isolateLSB = n xor (n-1)
uint32 lookup[(isolateLSB * 0x6EB14F9'u32) shr 27]
func ctz_impl_vartime(n: uint64): uint64 =
## Find the number of trailing zero bits
## Requires n != 0
# https://www.chessprogramming.org/BitScan#Bitscan_forward
const lookup: array[64, uint8] = [
uint8 0, 47, 1, 56, 48, 27, 2, 60,
57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44,
38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53,
34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24,
13, 18, 8, 12, 7, 6, 5, 63]
let isolateLSB = n xor (n-1)
uint64 lookup[(isolateLSB * 0x03f79d71b4cb0a89'u64) shr 58]
func countTrailingZeroBits*[T: SomeUnsignedInt](n: T): T {.inline.} =
## Count the number of trailing zero bits of an integer
when nimvm:
if n == 0:
T(sizeof(n) * 8)
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2_c_compiler_vartime(uint32(n)))
func hammingWeight*(x: uint32): uint {.inline.} =
## Counts the set bits in integer.
# https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
var v = x
v = v - ((v shr 1) and 0x55555555)
v = (v and 0x33333333) + ((v shr 2) and 0x33333333)
uint(((v + (v shr 4) and 0xF0F0F0F) * 0x1010101) shr 24)
func hammingWeight*(x: uint64): uint {.inline.} =
## Counts the set bits in integer.
# https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
var v = x
v = v - ((v shr 1'u64) and 0x5555555555555555'u64)
v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64)
v = (v + (v shr 4'u64) and 0x0F0F0F0F0F0F0F0F'u64)
uint((v * 0x0101010101010101'u64) shr 56'u64)
func countLeadingZeros_vartime*[T: SomeUnsignedInt](x: T): T {.inline.} =
(8*sizeof(T)) - 1 - log2_vartime(x)
when sizeof(T) == 8:
T(ctz_impl_vartime(uint64(n)))
else:
T(ctz_impl_vartime(uint32(n)))
else:
ctz_c_compiler_vartime(n)
func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
## Returns true if n is a power of 2
@ -121,7 +153,7 @@ func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
## for compile-time or explicit vartime proc only.
(n and (n - 1)) == 0
func nextPowerOf2_vartime*(n: uint64): uint64 {.inline.} =
func nextPowerOfTwo_vartime*(n: uint32): uint32 {.inline.} =
## Returns x if x is a power of 2
## or the next biggest power of 2
1'u64 shl (log2_vartime(n-1) + 1)
1'u32 shl (log2_vartime(n-1) + 1)

View File

@ -15,54 +15,101 @@ when GCC_Compatible:
func builtin_clzll(n: uint64): cint {.importc: "__builtin_clzll", nodecl.}
## Count the number of leading zeros
## undefined if n is zero
func builtin_ctz(n: uint32): cint {.importc: "__builtin_ctz", nodecl.}
## Count the number of trailing zeros
## undefined if n is zero
func builtin_ctzll(n: uint64): cint {.importc: "__builtin_ctzll", nodecl.}
## Count the number of trailing zeros
## undefined if n is zero
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](31 - cast[cuint](builtin_clz(n.uint32)))
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
if n == 0:
0
else:
when sizeof(n) == 8:
cint(64) - builtin_clzll(n)
else:
cint(31) - builtin_clz(n.uint32)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zeros
## in the bit representation of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](63 - cast[cuint](builtin_clzll(n)))
if n == 0:
sizeof(n) * 8
else:
when sizeof(n) == 8:
builtin_ctzll(n)
else:
builtin_ctz(n.uint32)
elif defined(icc):
func bitScanReverse(r: var uint32, n: uint32): uint8 {.importc: "_BitScanReverse", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanReverse64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanReverse64", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
template bitscan(fnc: untyped; v: untyped): int {.inline.} =
template bitscan(fnc: untyped; v: untyped, default: static int): int {.inline.} =
var index: uint32
if fnc(index.addr, v) == 0:
return 0
return default
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, c.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
when sizeof(n) == 8:
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, c.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
when sizeof(n) == 8:
bitscan(bitScanForward64, n, default = 0)
else:
bitscan(bitScanForward, c.uint32, default = 0)
elif defined(vcc):
func bitScanReverse(p: ptr uint32, b: uint32): uint8 {.importc: "_BitScanReverse", header: "<intrin.h>".}
## Returns 0 if n s no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanReverse64(p: ptr uint32, b: uint64): uint8 {.importc: "_BitScanReverse64", header: "<intrin.h>".}
## Returns 0 if n s no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "<intrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "<intrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
template bitscan(fnc: untyped; v: untyped): int =
var index: uint32
@ -70,18 +117,25 @@ elif defined(vcc):
return 0
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, c.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
when sizeof(n) == 8:
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, c.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
when sizeof(n) == 8:
bitscan(bitScanForward64, n, default = sizeof(n) * 8)
else:
bitscan(bitScanForward, c.uint32, default = sizeof(n) * 8)
else:
{. error: "Unsupported compiler".}

View File

@ -0,0 +1,38 @@
# Constantine Threadpool
## API
The API spec follows https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api
## Overview
This implements a lightweight, energy-efficient, easily auditable multithreaded threadpool.
This threadpool's desirable properties are:
- Ease of auditing and maintenance.
- Resource-efficient. Threads spindown to save power, low memory use.
- Decent performance and scalability. The CPU should spend its time processing user workloads
  and not dealing with threadpool contention, latencies and overheads.
Compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs:
- Constantine's threadpool only provides spawn/sync (task parallelism).\
There is no (extremely) optimized parallel for (data parallelism)\
or precise in/out dependencies (events / dataflow parallelism).
- Constantine's threadpool has been significantly optimized to provide
overhead lower than Weave's default (and as low as Weave "lazy" + "alloca" allocation scheme)
- Constantine's threadpool provides the same adaptive scheduling strategy as Weave
  with an additional enhancement (leapfrogging)
Compared to [nim-taskpools](https://github.com/status-im), here are the tradeoffs:
- Constantine does not use std/tasks:
- No external dependencies at runtime (apart from compilers, OS and drivers)
- We can replace Task with an intrusive linked list
- Furthermore we can embed tasks in their future
- Hence allocation/scheduler overhead is 3x less than nim-taskpools as we fuse the following allocations:
- Task
- The linked list of tasks
- The future (Flowvar) result channel
- Contention improvement, Constantine is entirely lock-free while Nim-taskpools need a lock+condition variable for putting threads to sleep
- Powersaving improvement, threads sleep when waiting for a task and no work is available.
- Scheduling improvement, Constantine's threadpool incorporates Weave's adaptive scheduling policy with an additional enhancement (leapfrogging)

View File

@ -0,0 +1,11 @@
# BPC (Bouncing Producer-Consumer)
From [tasking-2.0](https://github.com/aprell/tasking-2.0) description
> **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far
> as I know, first described by [Dinan et al][1]. There are two types of
> tasks, producer and consumer tasks. Each producer task creates another
> producer task followed by *n* consumer tasks, until a certain depth *d* is
> reached. Consumer tasks run for *t* microseconds. The smaller the values of
> *n* and *t*, the harder it becomes to exploit the available parallelism. A
> solid contender for the most antagonistic microbenchmark.

View File

@ -0,0 +1,157 @@
import
# STD lib
system/ansi_c, std/[os, strutils, cpuinfo, strformat, math],
# Library
../../threadpool,
# bench
../wtime, ../resources
var
Depth: int32 # For example 10000
NumTasksPerDepth: int32 # For example 9
# The total number of tasks in the BPC benchmark is
# (NumTasksPerDepth + 1) * Depth
NumTasksTotal: int32
TaskGranularity: int32 # in microseconds
PollInterval: float64 # in microseconds
tp: Threadpool
var global_poll_elapsed {.threadvar.}: float64
template dummy_cpt(): untyped =
  # Dummy computation to burn CPU time without touching shared memory.
  # Calculate fib(30) iteratively; the result is intentionally unused.
  var
    fib = 0
    f2 = 0
    f1 = 1
  for i in 2 .. 30:
    fib = f1 + f2
    f2 = f1
    f1 = fib
proc bpc_consume(usec: int32) =
  ## Consumer task: spin on dummy work for `usec` microseconds.
  ## Time spent in (currently disabled) polling is excluded from the
  ## measured task duration via `pollElapsed`.
  var pollElapsed = 0'f64

  let start = wtime_usec()
  let stop = usec.float64
  global_poll_elapsed = PollInterval

  while true:
    var elapsed = wtime_usec() - start
    elapsed -= pollElapsed  # don't count polling overhead as task work
    if elapsed >= stop:
      break

    dummy_cpt()

    # Disabled: periodic cooperative load balancing (Weave-style).
    # if elapsed >= global_poll_elapsed:
    #   let pollStart = wtime_usec()
    #   loadBalance(Weave)
    #   pollElapsed += wtime_usec() - pollStart
    #   global_poll_elapsed += PollInterval
proc bpc_consume_nopoll(usec: int32) =
  ## Consumer task without any polling: busy-work for `usec` microseconds.
  let t0 = wtime_usec()
  let deadline = usec.float64
  while wtime_usec() - t0 < deadline:
    dummy_cpt()
proc bpc_produce(n, d: int32) {.gcsafe.} =
  ## Producer task: while depth `d` > 0, spawn one child producer at
  ## depth d-1 plus `n` consumer tasks. At depth 0 the recursion stops
  ## and no consumers are spawned.
  if d > 0:
    # Create producer task
    tp.spawn bpc_produce(n, d-1)
  else:
    return

  # Followed by n consumer tasks
  for i in 0 ..< n:
    tp.spawn bpc_consume(TaskGranularity)
proc main() =
  ## Parse CLI arguments, run the BPC benchmark on Constantine's
  ## threadpool, and report wall time plus memory statistics.
  Depth = 10000
  NumTasksPerDepth = 999
  TaskGranularity = 1

  if paramCount() == 0:
    # No arguments: print usage, then fall through with the defaults above.
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth: {Depth}> " &
      &"<# of tasks per depth: {NumTasksPerDepth}> " &
      &"[task granularity (us): {TaskGranularity}] " &
      &"[polling interval (us): task granularity]"
    echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"

  if paramCount() >= 1:
    Depth = paramStr(1).parseInt.int32
  if paramCount() >= 2:
    NumTasksPerDepth = paramStr(2). parseInt.int32
  if paramCount() >= 3:
    TaskGranularity = paramStr(3). parseInt.int32
  if paramCount() == 4:
    PollInterval = paramStr(4).parseInt.float64
  else:
    # Default: poll as often as the task granularity.
    PollInterval = TaskGranularity.float64
  if paramCount() > 4:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth: {Depth}> " &
      &"<# of tasks per depth: {NumTasksPerDepth}> " &
      &"[task granularity (us): {TaskGranularity}] " &
      &"[polling interval (us): task granularity]"
    quit 1

  NumTasksTotal = (NumTasksPerDepth + 1) * Depth

  var nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  tp = Threadpool.new(numThreads = nthreads)

  # measure overhead during tasking
  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  let start = wtime_msec()
  bpc_produce(NumTasksPerDepth, Depth)
  tp.syncAll()  # wait for every task spawned (transitively) by the producers
  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss  # RSS growth during the benchmark
  flt = ru.ru_minflt - flt  # minor page faults during the benchmark

  tp.shutdown()

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: BPC (Bouncing Producer-Consumer)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", round(stop - start, 3)
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"
  echo "# of tasks: ", NumTasksTotal
  echo "# of tasks/depth: ", NumTasksPerDepth
  echo "Depth: ", Depth
  echo "Task granularity (us): ", TaskGranularity
  echo "Polling / manual load balancing interval (us): ", PollInterval
  echo "--------------------------------------------------------------------------"
  quit 0
main()

View File

@ -0,0 +1,86 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime
var tp: Threadpool
proc dfs(depth, breadth: int): uint32 {.gcsafe.} =
  ## Depth-first task tree: each node spawns `breadth` children until
  ## `depth` reaches 0, then sums the leaves (result is breadth^depth).
  if depth == 0:
    return 1

  # We could use alloca to avoid heap allocation here
  var pending = newSeq[Flowvar[uint32]](breadth)
  for k in 0 ..< breadth:
    pending[k] = tp.spawn dfs(depth - 1, breadth)

  for k in 0 ..< breadth:
    result += sync(pending[k])
proc test(depth, breadth: int): uint32 =
  ## Entry point: spawn the root dfs task and wait for the whole tree.
  result = sync tp.spawn dfs(depth, breadth)
proc main() =
  ## Parse CLI arguments, run the dfs benchmark and report wall time.
  var
    depth = 8
    breadth = 8
    answer: uint32
    nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
    echo &"Running with default config depth = {depth} and breadth = {breadth}"
  if paramCount() >= 1:
    depth = paramStr(1).parseInt()
  if paramCount() == 2:
    breadth = paramStr(2).parseInt()
  if paramCount() > 2:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
    echo &"Up to 2 parameters are valid. Received {paramCount()}"
    quit 1

  # Staccato benches runtime init and exit as well
  when not defined(windows):
    let start = wtime_usec()

  # Bug fix: honor CTT_NUM_THREADS — `nthreads` was previously computed
  # and reported, but the pool was created with the default thread count.
  tp = Threadpool.new(numThreads = nthreads)
  answer = test(depth, breadth)
  tp.shutdown()

  when not defined(windows):
    let stop = wtime_usec()

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: dfs"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(us) ", stop - start
  echo "Output: ", answer
  echo "--------------------------------------------------------------------------"

  quit 0
main()

View File

@ -0,0 +1,77 @@
# Fibonacci benchmarks
⚠️ Disclaimer:
Please don't use parallel fibonacci in production!
Use the fast doubling method with memoization instead.
Fibonacci benchmark has 3 draws:
1. It's very simple to implement
2. It's unbalanced, and efficiency requires distributing work to avoid idle cores.
3. It's a very effective scheduler overhead benchmark, because the basic task is very trivial and the task spawning grows at 2^n scale.
Want to know the difference between low and high overhead?
Run the following C code (taken from [Oracle OpenMP example](https://docs.oracle.com/cd/E19205-01/820-7883/girtd/index.html))
```C
#include <stdio.h>
#include <omp.h>
int fib(int n)
{
int i, j;
if (n<2)
return n;
else
{
#pragma omp task shared(i) firstprivate(n)
{
i=fib(n-1);
}
j=fib(n-2);
#pragma omp taskwait
return i+j;
}
}
int main()
{
int n = 40;
#pragma omp parallel shared(n)
{
#pragma omp single
printf ("fib(%d) = %d\n", n, fib(n));
}
}
```
First compile with Clang and run it
```
clang -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
time a.out
```
It should be fairly quick
Then compile with GCC and run it
```
gcc -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
time a.out
```
Notice how some cores get idle as time goes on?
Don't forget to kill the benchmark, you'll be there all day.
What's happening?
GCC's OpenMP implementation uses a single queue for all tasks.
That queue gets constantly hammered by all threads and becomes a contention point.
Furthermore, it seems like there is no load balancing or that due to the contention/lock
threads are descheduled.
However Clang implementation uses a work-stealing scheduler with one queue per thread.
The only contention happens when a thread run out of work and has to look for more work,
in the queue of other threads. And which thread to check is chosen at random so
the potential contention is distributed among all threads instead of a single structure.

View File

@ -0,0 +1,35 @@
import
# STD lib
std/[os, strutils, threadpool, strformat],
# bench
../wtime
# Using Nim's standard threadpool
# Compile with "nim c --threads:on -d:release -d:danger --outdir:build benchmarks/fibonacci/stdnim_fib.nim"
#
# Note: it breaks at fib 16.
proc parfib(n: uint64): uint64 =
  ## Parallel Fibonacci on Nim's standard threadpool.
  ## Note: compare n < 2 and return n (fib(0)=0, fib(1)=1),
  ## not n <= 2 -> return 1.
  if n < 2:
    return n
  let upper = spawn parfib(n - 1)  # offload the bigger half
  let lower = parfib(n - 2)        # compute the smaller half inline
  result = ^upper + lower
proc main() =
  ## Parse the requested Fibonacci index, time the parallel computation
  ## and print the result with elapsed wall time.
  if paramCount() != 1:
    echo "Usage: fib <n-th fibonacci number requested>"
    quit 0

  let n = paramStr(1).parseUInt.uint64

  let start = wtime_msec()
  let f = parfib(n)
  let stop = wtime_msec()

  echo "Result: ", f
  echo &"Elapsed wall time: {stop-start:.2} ms"
main()

View File

@ -0,0 +1,79 @@
import
# STD lib
std/[os, strutils, cpuinfo, strformat, math],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
var tp: Threadpool
proc fib(n: int): int =
  ## Naive parallel Fibonacci: spawn fib(n-1) on the pool and
  ## compute fib(n-2) inline, then join.
  # int64 on x86-64
  if n < 2:
    return n
  let pending = tp.spawn fib(n-1)
  let inline = fib(n-2)
  result = sync(pending) + inline
proc main() =
  ## Parse CLI arguments, run parallel fib on Constantine's threadpool
  ## and report wall time plus memory statistics (POSIX only).
  var n = 40
  var nthreads: int

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n-th fibonacci number requested:{n}> "
    echo &"Running with default n = {n}"
  elif paramCount() == 1:
    n = paramStr(1).parseInt
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n-th fibonacci number requested:{n}>"
    quit 1

  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  # Bug fix: honor CTT_NUM_THREADS — the pool was previously created
  # with Threadpool.new() defaults while `nthreads` was only reported.
  tp = Threadpool.new(numThreads = nthreads)

  # measure overhead during tasking
  when not defined(windows):
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt
    let start = wtime_msec()

  let f = fib(n)

  when not defined(windows):
    let stop = wtime_msec()

  tp.shutdown()

  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss  # RSS growth during the benchmark
    flt = ru.ru_minflt - flt  # minor page faults during the benchmark

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: Fibonacci"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(ms) ", round(stop - start, 3)
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"
  echo "n requested: ", n
  echo "result: ", f
main()

View File

@ -0,0 +1,300 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# From fibril
#
# Original license
#
# /*
# * Heat diffusion (Jacobi-type iteration)
# *
# * Volker Strumpen, Boston August 1996
# *
# * Copyright (c) 1996 Massachusetts Institute of Technology
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
std/threadpool,
# bench
../wtime, ../resources
# This deadlocks :/
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var seq[seq[float64]]" to other threads
# nor "var" for that matter
type
Matrix[T] = object
buffer: ptr UncheckedArray[T]
m, n: int
Row[T] = object
buffer: ptr UncheckedArray[T]
len: int
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
result.m = m
result.n = n
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col] = value
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a non-owning view over row `rowIdx` of `mat`.
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # Bug fix: a row has `n` elements (the column count), not `m`.
  # With `mat.m` the bounds asserts on Row accesses were wrong for
  # non-square matrices (here nx = 4096, ny = 1024).
  result.len = mat.n
template `[]`[T](row: Row[T], idx: Natural): T =
assert idx < row.len
row.buffer[idx]
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
assert idx < row.len
row.buffer[idx] = value
func delete[T](mat: sink Matrix[T]) =
c_free(mat.buffer)
# And an auto converter for int32 -> float64 so we don't have to convert
# all i, j indices manually
converter i32toF64(x: int32): float64 {.inline.} =
float64(x)
# -------------------------------------------------------
template f(x, y: SomeFloat): SomeFloat =
sin(x) * sin(y)
template randa[T: SomeFloat](x, t: T): T =
T(0.0)
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(x)
template randc[T: SomeFloat](y, t: T): T =
T(0.0)
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(y)
template solu(x, y, t: SomeFloat): SomeFloat =
exp(-2 * t) * sin(x) * sin(y)
const n = 4096'i32
var
nx, ny, nt: int32
xu, xo, yu, yo, tu, to: float64
dx, dy, dt: float64
dtdxsq, dtdysq: float64
odd: Matrix[float64]
even: Matrix[float64]
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
# TODO to allow awaiting `heat` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let h = spawn heat(m, il, im)
heat(m, im, iu)
discard ^h
return true
# ------------------------
let i = il
let row = m.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, 0)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, 0)
else:
row[0] = randa(xu + i*dx, 0)
for j in 1 ..< ny - 1:
row[j] = f(xu + i*dx, yu + j*dy)
row[ny - 1] = randb(xu + i*dx, 0)
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
# TODO to allow awaiting `diffuse` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let d = spawn diffuse(output, input, il, im, t)
diffuse(output, input, im, iu, t)
discard ^d
return true
# ------------------------
let i = il
let row = output.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, t)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, t)
else:
row[0] = randa(xu + i*dx, t)
for j in 1 ..< ny - 1:
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
row[ny - 1] = randb(xu + i*dx, t)
proc initTest() =
nx = n
ny = 1024
nt = 100
xu = 0.0
xo = 1.570796326794896558
yu = 0.0
yo = 1.570796326794896558
tu = 0.0
to = 0.0000001
dx = (xo - xu) / float64(nx - 1)
dy = (yo - yu) / float64(ny - 1)
dt = (to - tu) / float64(nt)
dtdxsq = dt / (dx * dx)
dtdysq = dt / (dy * dy)
even = newMatrix[float64](nx, ny)
odd = newMatrix[float64](nx, ny)
proc prep() =
heat(even, 0, nx)
proc test() =
var t = tu
for _ in countup(1, nt.int, 2):
# nt included
t += dt
diffuse(odd, even, 0, nx, t)
t += dt
diffuse(even, odd, 0, nx, t)
if nt mod 2 != 0:
t += dt
diffuse(odd, even, 0, nx, t)
proc verify() =
var
mat: Matrix[float64]
mae: float64
mre: float64
me: float64
mat = if nt mod 2 != 0: odd else: even
for a in 0 ..< nx:
for b in 0 ..< ny:
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
if tmp > 1e-3:
echo "nx: ", nx, " - ny: ", ny
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
quit 1
me += tmp
if tmp > mae: mae = tmp
if mat[a, b] != 0.0: tmp /= mat[a, b]
if tmp > mre: mre = tmp
me /= nx * ny
if mae > 1e-12:
echo &"Local maximal absolute error {mae:1.3e}"
quit 1
if mre > 1e-12:
echo &"Local maximal relative error {mre:1.3e}"
quit 1
if me > 1e-12:
echo &"Global mean absolute error {me:1.3e}"
quit 1
echo "Verification successful"
proc main() =
var nthreads: int
nthreads = countProcessors()
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
initTest()
prep()
let start = wtime_usec()
test()
let stop = wtime_usec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
sync()
verify()
delete(even)
delete(odd)
echo "Scheduler: Nim threadpool (standard lib)"
echo "Benchmark: heat"
echo "Threads: ", nthreads
echo "Time(us) ", stop - start
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
quit 0
main()

View File

@ -0,0 +1,314 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# From fibril
#
# Original license
#
# /*
# * Heat diffusion (Jacobi-type iteration)
# *
# * Volker Strumpen, Boston August 1996
# *
# * Copyright (c) 1996 Massachusetts Institute of Technology
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var seq[seq[float64]]" to other threads
# nor "var" for that matter
type
Matrix[T] = object
buffer: ptr UncheckedArray[T]
m, n: int
Row[T] = object
buffer: ptr UncheckedArray[T]
len: int
var tp: Threadpool
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
result.m = m
result.n = n
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col] = value
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a non-owning view over row `rowIdx` of `mat`.
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # Bug fix: a row has `n` elements (the column count), not `m`.
  # With `mat.m` the bounds asserts on Row accesses were wrong for
  # non-square matrices (here nx = 4096, ny = 1024).
  result.len = mat.n
template `[]`[T](row: Row[T], idx: Natural): T =
assert idx < row.len
row.buffer[idx]
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
assert idx < row.len
row.buffer[idx] = value
func delete[T](mat: sink Matrix[T]) =
c_free(mat.buffer)
# And an auto converter for int32 -> float64 so we don't have to convert
# all i, j indices manually
converter i32toF64(x: int32): float64 {.inline.} =
float64(x)
# -------------------------------------------------------
template f(x, y: SomeFloat): SomeFloat =
sin(x) * sin(y)
template randa[T: SomeFloat](x, t: T): T =
T(0.0)
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(x)
template randc[T: SomeFloat](y, t: T): T =
T(0.0)
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(y)
template solu(x, y, t: SomeFloat): SomeFloat =
exp(-2 * t) * sin(x) * sin(y)
const n = 4096'i32
var
nx, ny, nt: int32
xu, xo, yu, yo, tu, to: float64
dx, dy, dt: float64
dtdxsq, dtdysq: float64
odd: Matrix[float64]
even: Matrix[float64]
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable, gcsafe.}=
# TODO to allow awaiting `heat` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let h = tp.spawn heat(m, il, im)
heat(m, im, iu)
discard sync(h)
return true
# ------------------------
let i = il
let row = m.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, 0)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, 0)
else:
row[0] = randa(xu + i*dx, 0)
for j in 1 ..< ny - 1:
row[j] = f(xu + i*dx, yu + j*dy)
row[ny - 1] = randb(xu + i*dx, 0)
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable, gcsafe.} =
# TODO to allow awaiting `diffuse` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let d = tp.spawn diffuse(output, input, il, im, t)
diffuse(output, input, im, iu, t)
discard sync(d)
return true
# ------------------------
let i = il
let row = output.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, t)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, t)
else:
row[0] = randa(xu + i*dx, t)
for j in 1 ..< ny - 1:
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
row[ny - 1] = randb(xu + i*dx, t)
proc initTest() =
nx = n
ny = 1024
nt = 100
xu = 0.0
xo = 1.570796326794896558
yu = 0.0
yo = 1.570796326794896558
tu = 0.0
to = 0.0000001
dx = (xo - xu) / float64(nx - 1)
dy = (yo - yu) / float64(ny - 1)
dt = (to - tu) / float64(nt)
dtdxsq = dt / (dx * dx)
dtdysq = dt / (dy * dy)
even = newMatrix[float64](nx, ny)
odd = newMatrix[float64](nx, ny)
proc prep() =
heat(even, 0, nx)
proc test() =
var t = tu
for _ in countup(1, nt.int, 2):
# nt included
t += dt
diffuse(odd, even, 0, nx, t)
t += dt
diffuse(even, odd, 0, nx, t)
if nt mod 2 != 0:
t += dt
diffuse(odd, even, 0, nx, t)
proc verify() =
var
mat: Matrix[float64]
mae: float64
mre: float64
me: float64
mat = if nt mod 2 != 0: odd else: even
for a in 0 ..< nx:
for b in 0 ..< ny:
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
if tmp > 1e-3:
echo "nx: ", nx, " - ny: ", ny
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
quit 1
me += tmp
if tmp > mae: mae = tmp
if mat[a, b] != 0.0: tmp /= mat[a, b]
if tmp > mre: mre = tmp
me /= nx * ny
if mae > 1e-12:
echo &"Local maximal absolute error {mae:1.3e}"
quit 1
if mre > 1e-12:
echo &"Local maximal relative error {mre:1.3e}"
quit 1
if me > 1e-12:
echo &"Global mean absolute error {me:1.3e}"
quit 1
echo "Heat: Verification successful"
proc main() =
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
initTest()
# Fibril initializes before benching
tp = Threadpool.new(numThreads = nthreads)
prep()
when not defined(windows):
let start = wtime_usec()
test()
when not defined(windows):
let stop = wtime_usec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
tp.shutdown()
verify()
delete(even)
delete(odd)
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: heat"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(us) ", stop - start
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
quit 0
main()

View File

@ -0,0 +1,12 @@
# Cache-Oblivious Matrix Multiplication
From Staccato and Cilk
https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk
See the paper ``Cache-Oblivious Algorithms'', by
Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
Sridhar Ramachandran, FOCS 1999, for an explanation of
why this algorithm is good for caches.
Note that the benchmarks output incorrect matrix traces
according to the check ...

View File

@ -0,0 +1,214 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Rectangular matrix multiplication.
#
# Adapted from Cilk 5.4.3 example
#
# https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk;
# See the paper ``Cache-Oblivious Algorithms'', by
# Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
# Sridhar Ramachandran, FOCS 1999, for an explanation of
# why this algorithm is good for caches.
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Library
../../threadpool,
# bench
../wtime, ../resources
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var" to other threads
type
Matrix[T: SomeFloat] = object
buffer: ptr UncheckedArray[T]
ld: int
var tp: Threadpool
func newMatrixNxN[T](n: int): Matrix[T] {.inline.} =
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t n*n*sizeof(T)))
result.ld = n
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  ## Row-major element read with bounds assertions.
  # Bug fix: the assertion messages referenced an undefined `i`;
  # use the actual `row` / `col` indices instead.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  ## Row-major element write with bounds assertions.
  # Bug fix: the assertion messages referenced an undefined `i`;
  # use the actual `row` / `col` indices instead.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col] = value
func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}=
  ## Returns a new view offset by the row and column stride
  result.buffer = cast[ptr UncheckedArray[T]](
    addr mat.buffer[row*mat.ld + col]
  )
  # Bug fix: propagate the leading dimension. It was left at its zero
  # default, so indexing through a strided view computed wrong offsets
  # (row * 0 + col) and tripped the `< mat.ld` assertions.
  result.ld = mat.ld
func delete[T](mat: sink Matrix[T]) =
c_free(mat.buffer)
# -------------------------------------------------------
proc xorshiftRand(): uint32 =
  ## Xorshift32 PRNG; the state persists across calls via the
  ## {.global.} variable, so the sequence is deterministic per process.
  var x {.global.} = uint32(2463534242)
  x = x xor (x shr 13)
  x = x xor (x shl 17)
  x = x xor (x shr 5)
  return x
func zero[T](A: Matrix[T]) =
  ## Zero the whole n*n buffer.
  # zeroing is not timed
  zeroMem(A.buffer, A.ld * A.ld * sizeof(T))

proc fill[T](A: Matrix[T]) =
  ## Fill the matrix with pseudo-random values in [0, ld).
  for i in 0 ..< A.ld:
    for j in 0 ..< A.ld:
      A[i, j] = T(xorshiftRand() mod A.ld.uint32)
func maxError(A, B: Matrix): float64 =
  ## Largest element-wise relative error between A and B
  ## (relative to A's entries).
  assert A.ld == B.ld
  for r in 0 ..< A.ld:
    for c in 0 ..< A.ld:
      let relErr = abs((A[r, c] - B[r, c]) / A[r, c])
      if relErr > result:
        result = relErr
func check[T](A, B, C: Matrix[T], n: int): bool =
  ## Sanity check: trace(A*B) should match trace(C) within tolerance.
  var
    traceC = 0.T
    traceAB = 0.T
  for r in 0 ..< n:
    for c in 0 ..< n:
      traceAB += A[r, c] * B[c, r]
    traceC += C[r, r]
  # Note, all benchmarks return false ¯\_(ツ)_/¯
  return abs(traceAB - traceC) < 1e-3
proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool =
  ## Cache-oblivious recursive matrix multiplication: C (+)= A*B where
  ## A is m*n, B is n*p, C is m*p. The largest dimension is halved and
  ## both halves run in parallel, down to a base-case threshold.
  # The original bench passes around a ``ld`` parameter (leading dimension?),
  # we store it in the matrices
  # We return a dummy bool to allow waiting on the matmul

  # Threshold
  if (m + n + p) <= 64:
    if add:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] += c
    else:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] = c
    return

  var h0, h1: FlowVar[bool]
  ## Each half of the computation

  # matrix is larger than threshold
  if m >= n and n >= p:
    let m1 = m shr 1 # divide by 2
    h0 = tp.spawn matmul(A, B, C, m1, n, p, add)
    h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add)
  elif n >= m and n >= p:
    let n1 = n shr 1 # divide by 2
    h0 = tp.spawn matmul(A, B, C, m, n1, p, add)
    # Splitting the shared dimension: the second half must accumulate
    # into the same C region, hence add = true.
    h1 = tp.spawn matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true)
  else:
    let p1 = p shr 1
    h0 = tp.spawn matmul(A, B, C, m, n, p1, add)
    h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add)

  discard sync(h0)
  discard sync(h1)
proc main() =
  ## Parse CLI arguments, run the cache-oblivious matmul benchmark and
  ## report timing, memory statistics and the trace-based sanity check.
  echo "Warning the benchmark seems to not be correct."
  var
    n = 3000
    nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Running with default config n = {n}"
  elif paramCount() == 1:
    n = paramStr(1).parseInt()
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Up to 1 parameter is valid. Received {paramCount()}"
    quit 1

  var A = newMatrixNxN[float32](n)
  var B = newMatrixNxN[float32](n)
  var C = newMatrixNxN[float32](n)

  fill(A)
  fill(B)
  zero(C)  # C must start zeroed: the root call uses add = false

  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  # Staccato benches runtime init and exit as well
  let start = wtime_msec()
  tp = Threadpool.new(numThreads = nthreads)
  discard sync tp.spawn matmul(A, B, C, n, n, n, add = false)
  tp.shutdown()
  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss  # RSS growth during the benchmark
  flt = ru.ru_minflt - flt  # minor page faults during the benchmark

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: Matrix Multiplication (cache oblivious)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", stop - start
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "Input: ", n
  echo "Error: ", check(A, B, C, n)
  echo "--------------------------------------------------------------------------"

  delete A
  delete B
  delete C
  quit 0

View File

@ -0,0 +1,181 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
#
# Original code licenses
# ------------------------------------------------------------------------------------------------
#
# /**********************************************************************************************/
# /* This program is part of the Barcelona OpenMP Tasks Suite */
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
# /* */
# /* This program is free software; you can redistribute it and/or modify */
# /* it under the terms of the GNU General Public License as published by */
# /* the Free Software Foundation; either version 2 of the License, or */
# /* (at your option) any later version. */
# /* */
# /* This program is distributed in the hope that it will be useful, */
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
# /* GNU General Public License for more details. */
# /* */
# /* You should have received a copy of the GNU General Public License */
# /* along with this program; if not, write to the Free Software */
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
# /**********************************************************************************************/
#
# /*
# * Original code from the Cilk project (by Keith Randall)
# *
# * Copyright (c) 2000 Massachusetts Institute of Technology
# * Copyright (c) 2000 Matteo Frigo
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils],
std/threadpool,
# bench
../wtime
# This deadlocks :/
# Nim helpers
# -------------------------------------------------
# Stack / heap allocation helpers.
# Fixed: `csize` is a deprecated alias; use `int` like the threadpool
# variant of this benchmark does (see the sibling nqueens file).
when defined(windows):
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: int): pointer {.header: "<alloca.h>".}

template alloca*(T: typedesc): ptr T =
  ## Stack-allocate storage for one `T` in the caller's frame.
  cast[ptr T](alloca(sizeof(T)))

template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocate storage for `len` elements of `T` in the caller's frame.
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))

proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocate `len` elements of `T` (uninitialized, C malloc).
  cast[type result](c_malloc(csize_t len*sizeof(T)))

proc tp_free*[T: ptr](p: T) {.inline.} =
  ## Release memory obtained from `tp_alloc`.
  c_free(p)
# We assume that Nim zeroMem vs C memset
# and Nim copyMem vs C memcpy have no difference
# Nim does have extra checks to handle GC-ed types
# but they should be eliminated by the Nim compiler.
# -------------------------------------------------
type CharArray = ptr UncheckedArray[char]

# First complete solution found by the serial solver (nil until then).
var example_solution: ptr UncheckedArray[char]

func isValid(n: int32, a: CharArray): bool =
  ## `a` holds `n` queen row-positions, one per column.
  ## Returns true when no pair of queens shares a row or a diagonal.
  result = true
  for col in 0'i32 ..< n:
    let row = cast[int32](a[col])
    for other in col+1 ..< n:
      let otherRow = cast[int32](a[other])
      let dist = other - col
      if otherRow == row or otherRow == row - dist or otherRow == row + dist:
        return false
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
# Serial nqueens
# Counts all complete placements reachable from the partial board a[0..<j].
if n == j:
# Good solution count it
# Keep the first full solution for display; never freed in the visible code.
if example_solution.isNil:
example_solution = tp_alloc(char, n)
copyMem(example_solution, a, n * sizeof(char))
return 1
# Try each possible position for queen `j`
for i in 0 ..< n:
a[j] = cast[char](i)
if isValid(j+1, a):
result += nqueens_ser(n, j+1, a)
proc nqueens_par(n, j: int32, a: CharArray): int32 =
if n == j:
# Good solution, count it
return 1
# One pending Flowvar per candidate column; zeroMem marks unused slots as nil.
var localCounts = alloca(Flowvar[int32], n)
zeroMem(localCounts, n * sizeof(Flowvar[int32]))
# Try each position for queen `j`
for i in 0 ..< n:
# Copy the partial board onto this frame's stack so each child owns its input.
# NOTE(review): `b` is stack memory read by a spawned task — safe only because
# this frame blocks on `^` below before returning; confirm.
var b = alloca(char, j+1)
copyMem(b, a, j * sizeof(char))
b[j] = cast[char](i)
if isValid(j+1, b):
localCounts[i] = spawn nqueens_par(n, j+1, b)
# Reduction: block on each spawned child (std/threadpool `^`) and sum counts.
for i in 0 ..< n:
if not localCounts[i].isNil():
result += ^localCounts[i]
# Known number of solutions for the N-queens problem on an NxN board, N in 1..16.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  10,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Checks `res` against the known solution count for an n×n board.
  ## Prints a diagnostic if the result is unverifiable or wrong.
  # Fixed: only the upper bound was checked; n <= 0 indexed solutions[n-1]
  # out of bounds (IndexDefect).
  if n notin 1'i32 .. solutions.len.int32:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return
  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
# CLI driver: runs the parallel solver, verifies the count, prints timing.
proc main() =
if paramCount() != 1:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <n: number of queens on a nxn board>"
quit 0
let n = paramStr(1).parseInt.int32
if n notin 1 .. solutions.len:
echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
quit 1
# The root board lives on this stack frame; children copy it (see nqueens_par).
let start = wtime_msec()
let count = nqueens_par(n, 0, alloca(char, n))
let stop = wtime_msec()
verifyQueens(n, count)
if not example_solution.isNil:
stdout.write("Example solution: ")
for i in 0 ..< n:
c_printf("%2d ", example_solution[i])
stdout.write('\n')
echo &"Elapsed wall time: {stop-start:2.4f} ms"
main()

View File

@ -0,0 +1,231 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
#
# Original code licenses
# ------------------------------------------------------------------------------------------------
#
# /**********************************************************************************************/
# /* This program is part of the Barcelona OpenMP Tasks Suite */
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
# /* */
# /* This program is free software; you can redistribute it and/or modify */
# /* it under the terms of the GNU General Public License as published by */
# /* the Free Software Foundation; either version 2 of the License, or */
# /* (at your option) any later version. */
# /* */
# /* This program is distributed in the hope that it will be useful, */
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
# /* GNU General Public License for more details. */
# /* */
# /* You should have received a copy of the GNU General Public License */
# /* along with this program; if not, write to the Free Software */
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
# /**********************************************************************************************/
#
# /*
# * Original code from the Cilk project (by Keith Randall)
# *
# * Copyright (c) 2000 Massachusetts Institute of Technology
# * Copyright (c) 2000 Matteo Frigo
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
# library
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Nim helpers
# -------------------------------------------------
# Stack / heap allocation helpers.
when defined(windows):
proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
proc alloca(size: int): pointer {.header: "<alloca.h>".}
# Stack-allocate storage for one `T` in the caller's frame.
template alloca*(T: typedesc): ptr T =
cast[ptr T](alloca(sizeof(T)))
# Stack-allocate storage for `len` elements of `T` in the caller's frame.
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
# Heap-allocate `len` uninitialized elements of `T`.
# -d:TP_useNimAlloc switches between Nim's shared allocator and C malloc.
proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
when defined(TP_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
# Release memory obtained from `tp_alloc` (must match the allocator chosen above).
proc tp_free*[T: ptr](p: T) {.inline.} =
when defined(TP_useNimAlloc):
freeShared(p)
else:
c_free(p)
# We assume that Nim zeroMem vs C memset
# and Nim copyMem vs C memcpy have no difference
# Nim does have extra checks to handle GC-ed types
# but they should be eliminated by the Nim compiler.
# -------------------------------------------------
# Flat array of queen positions, one char (row index) per column.
type CharArray = ptr UncheckedArray[char]
# Global threadpool handle, created in main().
var tp: Threadpool
# First complete solution found by nqueens_ser (nil until then).
var example_solution: ptr UncheckedArray[char]
func isValid(n: int32, a: CharArray): bool =
  ## `a` holds `n` queen positions (one row index per column).
  ## Returns true when no two queens attack each other.
  result = true
  for col in 0'i32 ..< n:
    let row = cast[int32](a[col])
    for rhs in col+1 ..< n:
      let rhsRow = cast[int32](a[rhs])
      let gap = rhs - col
      # Same row, or same anti-/main-diagonal.
      if rhsRow == row or rhsRow == row - gap or rhsRow == row + gap:
        return false
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
# Serial nqueens
# Counts all complete placements reachable from the partial board a[0..<j].
if n == j:
# Good solution count it
# Keep the first full solution for display; never freed in the visible code.
if example_solution.isNil:
example_solution = tp_alloc(char, n)
copyMem(example_solution, a, n * sizeof(char))
return 1
# Try each possible position for queen `j`
for i in 0 ..< n:
a[j] = cast[char](i)
if isValid(j+1, a):
result += nqueens_ser(n, j+1, a)
# Parallel variant: one spawned task per valid candidate column.
proc nqueens_par(n, j: int32, a: CharArray): int32 {.gcsafe.} =
if n == j:
# Good solution, count it
return 1
# One pending Flowvar per candidate column; zeroMem marks unused slots unspawned.
var localCounts = alloca(Flowvar[int32], n)
zeroMem(localCounts, n * sizeof(Flowvar[int32]))
# Try each position for queen `j`
for i in 0 ..< n:
# Copy the partial board onto this frame's stack so each child owns its input.
# NOTE(review): stack memory is read by spawned tasks — safe only because
# this frame blocks on sync() below before returning; confirm.
var b = alloca(char, j+1)
copyMem(b, a, j * sizeof(char))
b[j] = cast[char](i)
if isValid(j+1, b):
localCounts[i] = tp.spawn nqueens_par(n, j+1, b)
# Reduction: sync each spawned child and sum its count.
for i in 0 ..< n:
if localCounts[i].isSpawned():
result += sync(localCounts[i])
# Known number of solutions for the N-queens problem on an NxN board, N in 1..16.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  10,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Checks `res` against the known solution count for an n×n board.
  ## Prints a diagnostic if the result is unverifiable or wrong.
  # Fixed: only the upper bound was checked; n <= 0 indexed solutions[n-1]
  # out of bounds (IndexDefect).
  if n notin 1'i32 .. solutions.len.int32:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return
  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
proc main() =
  ## Benchmark driver: parses N from argv and thread count from
  ## CTT_NUM_THREADS (or core count), solves N-queens on the threadpool,
  ## verifies the result and reports timing + resource usage.
  var
    n = 11'i32    # board size / number of queens (default 11)
    nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()
  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <N:{n}>"
    echo &"Running with default config N = {n}\n"
  if paramCount() >= 1:
    n = paramStr(1).parseInt.int32
  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1
  when not defined(windows):
    # getrusage / wtime are POSIX-only in this bench.
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt
  tp = Threadpool.new(numThreads = nthreads)
  when not defined(windows):
    let start = wtime_msec()
  let count = nqueens_par(n, 0, alloca(char, n))
  when not defined(windows):
    let stop = wtime_msec()
  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt
  tp.shutdown()
  verifyQueens(n, count)
  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')
  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: N-queens"
  echo "Threads: ", nthreads
  when not defined(windows):
    # Fixed label: wtime_msec() returns milliseconds, label previously said "us".
    echo "Time(ms) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "Problem size: ", n,"x",n, " board with ",n, " queens"
  echo "Solutions found: ", count
  echo "--------------------------------------------------------------------------"
  quit 0
main()

View File

@ -0,0 +1,24 @@
type
  # Opaque POSIX `struct timeval`; fields are never accessed from Nim,
  # it only exists so that `Rusage` has the right layout.
  Timeval {.importc: "timeval", header: "<sys/time.h>", bycopy.} = object
  Rusage* {.importc: "struct rusage", header: "<sys/resource.h>", bycopy.} = object
    ## Subset of the POSIX resource-usage structure.
    ## NOTE(review): on 64-bit Linux these fields are C `long`; int32 can
    ## truncate very large values — consider `clong`. Kept as-is to preserve
    ## the public field types.
    ru_utime {.importc.}: Timeval
    ru_stime {.importc.}: Timeval
    ru_maxrss* {.importc.}: int32 # Maximum resident set size
    # ... (intervening struct rusage fields omitted; importc resolves offsets via the C header)
    ru_minflt* {.importc.}: int32 # page reclaims (soft page faults)

  RusageWho* {.size: sizeof(cint).} = enum
    RusageChildren = -1
    RusageSelf = 0
    RusageThread = 1

when defined(debug):
  # Sanity-check that the enum values match the C constants.
  # Fixed: header strings were missing the closing '>' and the asserts used
  # assignment `=` instead of comparison `==` (this block did not compile).
  var H_RUSAGE_SELF{.importc, header:"<sys/resource.h>".}: cint
  var H_RUSAGE_CHILDREN{.importc, header:"<sys/resource.h>".}: cint
  var H_RUSAGE_THREAD{.importc, header:"<sys/resource.h>".}: cint
  assert H_RUSAGE_SELF == ord(RusageSelf)
  assert H_RUSAGE_CHILDREN == ord(RusageChildren)
  assert H_RUSAGE_THREAD == ord(RusageThread)

proc getrusage*(who: RusageWho, usage: var Rusage) {.importc, header: "<sys/resource.h>".}
  ## Fills `usage` with resource statistics for `who` (POSIX getrusage(2)).

View File

@ -0,0 +1,7 @@
# Simple single-producer multiple-consumers benchmarks

## SPC — A Simple Producer-Consumer benchmark

A single worker produces n tasks, each running for t microseconds.
This benchmark allows us to test how many concurrent consumers a single
producer can sustain.

View File

@ -0,0 +1,146 @@
import
# STD lib
system/ansi_c, std/[os, strutils, cpuinfo, strformat, math],
# Library
../../threadpool,
# bench
../wtime, ../resources
# Benchmark configuration, set once in main() before any task is spawned.
var NumTasksTotal: int32
var TaskGranularity: int32 # microsecond
var PollInterval: float64 # microsecond
var tp: Threadpool
# Per-thread next load-balancing deadline (only used by the commented-out polling).
var global_poll_elapsed {.threadvar.}: float64
# Busy-work unit: iterative fib(30); the result is deliberately discarded.
template dummy_cpt(): untyped =
# Dummy computation
# Calculate fib(30) iteratively
var
fib = 0
f2 = 0
f1 = 1
for i in 2 .. 30:
fib = f1 + f2
f2 = f1
f1 = fib
# Task body: burns CPU for `usec` microseconds (wall-clock measured).
proc spc_consume(usec: int32) =
var pollElapsed = 0'f64
let start = wtime_usec()
let stop = usec.float64
global_poll_elapsed = PollInterval
while true:
# Time spent polling (if re-enabled below) is excluded from the budget.
var elapsed = wtime_usec() - start
elapsed = elapsed - pollElapsed
if elapsed >= stop:
break
dummy_cpt()
# if elapsed >= global_poll_elapsed:
# let pollStart = wtime_usec()
# loadBalance(Weave)
# pollElapsed += wtime_usec() - pollStart
# global_poll_elapsed += PollInterval
# c_printf("Elapsed: %.2lfus\n", elapsed)
# Same busy-wait without any polling bookkeeping; used by the sequential baseline.
proc spc_consume_nopoll(usec: int32) =
let start = wtime_usec()
let stop = usec.float64
while true:
var elapsed = wtime_usec() - start
if elapsed >= stop:
break
dummy_cpt()
# c_printf("Elapsed: %.2lfus\n", elapsed)
# Single producer: spawns `n` consumer tasks on the global threadpool.
proc spc_produce(n: int32) =
for i in 0 ..< n:
tp.spawn spc_consume(TaskGranularity)
# Sequential baseline: runs the same workload inline.
proc spc_produce_seq(n: int32) =
for i in 0 ..< n:
spc_consume_nopoll(TaskGranularity)
# Benchmark driver: parses CLI config, produces NumTasksTotal tasks and
# waits for completion, reporting timing and resource usage.
proc main() =
NumTasksTotal = 1000000
TaskGranularity = 10
PollInterval = 10
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
&"<task granularity (us): {TaskGranularity}> " &
&"[polling interval (us): task granularity]"
echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
if paramCount() >= 1:
NumTasksTotal = paramStr(1).parseInt.int32
if paramCount() >= 2:
TaskGranularity = paramStr(2). parseInt.int32
# Polling interval defaults to the task granularity when not given explicitly.
if paramCount() == 3:
PollInterval = paramStr(3).parseInt.float64
else:
PollInterval = TaskGranularity.float64
if paramCount() > 3:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
&"<task granularity (us): {TaskGranularity}> " &
&"[polling interval (us): task granularity]"
quit 1
# CTT_NUM_THREADS env var overrides the detected core count.
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
tp = Threadpool.new(numThreads = nthreads)
# measure overhead during tasking
# Snapshot RSS / soft page-fault counters so the deltas isolate the run itself.
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
let start = wtime_msec()
# spc_produce_seq(NumTasksTotal)
spc_produce(NumTasksTotal)
# Block until every spawned task has completed.
tp.syncAll()
let stop = wtime_msec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
tp.shutdown()
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: SPC (Single task Producer - multi Consumer)"
echo "Threads: ", nthreads
echo "Time(ms) ", round(stop - start, 3)
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
echo "# of tasks: ", NumTasksTotal
echo "Task granularity (us): ", TaskGranularity
echo "Polling / manual load balancing interval (us): ", PollInterval
echo "--------------------------------------------------------------------------"
quit 0
main()

View File

@ -0,0 +1,53 @@
/* Wall-clock timing helpers for the benchmarks (gettimeofday-based). */
#ifndef WTIME_H
#define WTIME_H
#include <sys/time.h>
#include <time.h>
// Number of seconds since the Epoch
static inline double Wtime_sec(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec / 1e6;
}
// Number of milliseconds since the Epoch
static inline double Wtime_msec(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1e3 + tv.tv_usec / 1e3;
}
// Number of microseconds since the Epoch
static inline double Wtime_usec(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1e6 + tv.tv_usec;
}
// Read time stamp counter on x86
// NOTE(review): x86-only inline asm; will not build on other architectures.
static inline unsigned long long readtsc(void)
{
unsigned int lo, hi;
// RDTSC copies contents of 64-bit TSC into EDX:EAX
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return (unsigned long long)hi << 32 | lo;
}
/* __LINE__ token-pasting so several WTIME() blocks can coexist in one scope. */
#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)
// Convenience macro for time measurement
/* Runs the following statement/block once and prints its elapsed wall time. */
#define WTIME(unit) \
double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
int WTIME_unique_var(_i_) = 0; \
for (; WTIME_unique_var(_i_) == 0 || \
(printf("Elapsed wall time: %.2lf "#unit"\n", \
Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
WTIME_unique_var(_i_)++)
#endif // WTIME_H

View File

@ -0,0 +1,10 @@
import std/[os, strutils]

# Directory holding this source file; the C timing helpers live next to it.
# (Consistent `cSourcesPath` spelling — the previous `csourcesPath` relied on
# Nim's partial case-insensitivity.)
const
  cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0]
  cHeader = cSourcesPath / "wtime.h"

{.passC: "-I" & cSourcesPath.}

proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.}
  ## Microseconds since the Unix epoch (wall clock).
proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.}
  ## Milliseconds since the Unix epoch (wall clock).

View File

@ -0,0 +1,186 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/atomics,
../primitives/futexes
# We implement 2 datastructures to put threads to sleep:
# 1. An event notifier to put an awaiting thread to sleep when the task they require is worked on by another thread
# 2. An eventcount to put an idle thread to sleep
{.push raises:[], checks:off.}
# ############################################################
#
# Event Notifier
#
# ############################################################
# Formal verification at: https://github.com/mratsim/weave/blob/7682784/formal_verification/event_notifiers.tla#L76-L109
type
EventNotifier* = object
## Multi Producers, Single Consumer event notification
## This can be seen as a wait-free condition variable for producers
## that avoids them spending time in expensive kernel land due to mutexes.
# ---- Consumer specific ----
ticket{.align: 64.}: uint8 # A ticket for the consumer to sleep in a phase
# ---- Contention ---- no real need for padding as cache line should be reloaded in case of contention anyway
futex: Futex # A Futex (atomic int32 that can put thread to sleep)
phase: Atomic[uint8] # A binary timestamp, toggles between 0 and 1 (but there is no atomic "not")
signaled: Atomic[bool] # Signaling condition
# Resets all state; must be called before first use.
func initialize*(en: var EventNotifier) {.inline.} =
en.futex.initialize()
en.ticket = 0
en.phase.store(0, moRelaxed)
en.signaled.store(false, moRelaxed)
func `=destroy`*(en: var EventNotifier) {.inline.} =
en.futex.teardown()
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
func prepareToPark*(en: var EventNotifier) {.inline.} =
## The consumer intends to sleep soon.
## This must be called before the formal notification
## via a channel.
# Capture the current phase; a producer toggling it between here and park()
# means a notification happened and park() must not block.
if not en.signaled.load(moRelaxed):
en.ticket = en.phase.load(moRelaxed)
proc park*(en: var EventNotifier) {.inline.} =
## Wait until we are signaled of an event
## Thread is parked and does not consume CPU resources
## This may wakeup spuriously.
if not en.signaled.load(moRelaxed):
if en.ticket == en.phase.load(moRelaxed):
en.futex.wait(0)
# Consume the notification and rearm the futex for the next cycle.
en.signaled.store(false, moRelaxed)
en.futex.initialize()
proc notify*(en: var EventNotifier) {.inline.} =
## Signal a thread that it can be unparked
if en.signaled.load(moRelaxed):
# Another producer is signaling
return
en.signaled.store(true, moRelease)
discard en.phase.fetchXor(1, moRelaxed)
en.futex.store(1, moRelease)
en.futex.wake()
# ############################################################
#
# Eventcount
#
# ############################################################
type
Eventcount* = object
## The lock-free equivalent of a condition variable.
##
## Usage, if a thread needs to be parked until a condition is true
## and signaled by another thread:
## ```Nim
## if condition:
## return
##
## while true:
## ticket = ec.sleepy()
## if condition:
## ec.cancelSleep()
## break
## else:
## ec.sleep()
## ```
state: Atomic[uint32]
# State is actually the equivalent of a bitfield
# type State = object
# waiters {.bitsize: 16.}: uint16
# when sizeof(pointer) == 4:
# epoch {.bitsize: 16.}: uint16
# else:
# epoch {.bitsize: 48.}: uint48
#
# of size, the native integer size
# and so can be used for atomic operations on 32-bit or 64-bit platforms.
# but there is no native fetchAdd for bitfield
futex: Futex
# Technically we could use the futex as the state.
# When you wait on a Futex, it waits only if the value of the futex
# matches with a reference value.
# But our reference value will be the epoch of notifications
# and it is non-trivial to zero-out the waiters bits.
# - One way could be to split a 64-bit number in 2
# and cast the epoch part to Futex but that would only work on 64-bit CPU.
# - Another more hacky way would be to pad with a zero-out uint16 before and after the Futex
# and depending on big or little endian provide a shifted address as Futex.
ParkingTicket* = object
epoch: uint32
const # bitfield
# Low 16 bits are waiters, up to 2¹⁶ = 65536 threads are supported
# High 16 or 48 bits are epochs.
# We can deal with the ABA problem o:
# - up to 65536 wake requests on 32-bit
# - up to 281 474 976 710 656 wake requests on 64-bit
# Epoch rolling over to 0 are not a problem, they won't change the low 16 bits
kEpochShift = 16
kAddEpoch = 1 shl kEpochShift
kWaiterMask = kAddEpoch - 1
kEpochMask {.used.} = not kWaiterMask
kAddWaiter = 1
kSubWaiter = 1
func initialize*(ec: var EventCount) {.inline.} =
ec.state.store(0, moRelaxed)
ec.futex.initialize()
func `=destroy`*(ec: var EventCount) {.inline.} =
ec.futex.teardown()
proc sleepy*(ec: var Eventcount): ParkingTicket {.inline.} =
## To be called before checking if the condition to not sleep is met.
## Returns a ticket to be used when committing to sleep
# Registers as a waiter (low bits) and snapshots the epoch (high bits).
let prevState = ec.state.fetchAdd(kAddWaiter, moAcquireRelease)
result.epoch = prevState shr kEpochShift
proc sleep*(ec: var Eventcount, ticket: ParkingTicket) {.inline.} =
## Put a thread to sleep until notified.
## If the ticket becomes invalid (a notfication has been received)
## by the time sleep is called, the thread won't enter sleep
# NOTE(review): the futex value is initialized to 0 and never set to the
# epoch here; wait(ticket.epoch) presumably compares against the futex's
# internal value — confirm the Futex.wait contract.
while ec.state.load(moAcquire) shr kEpochShift == ticket.epoch:
ec.futex.wait(ticket.epoch) # We don't use the futex internal value
# Deregister as a waiter.
let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed)
proc cancelSleep*(ec: var Eventcount) {.inline.} =
## Cancel a sleep that was scheduled.
let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed)
proc wake*(ec: var EventCount) {.inline.} =
## Wake a thread if at least 1 is parked
# Bump the epoch first so late sleepers see their ticket invalidated.
let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease)
if (prev and kWaiterMask) != 0:
ec.futex.wake()
proc wakeAll*(ec: var EventCount) {.inline.} =
## Wake all threads if at least 1 is parked
let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease)
if (prev and kWaiterMask) != 0:
ec.futex.wakeAll()
proc getNumWaiters*(ec: var EventCount): uint32 {.inline.} =
## Get the number of parked threads
ec.state.load(moRelaxed) and kWaiterMask
{.pop.} # {.push raises:[], checks:off.}

View File

@ -0,0 +1,286 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# This file implements a single-producer multi-consumer
# task queue for work-stealing schedulers.
#
# Papers:
# - Dynamic Circular Work-Stealing Deque
# David Chase, Yossi Lev, 1993
# https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf
#
# - Non-Blocking Steal-Half Work Queues
# Danny Hendler, Nir Shavit, 2002
# https://www.cs.bgu.ac.il/~hendlerd/papers/p280-hendler.pdf
#
# - Correct and Efficient Work-Stealing for Weak Memory Models
# Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013
# https://fzn.fr/readings/ppopp13.pdf
#
# The task queue implements the following push, pop, steal, stealHalf
#
# front back
# ---------------------------------
# steal() <- | | | | <- push()
# stealHalf() <- | Task 0 | Task 1 | Task 2 | -> pop()
# any thread | | | | owner-only
# ---------------------------------
#
# To reduce contention, stealing is done on the opposite end from push/pop
# so that there is a race only for the very last task(s).
{.push raises: [], checks: off.} # No exceptions in a multithreading datastructure
import
std/atomics,
../instrumentation,
../../allocs,
./tasks_flowvars
type
Buf = object
## Backend buffer of a Taskqueue
## `capacity` MUST be a power of 2
prevRetired: ptr Buf # intrusive linked list. Used for garbage collection
capacity: int
rawBuffer: UncheckedArray[Atomic[ptr Task]] # ring buffer, indexed modulo capacity
Taskqueue* = object
## This implements a lock-free, growable, work-stealing task queue.
## The owning thread enqueues and dequeues at the back
## Foreign threads steal at the front.
##
## There is no memory reclamation scheme for simplicity
# front on its own cache line to avoid false sharing with the producer fields.
front {.align: 64.}: Atomic[int] # Consumers - steal/stealHalf
back: Atomic[int] # Producer - push/pop
buf: Atomic[ptr Buf]
garbage: ptr Buf # retired buffers, freed when the owner sees an empty queue
proc peek*(tq: var Taskqueue): int =
  ## Best-effort count of items currently pending in the queue.
  ##
  ## In this SPMC setting:
  ## - called by the producer, the true number may be lower, since
  ##   consumers remove items concurrently;
  ## - called by a consumer, the estimate is undefined, as both the
  ##   producer and other consumers mutate the queue concurrently.
  ##
  ## A producer that reads 0 can trust the Chase-Lev deque is empty.
  ##
  ## Non-locking: two relaxed atomic loads.
  let pending = tq.back.load(moRelaxed) - tq.front.load(moRelaxed)
  result = max(0, pending)
func isPowerOfTwo(n: int): bool {.used, inline.} =
  ## True iff `n` is non-zero and has a single bit set.
  (n != 0) and ((n and (n - 1)) == 0)
# Allocates a ring buffer for `capacity` task pointers (capacity must be 2^k).
proc newBuf(capacity: int): ptr Buf =
# Tasks have a destructor
# static:
# doAssert supportsCopyMem(T), $T & " must be a (POD) plain-old-data type: no seq, string, ref."
preCondition: capacity.isPowerOfTwo()
# NOTE(review): header sizing uses 1 pointer + 2 ints although Buf declares
# one pointer + one int before rawBuffer — presumably alignment slack; confirm.
result = allocHeapUnchecked(Buf, 1*sizeof(pointer) + 2*sizeof(int) + sizeof(pointer)*capacity)
result.prevRetired = nil
result.capacity = capacity
result.rawBuffer.addr.zeroMem(sizeof(pointer)*capacity)
# Ring-buffer store; index wraps via bitmask (capacity is a power of 2).
proc `[]=`(buf: var Buf, index: int, item: ptr Task) {.inline.} =
buf.rawBuffer[index and (buf.capacity-1)].store(item, moRelaxed)
# Ring-buffer load; index wraps via bitmask.
proc `[]`(buf: var Buf, index: int): ptr Task {.inline.} =
result = buf.rawBuffer[index and (buf.capacity-1)].load(moRelaxed)
proc grow(tq: var Taskqueue, buf: var ptr Buf, newCapacity, front, back: int) {.inline.} =
## Double the buffer size
## back is the last item index
##
## To handle race-conditions the current "front", "back" and "buf"
## have to be saved before calling this procedure.
## It reads and writes the "tq.buf" and "tq.garbage"
# Read -> Copy -> Update
var tmp = newBuf(newCapacity)
for i in front ..< back:
tmp[][i] = buf[][i]
# Retire the old buffer instead of freeing: thieves may still hold a pointer.
buf.prevRetired = tq.garbage
tq.garbage = buf
# publish globally
tq.buf.store(tmp, moRelaxed)
# publish locally
swap(buf, tmp)
# Frees all retired buffers. Only safe when no thief can still reference them
# (callers check for an empty queue first).
proc garbageCollect(tq: var Taskqueue) {.inline.} =
var node = tq.garbage
while node != nil:
let tmp = node.prevRetired
freeHeap(node)
node = tmp
tq.garbage = nil
# Public API
# ---------------------------------------------------
# Zero-initializes the queue and installs its first backing buffer.
proc init*(tq: var Taskqueue, initialCapacity: int) =
zeroMem(tq.addr, tq.sizeof())
tq.buf.store(newBuf(initialCapacity), moRelaxed)
# Releases retired buffers and the live buffer. Not thread-safe.
proc teardown*(tq: var Taskqueue) =
tq.garbageCollect()
freeHeap(tq.buf.load(moRelaxed))
proc push*(tq: var Taskqueue, item: ptr Task) =
## Enqueue an item at the back
## As the task queue takes ownership of it. The item must not be used afterwards.
## This is intended for the producer only.
let # Handle race conditions
b = tq.back.load(moRelaxed)
f = tq.front.load(moAcquire)
var buf = tq.buf.load(moRelaxed)
if b-f > buf.capacity - 1:
# Full queue
tq.grow(buf, buf.capacity*2, f, b)
if not tq.garbage.isNil and f == b:
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
buf[][b] = item
# Release fence: the task write above must be visible before back is bumped.
fence(moRelease)
tq.back.store(b+1, moRelaxed)
proc pop*(tq: var Taskqueue): ptr Task =
## Dequeue an item at the back. Takes ownership of the item
## This is intended for the producer only.
# Reserve the slot first, then check against front (Chase-Lev pop protocol).
let # Handle race conditions
b = tq.back.load(moRelaxed) - 1
buf = tq.buf.load(moRelaxed)
tq.back.store(b, moRelaxed)
fence(moSequentiallyConsistent)
var f = tq.front.load(moRelaxed)
if f <= b:
# Non-empty queue.
result = buf[][b]
if f == b:
# Single last element in queue.
# Race against thieves: the CAS decides who gets the final task.
if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed):
# Failed race.
result = nil
tq.back.store(b+1, moRelaxed)
if not tq.garbage.isNil:
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
else:
# Empty queue.
result = nil
tq.back.store(b+1, moRelaxed)
if not tq.garbage.isNil:
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
proc steal*(thiefID: uint32, tq: var Taskqueue): ptr Task =
## Dequeue an item at the front. Takes ownership of the item
## This is intended for consumers.
var f = tq.front.load(moAcquire)
fence(moSequentiallyConsistent)
let b = tq.back.load(moAcquire)
result = nil
if f < b:
# Non-empty queue.
let a = tq.buf.load(moConsume)
result = a[][f]
# The CAS commits the theft; losing it means another thief (or pop) won.
if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed):
# Failed race.
return nil
result.thiefID.store(thiefID, moRelease)
proc stealHalfImpl(dst: var Buf, dstBack: int, src: var Taskqueue): int =
## Theft part of stealHalf:
## - updates the victim metadata if successful
## - uncommitted updates to the thief tq whether successful or not
## Returns -1 if dst buffer is too small
## Assumes `dst` buffer is empty (i.e. not ahead-of-time thefts)
while true:
# Try as long as there are something to steal, we are idling anyway.
var f = src.front.load(moAcquire)
fence(moSequentiallyConsistent)
let b = src.back.load(moAcquire)
var n = b-f
n = n - (n shr 1) # Division by 2 rounded up, so if only one task is left, we still get it.
if n <= 0:
return 0
if n > dst.capacity:
return -1
# Non-empty queue.
let sBuf = src.buf.load(moConsume)
# Copy speculatively; the CAS below validates that [f, f+n) was still ours.
for i in 0 ..< n: # Copy LIFO or FIFO?
dst[dstBack+i] = sBuf[][f+i]
if compareExchange(src.front, f, f+n, moSequentiallyConsistent, moRelaxed):
return n
proc stealHalf*(thiefID: uint32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
## Dequeue up to half of the items in the `src` tq, fom the front.
## Return the last of those, or nil if none found
while true:
# Prepare for batch steal
let
bDst = dst.back.load(moRelaxed)
fDst = dst.front.load(moAcquire)
var dBuf = dst.buf.load(moAcquire)
let sBuf = src.buf.load(moAcquire)
if dBuf.capacity < sBuf.capacity:
# We could grow to sBuf/2 since we steal half, but we want to minimize
# churn if we are actually in the process of stealing and the buffers grows.
dst.grow(dBuf, sBuf.capacity, fDst, bDst)
# Steal
let n = dBuf[].stealHalfImpl(bDst, src)
if n == 0:
return nil
if n == -1:
# Oops, victim buffer grew bigger than ours, restart the whole process
continue
# Update metadata
for i in 0 ..< n:
dBuf[][bDst+i].thiefID.store(thiefID, moRelease)
# Commit/publish theft; `last` (the item at the back of the stolen range)
# is handed to the caller for immediate processing.
let last = dBuf[][bDst+n-1]
fence(moSequentiallyConsistent)
if n == 1:
return last
# We have more than one item, so some must go in our queue
# they are already here but inaccessible for other thieves
dst.back.store(bDst+n-1, moRelease) # We assume that queue was empty and so dst.front didn't change
return last

View File

@ -0,0 +1,127 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/atomics,
../instrumentation,
../../allocs,
./backoff
# Tasks have an efficient design so that a single heap allocation
# is required per `spawn`.
# This greatly reduce overhead and potential memory fragmentation for long-running applications.
#
# This is done by tasks:
# - being an intrusive linked lists
# - integrating the channel to send results
#
# Flowvar is the public type created when spawning a task.
# and can be synced to receive the task result.
# Flowvars are also called future interchangeably.
# (The name future is already used for IO scheduling)
type
  Task* = object
    ## A unit of work, allocated once on the heap per `spawn`.
    ## The trailing `data` buffer stores the (type-erased) arguments,
    ## and — for tasks with a future — a self-reference and the result slot.

    # Intrusive metadata
    # ------------------
    parent*: ptr Task        # When a task is awaiting, a thread can quickly prioritize the direct child of a task
    thiefID*: Atomic[uint32] # ID of the worker that stole and run the task. For leapfrogging.

    # Result sync
    # ------------------
    hasFuture*: bool         # Ownership: if a task has a future, the future deallocates it. Otherwise the worker thread does.
    completed*: Atomic[bool] # Set with release semantics once the result has been written (see `readyWith`).
    waiter*: Atomic[ptr EventNotifier] # Presumably used to wake a thread blocked on this task (see ./backoff) — TODO confirm.

    # Execution
    # ------------------
    fn*: proc (param: pointer) {.nimcall, gcsafe.} # Type-erased task body; receives `data` as `param`.
    # destroy*: proc (param: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
    data*{.align:sizeof(int).}: UncheckedArray[byte] # Inline payload: | ptr Task | result | arg₀ | ... | for futures.

  Flowvar*[T] = object
    ## Public handle to the eventual result of a spawned task (a future).
    task: ptr Task

const SentinelThief* = 0xFACADE'u32
  ## `thiefID` value meaning "not stolen yet".
proc new*(
       T: typedesc[Task],
       parent: ptr Task,
       fn: proc (param: pointer) {.nimcall, gcsafe.}): ptr Task {.inline.} =
  ## Allocate a parameterless task in a single heap allocation.
  ## `thiefID` starts at SentinelThief (not stolen), `completed` at false.
  const size = sizeof(T)

  result = allocHeapUnchecked(T, size)
  result.parent = parent
  result.thiefID.store(SentinelThief, moRelaxed)
  result.completed.store(false, moRelaxed)
  result.waiter.store(nil, moRelaxed)
  result.fn = fn
proc new*(
       T: typedesc[Task],
       parent: ptr Task,
       fn: proc (param: pointer) {.nimcall, gcsafe.},
       params: auto): ptr Task {.inline.} =
  ## Allocate a task together with inline storage for `params`
  ## in a single heap allocation; `params` is copied into the
  ## trailing `data` buffer.
  const size = sizeof(T) + # size without Unchecked
               sizeof(params)

  result = allocHeapUnchecked(T, size)
  result.parent = parent
  result.thiefID.store(SentinelThief, moRelaxed)
  result.completed.store(false, moRelaxed)
  result.waiter.store(nil, moRelaxed)
  result.fn = fn
  # Copy the argument tuple into the inline payload.
  cast[ptr[type params]](result.data)[] = params
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} =
  ## Create the future associated with `task`.
  ## The future takes over responsibility for deallocating the task
  ## (see `Task.hasFuture`).
  result.task = task
  result.task.hasFuture = true

  # Tasks with a future store a self-reference at the start of `data`
  # so that `readyWith` can be called from within the constructed
  # `proc async_fn(param: pointer) {.nimcall.}`,
  # which can only access `data`.
  cast[ptr ptr Task](task.data.addr)[] = task
proc cleanup*(fv: var Flowvar) {.inline.} =
  ## Free the task owned by this future and reset the handle,
  ## so `isSpawned` returns false afterwards.
  fv.task.freeHeap()
  fv.task = nil
func isSpawned*(fv: Flowvar): bool {.inline.} =
  ## True when this Flowvar is backed by a spawned task.
  ## Recursive algorithms may or may not spawn depending on a condition;
  ## an un-spawned Flowvar then behaves like an empty Option/Maybe.
  fv.task != nil
func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
  ## Returns true if the result of a Flowvar is ready.
  ## In that case `sync` will not block.
  ## Otherwise the current thread will block to help on all the pending tasks
  ## until the Flowvar is ready.
  # Acquire pairs with the release store in `readyWith`.
  fv.task.completed.load(moAcquire)
func readyWith*[T](task: ptr Task, childResult: T) {.inline.} =
  ## Send the Flowvar result from the child thread processing the task
  ## to its parent thread.
  precondition: not task.completed.load(moAcquire)
  # `data` layout is | ptr Task | result | args... |; write the result slot.
  cast[ptr (ptr Task, T)](task.data.addr)[1] = childResult
  # Release store: the result write above must be visible to any thread
  # that observes `completed == true` (acquire load in `isReady`/sync).
  task.completed.store(true, moRelease)
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
  ## Blocks the current thread until the flowvar is available
  ## and returned.
  ## The thread is not idle and will complete pending tasks.
  # `completeFuture` is provided by the threadpool module importing this file.
  mixin completeFuture
  completeFuture(fv, result)
  cleanup(fv)

View File

@ -0,0 +1,166 @@
# Partitioners
For data parallelism (parallel for loops) there are 2 main scheduling strategies:
- static scheduling, when work is regular (for example adding 2 matrices).
In that case, splitting the loop
into same-sized chunk provides perfect speedup, with no synchronization overhead.
(Assuming threads have the same performance and no parasite load)
- dynamic scheduling, when work is irregular (for example zero-ing buffers of different length).
Partitioners help implementing static scheduling.
Static scheduling
=================
Usually static scheduling is problematic because the threshold below which running in parallel is not worthwhile
is hardware-, data-layout- and function-dependent, see https://github.com/zy97140/omp-benchmark-for-pytorch
| CPU Model | Sockets | Cores/Socket | Frequency |
|------------------------------------|---------|--------------|-----------|
| Intel(R) Xeon(R) CPU E5-2699 v4 | 2 | 22 | 2.20GHz |
| Intel(R) Xeon(R) Platinum 8180 CPU | 2 | 28 | 2.50GHz |
| Intel(R) Core(TM) i7-5960X CPU | 1 | 8 | 3.00GHz |
| contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU |
|---------------|---------------------------|------------------------|--------------|
| copy | 80k | 20k | 8k |
| add | 80k | 20k | 8k |
| div | 50k | 10k | 2k |
| exp | 1k | 1k | 1k |
| sin | 1k | 1k | 1k |
| sum | 1k | 1k | 1k |
| prod | 1k | 1k | 1k |
| non-contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU |
|-------------------|---------------------------|------------------------|--------------|
| copy | 20k | 8k | 2k |
| add | 20k | 8k | 2k |
| div | 10k | 8k | 1k |
| exp | 1k | 1k | 1k |
| sin | 2k | 2k | 1k |
| sum | 1k | 1k | 1k |
| prod | 1k | 1k | 1k |
Static partitioner
==================
```Nim
iterator balancedChunks*(start, stopEx, numChunks: int): tuple[chunkID, start, stopEx: int] =
## Balanced chunking algorithm for a range [start, stopEx)
## This splits a range into min(stopEx-start, numChunks) balanced regions
## and returns a tuple (chunkID, offset, length)
# Rationale
# The following simple chunking scheme can lead to severe load imbalance
#
# let chunk_offset = chunk_size * thread_id
# let chunk_size = if thread_id < nb_chunks - 1: chunk_size
# else: omp_size - chunk_offset
#
# For example dividing 40 items on 12 threads will lead to
# a base_chunk_size of 40/12 = 3 so work on the first 11 threads
# will be 3 * 11 = 33, and the remainder 7 on the last thread.
#
# Instead of dividing 40 work items on 12 cores into:
# 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
# the following scheme will divide into
# 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
#
# This is compliant with OpenMP spec (page 60)
# http://www.openmp.org/mp-documents/openmp-4.5.pdf
# "When no chunk_size is specified, the iteration space is divided into chunks
# that are approximately equal in size, and at most one chunk is distributed to
# each thread. The size of the chunks is unspecified in this case."
# ---> chunks are the same ±1
let
totalIters = stopEx - start
baseChunkSize = totalIters div numChunks
cutoff = totalIters mod numChunks
for chunkID in 0 ..< min(numChunks, totalIters):
if chunkID < cutoff:
let offset = start + ((baseChunkSize + 1) * chunkID)
let chunkSize = baseChunkSize + 1
yield (chunkID, offset, offset+chunkSize)
else:
let offset = start + (baseChunkSize * chunkID) + cutoff
let chunkSize = baseChunkSize
yield (chunkID, offset, offset+chunkSize)
when isMainModule:
for chunk in balancedChunks(start = 0, stopEx = 40, numChunks = 12):
echo chunk
```
Dynamic scheduling
==================
Dynamic schedulers decide at runtime whether to split a range for processing by multiple threads at runtime.
Unfortunately most (all?) of those in production do not or cannot take into account
the actual functions being called within a `parallel_for` section,
and might split into too fine-grained chunks or into too coarse-grained chunks.
Alternatively they might ask the programmer for a threshold below which not to split.
As the programmer has no way to know if their code will run on a Raspberry Pi or a powerful workstation, that choice cannot be optimal.
Recent advances in research like "Lazy Binary Splitting" and "Lazy Tree Splitting"
allows dynamic scheduling to fully adapt to the system current load and
the parallel section computational needs without programmer input (grain size or minimal split threshold)
by using backpressure.
Why do we need partitioners for cryptographic workload then?
============================================================
Here are 3 basic cryptographic primitives that are non-trivial to parallelize:
1. batch elliptic-curve addition
2. multi-scalar multiplication (MSM)
3. batch signature verification via multi-pairing
On the other hand, merkle tree hashing for example is a primitive where both static or dynamic scheduling should give perfect speedup.
Let's take the example of batch EC addition.
--------------------------------------------
There is a naive way, via doing a parallel EC sum reduction,
for example for 1M points,
using projective coordinates, each sum costs 12M (field multiplication)
At first level we have 500k sums, then 250k sums, then 125k, ...
The number of sums is ~1M, for a total cost of 12e⁶
```
0 1 2 3 4 5 6 7
+ + + +
+ +
+
```
The fast way uses affine sum for an asymptotic cost of 6M, so 2x faster.
but this requires an inversion, a fixed cost of ~131M (256-bit) to ~96M (384-bit)
whatever number of additions we accumulate
That inversion needs to be amortized into at least 20-25 additions.
Let's take a look at multi-pairing
----------------------------------
Multi-pairing is split into the following steps:
- 1. the Miller-Loop which is embarassing parallel, each thread can compute it on their own.
- 2. reducing the n Miller-Loops into a single 𝔽pₖ point using parallel 𝔽pₖ sum reduction.
- 3. Computing the final exponentiation, a fixed cost whatever the number of Miller Loops we did.
3alt. Alternatively, computing n final exponentiations, and merging them with a 𝔽pₖ product reduction
in that case, step 2 is not needed.
Conclusion
----------
Dynamic scheduling for reduction with variable + fixed costs (independent of chunk size) is tricky.
Furthermore the computations are regular, same workload per range and static scheduling seems like a great fit.
The downside is if a core has a parasite workload or on architecture like ARM big.Little or Alder Lake
with performance and power-saving cores.
Alternatively, we can add dynamic scheduling hints about min and max chunk size so that
chunk size is kept within the optimal range whatever the number of idle threads.

View File

@ -0,0 +1,42 @@
# Random permutations
Work-stealing is more efficient when the thread we steal from is randomized.
If all threads steal in the same order, we increase contention
on the task queues of the first victims.
The randomness quality is not important besides distributing potential contention,
i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
Hence for efficiency, so that a thread can go to sleep faster, we want to
reduce calls to the RNG, because:
- Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement)
- a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls.
- If we want unbiased thread ID generation in a range, rejection sampling is costly (not a requirement).
Instead of using Fisher-Yates
- generates the victim set eagerly, inefficient if the first steal attempts are successful
- needs a RNG call when sampling a victim
- memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32
or a sparseset
- 1 RNG call when sampling a victim
- memory usage: 2\*numThreads per thread so 2\*numthreads² uint8 (255 threads max) or uint32
we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m)
If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m)
with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m
References:
- Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
- Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484
https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim
https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim
- Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
And if we want cryptographic strength:
- Ciphers with Arbitrary Finite Domains
John Black and Phillip Rogaway, 2001
https://eprint.iacr.org/2001/012
- An Enciphering Scheme Based on a Card Shuffle
Viet Tung Hoang, Ben Morris, Phillip Rogaway
https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf

View File

@ -0,0 +1,48 @@
import ../threadpool
block: # Async without result
  # Sanity check: spawning void tasks and shutting the pool down.
  proc displayInt(x: int) =
    stdout.write(x)
    stdout.write(" - SUCCESS\n")

  proc main() =
    echo "\n=============================================================================================="
    echo "Running 'threadpool/examples/e01_simple_tasks.nim'"
    echo "=============================================================================================="
    echo "\nSanity check 1: Printing 123456 654321 in parallel"

    var tp = Threadpool.new(numThreads = 4)
    tp.spawn displayInt(123456)
    tp.spawn displayInt(654321)
    # NOTE(review): shutdown presumably drains pending tasks before
    # tearing the pool down — see the threadpool module.
    tp.shutdown()

  main()
block: # Async/Await
  # Sanity check: nested spawn/sync (fork-join) computing Fibonacci.
  var tp: Threadpool  # module-scope so asyncFib can respawn on the same pool

  proc asyncFib(n: int): int =
    if n < 2:
      return n

    # Fork one branch on the threadpool, compute the other inline.
    let x = tp.spawn asyncFib(n-1)
    let y = asyncFib(n-2)

    result = sync(x) + y

  proc main2() =
    echo "\n=============================================================================================="
    echo "Running 'threadpool/examples/e01_simple_tasks.nim'"
    echo "=============================================================================================="
    echo "\nSanity check 2: fib(20)"

    tp = Threadpool.new()
    let f = asyncFib(20)
    tp.shutdown()

    doAssert f == 6765, "f was " & $f

  main2()

View File

@ -0,0 +1,38 @@
# Demo of the API using a very inefficient π approximation algorithm.
import
std/[strutils, math, cpuinfo],
../threadpool
# From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim
# Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
proc term(k: int): float =
  ## k-th term of the Leibniz series for π: 4·(-1)ᵏ / (2k+1).
  let sign = if k mod 2 == 1: -1.0 else: 1.0
  sign * 4.0 / float(2*k + 1)
proc piApprox(tp: Threadpool, n: int): float =
  ## Approximate π by summing `n` terms of the Leibniz series,
  ## evaluating each term as a separate task on `tp`.
  var pendingFuts = newSeq[Flowvar[float]](n)
  for k in 0 ..< pendingFuts.len:
    pendingFuts[k] = tp.spawn term(k) # Schedule a task on the threadpool and return a handle to retrieve the result.
  for k in 0 ..< pendingFuts.len:
    result += sync pendingFuts[k]     # Block until the result is available.
proc main() =
  ## Entry point: approximate π with 1M Leibniz terms on all hardware threads.
  echo "\n=============================================================================================="
  echo "Running 'threadpool/examples/e02_parallel_pi.nim'"
  echo "=============================================================================================="

  var n = 1_000_000
  var nthreads = countProcessors()

  var tp = Threadpool.new(num_threads = nthreads) # Default to the number of hardware threads.

  echo formatFloat(tp.piApprox(n))

  tp.shutdown()

# Compile with nim c -r -d:release --threads:on --outdir:build example.nim
main()

View File

@ -0,0 +1,142 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import system/ansi_c
# Loggers
# --------------------------------------------------------
template log*(args: varargs[untyped]): untyped =
  ## printf-style logging, flushed immediately so that output from
  ## multiple threads stays visible and ordered as emitted.
  c_printf(args)
  flushFile(stdout)
template debugTermination*(body: untyped): untyped =
  ## Run `body` only when compiled with -d:TP_DebugTermination or -d:TP_Debug;
  ## compiles to nothing otherwise.
  when defined(TP_DebugTermination) or defined(TP_Debug):
    {.noSideEffect, gcsafe.}: body
template debug*(body: untyped): untyped =
  ## Run `body` only when compiled with -d:TP_Debug;
  ## compiles to nothing otherwise.
  when defined(TP_Debug):
    {.noSideEffect, gcsafe.}: body
# --------------------------------------------------------
import std/macros
# A simple design-by-contract API
# --------------------------------------------------------
# Everything should be a template that doesn't produce any code
# when debugConstantine is not defined.
# Those checks are controlled by a custom flag instead of
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
# Furthermore, we want them to be very lightweight on performance
func toHex*(a: SomeInteger): string =
  ## Hex representation of `a` with a "0x" prefix,
  ## zero-padded to 2*sizeof(a) nibbles (e.g. 0x1234'u16 -> "0x1234").
  const hexChars = "0123456789abcdef"
  const L = 2*sizeof(a)
  result = newString(2 + L)
  result[0] = '0'
  result[1] = 'x'
  var a = a
  # Fill from the least-significant nibble; stop at index 2 so the
  # "0x" prefix written above is not overwritten.
  for j in countdown(result.len-1, 2):
    result[j] = hexChars[a and 0xF]
    a = a shr 4
proc inspectInfix(node: NimNode): NimNode =
  ## Inspect an expression,
  ## Returns the AST as string with runtime values inlined
  ## from infix operators inlined.
  # TODO: pointer and custom type need a default repr
  # otherwise we can only resolve simple expressions
  proc inspect(node: NimNode): NimNode =
    # Recursively rebuild the expression as a string-concatenation AST.
    case node.kind:
    of nnkInfix:
      # Render "lhs op rhs" with both operands stringified at runtime.
      return newCall(
        bindSym"&",
        newCall(
          bindSym"&",
          newCall(ident"$", inspect(node[1])),
          newLit(" " & $node[0] & " ")
        ),
        newCall(ident"$", inspect(node[2]))
      )
    of {nnkIdent, nnkSym}:
      return node
    of nnkDotExpr:
      # Pointers and procs have no `$`: print the low 32 bits of their address.
      return quote do:
        when `node` is pointer or
             `node` is ptr or
             `node` is (proc):
          toHex(cast[ByteAddress](`node`) and 0xffff_ffff)
        else:
          $(`node`)
    of nnkPar:
      # Tuple constructor: inspect each element.
      result = nnkPar.newTree()
      for sub in node:
        result.add inspect(sub)
    else:
      # Anything else is rendered as its source text.
      return node.toStrLit()
  return inspect(node)
macro assertContract(
        checkName: static string,
        predicate: untyped) =
  ## Build an assertion for `predicate` that, on failure, reports the
  ## source location, the predicate source text, the runtime values of
  ## its operands (via `inspectInfix`) and the worker/threadpool identity.
  ## Active when assertions are on, or with -d:TP_Asserts.
  let lineinfo = lineInfoObj(predicate)

  var strippedPredicate: NimNode
  if predicate.kind == nnkStmtList:
    assert predicate.len == 1, "Only one-liner conditions are supported"
    strippedPredicate = predicate[0]
  else:
    strippedPredicate = predicate

  let debug = "\n Contract violated for " & checkName & " at " & $lineinfo &
              "\n " & $strippedPredicate.toStrLit &
              "\n The following values are contrary to expectations:" &
              "\n "
  let values = inspectInfix(strippedPredicate)
  # Worker/threadpool identity, resolved lazily at the call site:
  # only meaningful when a `workerContext` is in scope there.
  let workerID = quote do:
    when declared(workerContext):
      $workerContext.id
    else:
      "N/A"
  let threadpoolID = quote do:
    when declared(workerContext):
      cast[ByteAddress](workerContext.threadpool).toHex()
    else:
      "N/A"

  result = quote do:
    {.noSideEffect.}:
      when compileOption("assertions"):
        assert(`predicate`, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n")
      elif defined(TP_Asserts):
        if unlikely(not(`predicate`)):
          raise newException(AssertionError, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n")
template preCondition*(require: untyped) =
  ## Optional runtime check at the start of a function
  assertContract("pre-condition", require)
template postCondition*(ensure: untyped) =
  ## Optional runtime check before returning from a function
  assertContract("post-condition", ensure)
template ascertain*(check: untyped) =
  ## Optional runtime check in the middle of processing
  assertContract("transient condition", check)
# Sanity checks
# ----------------------------------------------------------------------------------
when isMainModule:
  proc assertGreater(x, y: int) =
    # Deliberately violated post-condition (10 > 12 is false)
    # to exercise the formatted error message.
    postcondition(x > y)

  # We should get a nicely formatted exception
  assertGreater(10, 12)

View File

@ -0,0 +1,160 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/macros,
./instrumentation,
./crossthread/tasks_flowvars
# Task parallelism - spawn
# ---------------------------------------------
proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
  ## Generate the code that spawns a call to a proc with no return value:
  ## a type-erased wrapper `async_<fn>` plus task creation and scheduling.
  ## `args`/`argsTy` are the call arguments and their types as ad-hoc tuples.
  # Create the async function
  let fn = funcCall[0]
  let fnName = $fn
  let withArgs = args.len > 0
  let async_fn = ident("async_" & fnName)
  var fnCall = newCall(fn)
  let data = ident("data")   # typed pointer to data

  # Schedule
  let task = ident"task"
  let scheduleBlock = newCall(schedule, workerContext, task)

  result = newStmtList()

  if funcCall.len == 2:
    # With only 1 arg, the tuple syntax doesn't construct a tuple
    # let data = (123) # is an int
    fnCall.add nnkDerefExpr.newTree(data)
  else: # This handles the 0 arg case as well
    # Arguments are stored as a tuple in `data`, indexed from 0.
    for i in 1 ..< funcCall.len:
      fnCall.add nnkBracketExpr.newTree(
        data,
        newLit i-1
      )

  # Create the async call: a nimcall wrapper that unpacks `param` and
  # forwards to the original proc.
  result.add quote do:
    proc `async_fn`(param: pointer) {.nimcall.} =
      # preCondition: not isRootTask(`workerContext`.currentTask)
      when bool(`withArgs`):
        let `data` = cast[ptr `argsTy`](param)
      `fnCall`

  # Create the task (single heap allocation, args copied inline)
  result.add quote do:
    block enq_deq_task:
      when bool(`withArgs`):
        let `task` = Task.new(
          parent = `workerContext`.currentTask,
          fn = `async_fn`,
          params = `args`)
      else:
        let `task` = Task.new(
          parent = `workerContext`.currentTask,
          fn = `async_fn`)
      `scheduleBlock`
proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
  ## Generate the code that spawns a call to a proc with a return value:
  ## a type-erased wrapper `async_<fn>`, task creation, scheduling, and a
  ## Flowvar[retTy] as the resulting expression.
  # Create the async function
  let fn = funcCall[0]
  let fnName = $fn
  let async_fn = ident("async_" & fnName)
  var fnCall = newCall(fn)
  let data = ident("data")   # typed pointer to data

  # Schedule
  let task = ident"task"
  let scheduleBlock = newCall(schedule, workerContext, task)

  result = newStmtList()

  # tasks have no return value.
  # 1. The start of the task `data` buffer will store the return value for the flowvar and awaiter/sync
  # 2. We create a wrapper async_fn without return value that sends the return value through `readyWith`
  # 3. We package that wrapper function in a task

  # We store the following in task.data:
  #
  # | ptr Task | result | arg₀ | arg₁ | ... | argₙ
  let fut = ident"fut"
  let taskSelfReference = ident"taskSelfReference"
  let retVal = ident"retVal"

  # Build the tuple stored in `data` and its type, in the layout above.
  var futArgs = nnkPar.newTree
  var futArgsTy = nnkPar.newTree
  futArgs.add taskSelfReference
  futArgsTy.add nnkPtrTy.newTree(bindSym"Task")
  futArgs.add retVal
  futArgsTy.add retTy

  for i in 1 ..< funcCall.len:
    futArgsTy.add getTypeInst(funcCall[i])
    futArgs.add funcCall[i]

  # data stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
  # so arguments starts at data[2] in the wrapping funcCall functions
  for i in 1 ..< funcCall.len:
    fnCall.add nnkBracketExpr.newTree(
      data,
      newLit i+1
    )

  # Wrapper: run the call, then publish the result via the task
  # self-reference stored at data[0].
  result.add quote do:
    proc `async_fn`(param: pointer) {.nimcall.} =
      # preCondition: not isRootTask(`workerContext`.currentTask)
      let `data` = cast[ptr `futArgsTy`](param)
      let res = `fnCall`
      readyWith(`data`[0], res)

  # Regenerate fresh ident, retTy has been tagged as a function call param
  let retTy = ident($retTy)

  # Create the task. The self-reference placeholder (0xDEADBEEF) is
  # overwritten with the real task pointer by `newFlowvar`.
  result.add quote do:
    block enq_deq_task:
      let `taskSelfReference` = cast[ptr Task](0xDEADBEEF)
      let `retVal` = default(`retTy`)

      let `task` = Task.new(
        parent = `workerContext`.currentTask,
        fn = `async_fn`,
        params = `futArgs`)
      let `fut` = newFlowvar(`retTy`, `task`)
      `scheduleBlock`
      # Return the future
      `fut`
proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule: NimNode): NimNode =
  ## Shared implementation of the `spawn` macro: packages `funcCall`
  ## into a heap-allocated task and emits the scheduling code.
  ## Evaluates to a Flowvar when the spawned proc has a return type,
  ## to nothing otherwise.
  ## Note: `tp` is not referenced in this body; dispatch happens via
  ## the `workerContext`/`schedule` symbols passed by the caller.
  funcCall.expectKind(nnkCall)

  # Get the return type if any
  let retType = funcCall[0].getImpl[3][0]
  let needFuture = retType.kind != nnkEmpty

  # Get a serialized type and data for all function arguments
  # We use adhoc tuple
  var argsTy = nnkPar.newTree()
  var args = nnkPar.newTree()
  for i in 1 ..< funcCall.len:
    argsTy.add getTypeInst(funcCall[i])
    args.add funcCall[i]

  # Package in a task
  if not needFuture:
    result = spawnVoid(funcCall, args, argsTy, workerContext, schedule)
  else:
    result = spawnRet(funcCall, retType, args, argsTy, workerContext, schedule)

  # Wrap in a block for namespacing
  result = nnkBlockStmt.newTree(newEmptyNode(), result)
  # echo result.toStrLit

View File

@ -0,0 +1,53 @@
# Synchronization Barriers
OSX does not implement pthread_barrier as it is an optional part
of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch.
So we need to roll our own with a POSIX compatible API.
## Glibc barriers, design bug and implementation
> Note: due to GPL licensing, do not lift the code.
> Not that we could anyway, as it is heavily dependent on futexes
> which are not available on OSX
We need to make sure that we don't hit the same bug
as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065
which seems to be an issue in some of the barrier implementations
in the wild.
The design of Glibc barriers is here:
https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD
And implementation:
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD`
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD
## Synchronization barrier techniques
This article goes over the techniques of
"pool barrier" and "ticket barrier"
https://locklessinc.com/articles/barriers/
to reach 2x to 20x the speed of pthreads barrier
This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf
goes over
- centralized barrier with sense reversal
- combining tree barrier
- dissemination barrier
- tournament barrier
- scalable tree barrier
More courses:
- http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf
It however requires lightweight mutexes like Linux futexes
that OSX lacks.
This post goes over lightweight mutexes like Benaphores (from BeOS)
https://preshing.com/20120226/roll-your-own-lightweight-mutex/
This gives a few barrier implementations
http://gallium.inria.fr/~maranget/MPRI/02.pdf
and refers to Cubible paper for formally verifying synchronization barriers
http://cubicle.lri.fr/papers/jfla2014.pdf (in French)

View File

@ -0,0 +1,71 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises: [], checks: off.} # No exceptions or overflow/conversion checks
when defined(windows):
  import ./barriers_windows
  when compileOption("assertions"):
    import os

  type SyncBarrier* = SynchronizationBarrier

  proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} =
    ## Initialize a synchronization barrier that will block ``threadCount`` threads
    ## before release.
    let err {.used.} = InitializeSynchronizationBarrier(syncBarrier, cast[int32](threadCount), -1)
    when compileOption("assertions"):
      # InitializeSynchronizationBarrier returns non-zero on success, 0 on failure.
      # NOTE(review): `assert err == 0` on the `err != 1` path looks inverted —
      # it would also fire for success values other than exactly 1. Confirm intent.
      if err != 1:
        assert err == 0
        raiseOSError(osLastError())

  proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
    ## Blocks thread at a synchronization barrier.
    ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
    ## and false for the others.
    result = bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE)

  proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
    ## Deletes a synchronization barrier.
    ## This assumes no race between waiting at a barrier and deleting it,
    ## and reuse of the barrier requires initialization.
    DeleteSynchronizationBarrier(syncBarrier.addr)
else:
  import ./barriers_posix
  when compileOption("assertions"):
    import os

  type SyncBarrier* = PthreadBarrier

  proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} =
    ## Initialize a synchronization barrier that will block ``threadCount`` threads
    ## before release.
    let err {.used.} = pthread_barrier_init(syncBarrier, nil, threadCount)
    when compileOption("assertions"):
      if err != 0:
        raiseOSError(OSErrorCode(err))

  proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
    ## Blocks thread at a synchronization barrier.
    ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
    ## and false for the others.
    let err {.used.} = pthread_barrier_wait(syncBarrier)
    when compileOption("assertions"):
      # NOTE(review): pthread error codes are positive, so `err < 0` may
      # never trigger on some platforms (glibc defines
      # PTHREAD_BARRIER_SERIAL_THREAD as -1, the macOS shim uses 1) — verify.
      if err != PTHREAD_BARRIER_SERIAL_THREAD and err < 0:
        raiseOSError(OSErrorCode(err))
    result = if err == PTHREAD_BARRIER_SERIAL_THREAD: true
             else: false

  proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
    ## Deletes a synchronization barrier.
    ## This assumes no race between waiting at a barrier and deleting it,
    ## and reuse of the barrier requires initialization.
    let err {.used.} = pthread_barrier_destroy(syncBarrier)
    when compileOption("assertions"):
      if err < 0:
        raiseOSError(OSErrorCode(err))

View File

@ -0,0 +1,84 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# OSX doesn't implement pthread_barrier_t
# It's an optional part of the POSIX standard
#
# This is a manual implementation of a sense reversing barrier
import std/locks
type
  Errno* = cint

  PthreadBarrierAttr* = object
    ## Dummy (attributes are ignored by this shim)
  PthreadBarrier* = object
    ## Implementation of a sense reversing barrier
    ## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit)
    lock: Lock                  # Alternatively spinlock on Atomic
    cond {.guard: lock.}: Cond
    sense {.guard: lock.}: bool # Phase flag, flipped each time the barrier opens. Choose int32 to avoid zero-expansion cost in registers?
    left {.guard: lock.}: cuint # Number of threads missing at the barrier before opening
    count: cuint                # Total number of threads that need to arrive before opening the barrier

const
  PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1)
    ## Returned to exactly one waiter when the barrier opens.
proc pthread_cond_broadcast(cond: var Cond): Errno {.header:"<pthread.h>".}
  ## Nim's std/locks `signal` only wakes one thread;
  ## we need to unblock all waiters.

proc broadcast(cond: var Cond) {.inline.}=
  ## Wake every thread waiting on `cond` (the error code is ignored).
  discard pthread_cond_broadcast(cond)
func pthread_barrier_init*(
       barrier: var PthreadBarrier,
       attr: ptr PthreadBarrierAttr,
       count: cuint
     ): Errno =
  ## Initialize `barrier` to open after `count` arrivals. `attr` is ignored.
  ## Always returns 0 (the implicit `result` is zero-initialized).
  barrier.lock.initLock()
  {.locks: [barrier.lock].}:
    barrier.cond.initCond()
    barrier.left = count
    barrier.count = count
    # barrier.sense = false  (already zero-initialized)
proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno =
  ## Wait on `barrier`
  ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
  ## (the last arriver)
  ## Returns 0 for the others
  ## Returns Errno if there is an error
  barrier.lock.acquire()
  {.locks: [barrier.lock].}:
    var local_sense = barrier.sense # Thread local sense, snapshot of the current phase
    dec barrier.left

    if barrier.left == 0:
      # Last thread to arrive at the barrier
      # Reverse phase and release it
      barrier.left = barrier.count
      barrier.sense = not barrier.sense
      barrier.cond.broadcast()
      barrier.lock.release()
      return PTHREAD_BARRIER_SERIAL_THREAD

    while barrier.sense == local_sense:
      # We are waiting for threads
      # Wait for the sense to reverse
      # while loop because we might have spurious wakeups
      barrier.cond.wait(barrier.lock)

    # Reversed, we can leave the barrier
    barrier.lock.release()
    return Errno(0)
proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno =
  ## Destroy the condition variable and lock owned by `barrier`.
  ## Must not be called while threads are still waiting on it.
  ## Returns 0 via the implicit zero-initialized `result`.
  {.locks: [barrier.lock].}:
    barrier.cond.deinitCond()
  barrier.lock.deinitLock()

View File

@ -0,0 +1,56 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Abstractions over POSIX barriers (non-)implementations
#
# macOS does not ship pthread barriers (they are an optional part of POSIX),
# so we dispatch to a hand-rolled sense-reversing barrier there and to the
# native pthread implementation everywhere else.

when not compileOption("threads"):
  {.error: "This requires --threads:on compilation flag".}

# Types
# -------------------------------------------------------

when defined(osx):
  import ./barriers_macos
  export PthreadBarrierAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD
else:
  type
    PthreadBarrierAttr* {.importc: "pthread_barrierattr_t", header: "<sys/types.h>", byref.} = object
      # Size the ABI padding only where we know the glibc layout.
      when (defined(linux) and not defined(android)) and defined(amd64):
        abi: array[4 div sizeof(cint), cint] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l45
    PthreadBarrier* {.importc: "pthread_barrier_t", header: "<sys/types.h>", byref.} = object
      when (defined(linux) and not defined(android)) and defined(amd64):
        abi: array[32 div sizeof(clong), clong] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l28

    Errno* = cint

  var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno

# Pthread
# -------------------------------------------------------

when defined(osx):
  export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy
else:
  proc pthread_barrier_init*(
         barrier: PthreadBarrier,
         attr: ptr PthreadBarrierAttr,
         count: cuint
       ): Errno {.header: "<pthread.h>".}
    ## Initialize `barrier` with the attributes `attr`.
    ## The barrier is opened when `count` waiters arrived.
    # NOTE(review): the parameter is not `var` — this relies on the `byref`
    # pragma on PthreadBarrier to pass a pointer to C; confirm on all targets.

  proc pthread_barrier_destroy*(
         barrier: sink PthreadBarrier): Errno {.header: "<pthread.h>".}
    ## Destroy a previously dynamically initialized `barrier`.

  proc pthread_barrier_wait*(
         barrier: var PthreadBarrier
       ): Errno {.header: "<pthread.h>".}
    ## Wait on `barrier`
    ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
    ## Returns 0 for the other
    ## Returns Errno if there is an error

View File

@ -0,0 +1,31 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import winlean

# Wrapper around the Windows 8+ synchronization barrier API.
# Technically in <synchapi.h> but MSVC complains with
#   @m..@s..@sweave@sscheduler.nim.cpp
#   C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture"
# so we include <windows.h> instead.

type
  SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"<windows.h>".} = object

var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "<windows.h>".}: DWORD
  ## Skip expensive checks on barrier enter if a barrier is never deleted.

proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "<windows.h>".}
  ## Block until all threads registered at init have entered the barrier.
proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "<windows.h>".}
  ## Release resources owned by the barrier.
proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "<windows.h>".}
  ## Initialize a barrier for `lTotalThreads` threads.
  ## `lSpinCount = -1` requests the default spin count.
  ## Returns a nonzero WINBOOL on success.

when isMainModule:
  # Smoke test: initialization must succeed (nonzero return).
  import os

  var x{.noinit.}: SynchronizationBarrier
  let err = InitializeSynchronizationBarrier(x, 2, -1)
  if err != 1:
    assert err == 0 # WINBOOL failure is exactly 0; anything else is unexpected
    raiseOSError(osLastError())

View File

@ -0,0 +1,18 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Platform dispatch: re-export the futex implementation
# matching the target OS so callers only `import ./futexes`.
when defined(linux):
  import ./futexes_linux
  export futexes_linux
elif defined(windows):
  import ./futexes_windows
  export futexes_windows
elif defined(osx):
  import ./futexes_macos
  export futexes_macos
else:
  {.error: "Futexes are not implemented for your OS".}

View File

@ -0,0 +1,68 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# A wrapper for linux futex.
# Condition variables do not always wake on signal which can deadlock the runtime
# so we need to roll up our sleeves and use the low-level futex API.
import std/atomics
export MemoryOrder

type
  Futex* = object
    ## A 32-bit value the kernel can block/wake threads on.
    value: Atomic[uint32]

  FutexOp = distinct cint
    ## Futex operation code (FUTEX_* constants), kept distinct
    ## so it cannot be mixed up with plain cints.

var NR_Futex {.importc: "__NR_futex", header: "<sys/syscall.h>".}: cint
var FutexWaitPrivate {.importc:"FUTEX_WAIT_PRIVATE", header: "<linux/futex.h>".}: FutexOp
var FutexWakePrivate {.importc:"FUTEX_WAKE_PRIVATE", header: "<linux/futex.h>".}: FutexOp

proc syscall(sysno: clong): cint {.header:"<unistd.h>", varargs.}
  ## Raw variadic syscall(2) entry point.

proc sysFutex(
       futex: var Futex, op: FutexOp, val1: cuint or cint,
       timeout: pointer = nil, val2: pointer = nil, val3: cint = 0): cint {.inline.} =
  ## Thin wrapper over the 6-argument futex syscall.
  ## Only `op` and `val1` are used by this module; the rest default to nil/0.
  syscall(NR_Futex, futex.value.addr, op, val1, timeout, val2, val3)
proc initialize*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0.
  futex.value.store(0, moRelaxed)

proc teardown*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0. No kernel resources to release.
  futex.value.store(0, moRelaxed)

proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
  ## Atomically read the futex value with the given memory ordering.
  futex.value.load(order)

proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
  ## Borrow the underlying atomic for read-modify-write operations.
  futex.value

proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
  ## Atomically write the futex value with the given memory ordering.
  futex.value.store(value, order)
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
  ## Suspend the calling thread if the value of the futex is the same as refVal.
  ## May also return spuriously; callers must recheck their condition.
  # Returns 0 in case of a successful suspend
  # If values are different, it returns EWOULDBLOCK
  # We discard as this is not needed and simplifies compat with Windows futex
  discard sysFutex(futex, FutexWaitPrivate, refVal)

proc wake*(futex: var Futex) {.inline.} =
  ## Wake one thread (from the same process) blocked on this futex.
  # Returns the number of actually woken threads
  # or a Posix error code (if negative)
  # We discard as this is not needed and simplifies compat with Windows futex
  discard sysFutex(futex, FutexWakePrivate, 1)

proc wakeAll*(futex: var Futex) {.inline.} =
  ## Wake all threads (from the same process) blocked on this futex.
  # Returns the number of actually woken threads
  # or a Posix error code (if negative)
  # We discard as this is not needed and simplifies compat with Windows futex
  discard sysFutex(futex, FutexWakePrivate, high(int32))

View File

@ -0,0 +1,109 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/atomics
# A wrapper for Darwin futex.
# They are used in libc++ so likely to be very stable.
# A new API appeared in OSX Big Sur (Jan 2021) ulock_wait2 and macOS pthread_cond_t has been migrated to it
# - https://github.com/apple/darwin-xnu/commit/d4061fb0260b3ed486147341b72468f836ed6c8f#diff-08f993cc40af475663274687b7c326cc6c3031e0db3ac8de7b24624610616be6
#
# The old API is ulock_wait
# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/kern/sys_ulock.c.auto.html
# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/sys/ulock.h.auto.html
{.push hint[XDeclaredButNotUsed]: off.}

# __ulock operation codes (bits [7, 0]), from xnu bsd/sys/ulock.h
const UL_COMPARE_AND_WAIT         = 1 # 32-bit compare-and-wait, same process (futex-like)
const UL_UNFAIR_LOCK              = 2
const UL_COMPARE_AND_WAIT_SHARED  = 3
const UL_UNFAIR_LOCK64_SHARED     = 4
const UL_COMPARE_AND_WAIT64       = 5
const UL_COMPARE_AND_WAIT64_SHARED = 6
# obsolete names
const UL_OSSPINLOCK = UL_COMPARE_AND_WAIT
const UL_HANDOFFLOCK = UL_UNFAIR_LOCK
# These operation codes are only implemented in (DEVELOPMENT || DEBUG) kernels
const UL_DEBUG_SIMULATE_COPYIN_FAULT = 253
const UL_DEBUG_HASH_DUMP_ALL = 254
const UL_DEBUG_HASH_DUMP_PID = 255

# operation bits [15, 8] contain the flags for __ulock_wake
#
const ULF_WAKE_ALL             = 0x00000100 # wake every waiter, not just one
const ULF_WAKE_THREAD          = 0x00000200 # wake the specific thread given in wake_value
const ULF_WAKE_ALLOW_NON_OWNER = 0x00000400

# operation bits [23, 16] contain the flags for __ulock_wait
#
# @const ULF_WAIT_WORKQ_DATA_CONTENTION
# The waiter is contending on this lock for synchronization around global data.
# This causes the workqueue subsystem to not create new threads to offset for
# waiters on this lock.
#
# @const ULF_WAIT_CANCEL_POINT
# This wait is a cancelation point
#
# @const ULF_WAIT_ADAPTIVE_SPIN
# Use adaptive spinning when the thread that currently holds the unfair lock
# is on core.
const ULF_WAIT_WORKQ_DATA_CONTENTION = 0x00010000
const ULF_WAIT_CANCEL_POINT          = 0x00020000
const ULF_WAIT_ADAPTIVE_SPIN         = 0x00040000

# operation bits [31, 24] contain the generic flags
const ULF_NO_ERRNO = 0x01000000 # return errors as negative values instead of via errno

# masks
const UL_OPCODE_MASK   = 0x000000FF
const UL_FLAGS_MASK    = 0xFFFFFF00
const ULF_GENERIC_MASK = 0xFFFF0000

const ULF_WAIT_MASK = ULF_NO_ERRNO or
                      ULF_WAIT_WORKQ_DATA_CONTENTION or
                      ULF_WAIT_CANCEL_POINT or
                      ULF_WAIT_ADAPTIVE_SPIN

const ULF_WAKE_MASK = ULF_NO_ERRNO or
                      ULF_WAKE_ALL or
                      ULF_WAKE_THREAD or
                      ULF_WAKE_ALLOW_NON_OWNER

# Undocumented private syscalls, stable in practice (used by libc++/libdispatch).
proc ulock_wait(operation: uint32, address: pointer, value: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.}
proc ulock_wait2(operation: uint32, address: pointer, value: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.}
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.}
type
  Futex* = object
    ## A 32-bit value the kernel can block/wake threads on.
    value: Atomic[uint32]

proc initialize*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0.
  futex.value.store(0, moRelaxed)

proc teardown*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0. No kernel resources to release.
  futex.value.store(0, moRelaxed)

proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
  ## Atomically read the futex value with the given memory ordering.
  futex.value.load(order)

proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
  ## Borrow the underlying atomic for read-modify-write operations.
  futex.value

proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
  ## Atomically write the futex value with the given memory ordering.
  futex.value.store(value, order)
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
  ## Suspend the calling thread if the value of the futex is still `refVal`.
  ## May return spuriously; callers must recheck their condition in a loop.
  # Fix: use UL_COMPARE_AND_WAIT, the 32-bit same-process compare-and-wait
  # operation (the analogue of Linux FUTEX_WAIT_PRIVATE).
  # The previous UL_UNFAIR_LOCK64_SHARED opcode implements an unfair lock
  # keyed on the owner's thread ID — with an arbitrary futex value the call
  # errors out, and since the error was discarded the wait silently became
  # a busy-spin.
  discard ulock_wait(UL_COMPARE_AND_WAIT or ULF_NO_ERRNO, futex.value.addr, uint64 refVal, 0)

proc wake*(futex: var Futex) {.inline.} =
  ## Wake one thread (from the same process) blocked on this futex.
  # The wake opcode must match the wait opcode so the kernel finds the
  # waiters parked by `wait` above. ULF_WAKE_THREAD is not usable here:
  # it targets a specific thread whose ID must be passed in wake_value,
  # so calling it with 0 fails (and the error was discarded).
  # Without ULF_WAKE_ALL, exactly one waiter is woken.
  discard ulock_wake(UL_COMPARE_AND_WAIT or ULF_NO_ERRNO, futex.value.addr, 0)

proc wakeAll*(futex: var Futex) {.inline.} =
  ## Wake all threads (from the same process) blocked on this futex.
  # The opcode bits [7, 0] must carry UL_COMPARE_AND_WAIT; ULF_WAKE_ALL
  # alone (as before) encodes an invalid operation of 0.
  discard ulock_wake(UL_COMPARE_AND_WAIT or ULF_NO_ERRNO or ULF_WAKE_ALL, futex.value.addr, 0)

View File

@ -0,0 +1,59 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# An implementation of futex using Windows primitives
import std/atomics, winlean
export MemoryOrder
type
Futex* = object
value: Atomic[uint32]
# Contrary to the documentation, the futex related primitives are NOT in kernel32.dll
# but in API-MS-Win-Core-Synch-l1-2-0.dll ¯\_(ツ)_/¯
proc initialize*(futex: var Futex) {.inline.} =
futex.value.store(0, moRelaxed)
proc teardown*(futex: var Futex) {.inline.} =
futex.value.store(0, moRelaxed)
proc WaitOnAddress(
Address: pointer, CompareAddress: pointer,
AddressSize: csize_t, dwMilliseconds: DWORD
): WINBOOL {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
# The Address should be volatile
proc WakeByAddressSingle(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
proc WakeByAddressAll(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
futex.value.load(order)
proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
futex.value
proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
futex.value.store(value, order)
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
## Suspend a thread if the value of the futex is the same as refVal.
# Returns TRUE if the wait succeeds or FALSE if not.
# getLastError() will contain the error information, for example
# if it failed due to a timeout.
# We discard as this is not needed and simplifies compat with Linux futex
discard WaitOnAddress(futex.value.addr, refVal.unsafeAddr, csize_t sizeof(refVal), INFINITE)
proc wake*(futex: var Futex) {.inline.} =
## Wake one thread (from the same process)
WakeByAddressSingle(futex.value.addr)
proc wakeAll*(futex: var Futex) {.inline.} =
## Wake all threads (from the same process)
WakeByAddressAll(futex.value.addr)

View File

@ -0,0 +1,555 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
when not compileOption("threads"):
{.error: "This requires --threads:on compilation flag".}
{.push raises: [].}
import
std/[cpuinfo, atomics, macros],
./crossthread/[
taskqueues,
backoff,
tasks_flowvars],
./instrumentation,
./primitives/barriers,
./parallel_offloading,
../allocs, ../bithacks,
../../../helpers/prng_unsafe
export
# flowvars
Flowvar, isSpawned, isReady, sync
type
  WorkerID = uint32

  Signal = object
    # 64-byte (cache-line) aligned to avoid false sharing between workers.
    terminate {.align: 64.}: Atomic[bool]

  WorkerContext = object
    ## Thread-local worker context

    # Params
    id: WorkerID
    threadpool: Threadpool

    # Tasks
    taskqueue: ptr Taskqueue # owned task queue
    currentTask: ptr Task    # task currently being executed (RootTask sentinel on the root thread)

    # Synchronization
    localBackoff: EventNotifier # Multi-Producer Single-Consumer backoff, used when awaiting a future
    signal: ptr Signal          # owned signal, points into Threadpool.workerSignals[id]

    # Thefts
    rng: RngState # RNG state to select victims

    # Adaptative theft policy (see updateStealStrategy)
    stealHalf: bool
    recentTasks: uint32
    recentThefts: uint32
    recentTheftsAdaptative: uint32
    recentLeaps: uint32

  Threadpool* = ptr object
    barrier: SyncBarrier # Barrier for initialization and teardown
    # -- align: 64
    globalBackoff: EventCount # Multi-Producer Multi-Consumer backoff, parks idle workers
    # -- align: 64
    numThreads*{.align: 64.}: uint32
    workerQueues: ptr UncheckedArray[Taskqueue]                       # one task queue per worker
    workers: ptr UncheckedArray[Thread[(Threadpool, WorkerID)]]       # worker threads (index 0 is the root thread, not started here)
    workerSignals: ptr UncheckedArray[Signal]                         # one termination signal per worker

# Thread-local config
# ---------------------------------------------

var workerContext {.threadvar.}: WorkerContext
  ## Thread-local Worker context
proc setupWorker() =
  ## Initialize the thread-local context of a worker.
  ## Requires the `id` and `threadpool` fields to be initialized beforehand.
  template ctx: untyped = workerContext

  preCondition: not ctx.threadpool.isNil()
  preCondition: 0 <= ctx.id and ctx.id < ctx.threadpool.numThreads.uint32
  preCondition: not ctx.threadpool.workerQueues.isNil()
  preCondition: not ctx.threadpool.workerSignals.isNil()

  # Thefts
  ctx.rng.seed(0xEFFACED + ctx.id) # deterministic per-worker victim-selection seed

  # Synchronization
  ctx.localBackoff.initialize()
  ctx.signal = addr ctx.threadpool.workerSignals[ctx.id]
  ctx.signal.terminate.store(false, moRelaxed)

  # Tasks
  ctx.taskqueue = addr ctx.threadpool.workerQueues[ctx.id]
  ctx.currentTask = nil

  # Init
  ctx.taskqueue[].init(initialCapacity = 32)

  # Adaptative theft policy
  ctx.recentTasks = 0
  ctx.recentThefts = 0
  ctx.recentTheftsAdaptative = 0
  ctx.recentLeaps = 0

proc teardownWorker() =
  ## Cleanup the thread-local context of a worker.
  workerContext.localBackoff.`=destroy`()
  workerContext.taskqueue[].teardown()
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].}
  ## Forward declaration, defined below.

proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises: [Exception].} =
  ## Entry point of every worker thread: set up the thread-local context,
  ## then run the event loop until a termination signal is received.
  # We assume that thread_local variables start all at their binary zero value
  preCondition: workerContext == default(WorkerContext)

  template ctx: untyped = workerContext

  # If the following crashes, you need --tlsEmulation:off
  ctx.id = params.id
  ctx.threadpool = params.threadpool

  setupWorker()

  # 1 matching barrier in Threadpool.new() for root thread
  discard params.threadpool.barrier.wait()

  {.cast(gcsafe).}: # Compiler does not consider that GC-safe by default when multi-threaded due to thread-local variables
    ctx.eventLoop()

  debugTermination:
    log(">>> Worker %2d shutting down <<<\n", ctx.id)

  # 1 matching barrier in threadpool.shutdown() for root thread
  discard params.threadpool.barrier.wait()

  teardownWorker()
# Tasks
# ---------------------------------------------
# Sentinel values
const ReadyFuture = cast[ptr EventNotifier](0xCA11AB1E) # stored in task.waiter when the result is ready and no thread waits
const RootTask = cast[ptr Task](0xEFFACED0)             # sentinel task of the root thread; never executed

proc run*(ctx: var WorkerContext, task: ptr Task) {.raises:[Exception].} =
  ## Run a task, frees it if it is not owned by a Flowvar.
  ## Restores the previously-running task as current afterwards.
  let suspendedTask = workerContext.currentTask
  ctx.currentTask = task
  debug: log("Worker %2d: running task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek())
  task.fn(task.data.addr)
  debug: log("Worker %2d: completed task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek())
  ctx.recentTasks += 1
  ctx.currentTask = suspendedTask
  if not task.hasFuture:
    # No Flowvar owns this task: we are responsible for freeing it.
    freeHeap(task)
    return

  # Sync with an awaiting thread without work in completeFuture:
  # either publish ReadyFuture (nobody waiting yet) or wake the parked waiter.
  var expected = (ptr EventNotifier)(nil)
  if not compareExchange(task.waiter, expected, desired = ReadyFuture, moAcquireRelease):
    debug: log("Worker %2d: completed task 0x%.08x, notifying waiter 0x%.08x\n", ctx.id, task, expected)
    expected[].notify()
proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.} =
  ## Schedule a task in the threadpool.
  ## This wakes a sibling thread if our local queue is empty
  ## or forceWake is true.
  debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)

  # Instead of notifying every time a task is scheduled, we notify
  # only when the worker queue is empty. This is a good approximation
  # of starvation in work-stealing.
  # - Tzannes, A., G. C. Caragea, R. Barua, and U. Vishkin.
  #   Lazy binary-splitting: a run-time adaptive work-stealing scheduler.
  #   In PPoPP 10, Bangalore, India, January 2010. ACM, pp. 179190.
  #   https://user.eng.umd.edu/~barua/ppopp164.pdf
  let wasEmpty = ctx.taskqueue[].peek() == 0
  ctx.taskqueue[].push(tn)
  if forceWake or wasEmpty:
    ctx.threadpool.globalBackoff.wake()
# Scheduler
# ---------------------------------------------
iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
  ## Yield every integer in [0, maxExclusive) exactly once, in a
  ## (low-quality) pseudo-random order derived from `randomSeed`.
  # Design considerations and randomness constraints for work-stealing,
  # see docs/random_permutations.md
  #
  # A Linear Congruential Generator Xₙ₊₁ = aXₙ+c (mod m)
  # (https://en.wikipedia.org/wiki/Linear_congruential_generator)
  # enumerates all residues mod m without repetition iff (Hull-Dobell theorem):
  #   1. c and m are coprime
  #   2. a-1 is divisible by all prime factors of m
  #   3. a-1 is divisible by 4 if m is divisible by 4
  #
  # Randomness quality only matters for spreading contention: cycling
  # through victims i, i+1, …, i+n-1 (mod n) would already be acceptable,
  # but with a=1 the permutations are disappointingly regular.
  #
  # Choosing m as the next power of two ≥ maxExclusive means:
  #   - every odd c is coprime with m (no GCD computation, no coprime cache)
  #   - a can differ from 1, making the output look more "random"
  #   - n and (m-1) == n mod m, so the modulo is a single mask
  # Generated values ≥ maxExclusive are simply skipped.
  let
    m          = maxExclusive.nextPowerOfTwo_vartime()
    wrapMask   = m - 1                                    # cheap "mod m"
    increment  = (randomSeed and ((m shr 1) - 1)) * 2 + 1 # odd ⇒ coprime with m
    multiplier = (randomSeed and ((m shr 2) - 1)) * 4 + 1 # multiplier-1 divisible by 2 and 4
    first      = randomSeed and wrapMask                  # starting point of the cycle
  var cur = first
  while true:
    if cur < maxExclusive:
      yield cur
    cur = (multiplier * cur + increment) and wrapMask # full-period LCG step
    if cur == first:
      break
proc tryStealOne(ctx: var WorkerContext): ptr Task =
  ## Try to steal a single task from a randomly-permuted sequence of victims.
  ## Returns nil if no victim had work.
  let seed = ctx.rng.next().uint32
  for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads):
    if targetId == ctx.id:
      continue

    let stolenTask = ctx.id.steal(ctx.threadpool.workerQueues[targetId])

    if not stolenTask.isNil():
      ctx.recentThefts += 1
      # Theft successful, there might be more work for idle threads, wake one
      ctx.threadpool.globalBackoff.wake()
      return stolenTask
  return nil

proc updateStealStrategy(ctx: var WorkerContext) =
  ## Estimate work-stealing efficiency during the last interval.
  ## If the value is below a threshold, switch between
  ## steal-one and steal-half strategies.
  ## (Currently unused: the call site in tryStealAdaptative is commented out.)
  const StealAdaptativeInterval = 25
  if ctx.recentTheftsAdaptative == StealAdaptativeInterval:
    let recentTheftsNonAdaptative = ctx.recentThefts - ctx.recentTheftsAdaptative
    let adaptativeTasks = ctx.recentTasks - ctx.recentLeaps - recentTheftsNonAdaptative

    let ratio = adaptativeTasks.float32 / StealAdaptativeInterval.float32
    if ctx.stealHalf and ratio < 2.0f:
      # Tasks stolen are coarse-grained, steal only one to reduce re-steal
      ctx.stealHalf = false
    elif not ctx.stealHalf and ratio == 1.0f:
      # All tasks processed were stolen tasks, we need to steal many at a time
      # NOTE(review): exact float equality — presumably intentional since the
      # ratio is a small-integer quotient, but worth confirming.
      ctx.stealHalf = true

    # Reset interval
    ctx.recentTasks = 0
    ctx.recentThefts = 0
    ctx.recentTheftsAdaptative = 0
    ctx.recentLeaps = 0
proc tryStealAdaptative(ctx: var WorkerContext): ptr Task =
  ## Try to steal one or many tasks, depending on load.
  ## Returns nil if no victim had work.
  # TODO: while running 'threadpool/examples/e02_parallel_pi.nim'
  #       stealHalf can error out in tasks_flowvars.nim with:
  #       "precondition not task.completed.load(moAcquire)"
  ctx.stealHalf = false # stealHalf deactivated until the above is fixed
  # ctx.updateStealStrategy()

  let seed = ctx.rng.next().uint32
  for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads):
    if targetId == ctx.id:
      continue

    let stolenTask =
      if ctx.stealHalf: ctx.id.stealHalf(ctx.taskqueue[], ctx.threadpool.workerQueues[targetId])
      else:             ctx.id.steal(ctx.threadpool.workerQueues[targetId])

    if not stolenTask.isNil():
      ctx.recentThefts += 1
      ctx.recentTheftsAdaptative += 1
      # Theft successful, there might be more work for idle threads, wake one
      ctx.threadpool.globalBackoff.wake()
      return stolenTask
  return nil
proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
  ## Leapfrogging:
  ##
  ## - Leapfrogging: a portable technique for implementing efficient futures,
  ##   David B. Wagner, Bradley G. Calder, 1993
  ##   https://dl.acm.org/doi/10.1145/173284.155354
  ##
  ## When awaiting a future, we can look in the thief's queue first. They steal when they run out of tasks.
  ## If they have tasks in their queue, it's the task we are awaiting that created them and it will likely be stuck
  ## on those tasks as well, so we need to help them help us.
  ## Returns a task stolen from the thief of `awaitedTask`, or nil.

  # Spin until the thief publishes its ID in the awaited task.
  var thiefID = SentinelThief
  while true:
    debug: log("Worker %2d: waiting for thief to publish their ID\n", ctx.id)
    thiefID = awaitedTask.thiefID.load(moAcquire)
    if thiefID != SentinelThief:
      break
    cpuRelax()
  ascertain: 0 <= thiefID and thiefID < ctx.threadpool.numThreads

  # Leapfrogging is used when completing a future, steal only one
  # and don't leave tasks stranded in our queue.
  let leapTask = ctx.id.steal(ctx.threadpool.workerQueues[thiefID])
  if not leapTask.isNil():
    ctx.recentLeaps += 1
    # Theft successful, there might be more work for idle threads, wake one
    ctx.threadpool.globalBackoff.wake()
    return leapTask
  return nil
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} =
  ## Each worker thread executes this loop over and over
  ## until its termination signal is set.
  while not ctx.signal.terminate.load(moRelaxed):
    # 1. Pick from local queue
    debug: log("Worker %2d: eventLoop 1 - searching task from local queue\n", ctx.id)
    while (var task = ctx.taskqueue[].pop(); not task.isNil):
      debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
      ctx.run(task)

    # 2. Run out of tasks, become a thief
    debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
    let ticket = ctx.threadpool.globalBackoff.sleepy() # register sleep intent BEFORE the steal attempt
    var stolenTask = ctx.tryStealAdaptative()
    if not stolenTask.isNil:
      # We managed to steal a task, cancel sleep
      ctx.threadpool.globalBackoff.cancelSleep()
      # 2.a Run task
      debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      ctx.run(stolenTask)
    else:
      # 2.b Park the thread until a new task enters the threadpool
      debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
      ctx.threadpool.globalBackoff.sleep(ticket)
      debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
# Sync
# ---------------------------------------------
template isRootTask(task: ptr Task): bool =
  ## True iff `task` is the root thread's sentinel task.
  task == RootTask

proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} =
  ## Eagerly complete an awaited FlowVar:
  ## run or steal related tasks until `fv` is ready,
  ## then write its value into `parentResult`.
  template ctx: untyped = workerContext

  template isFutReady(): untyped =
    let isReady = fv.task.completed.load(moAcquire)
    if isReady:
      parentResult = cast[ptr (ptr Task, T)](fv.task.data.addr)[1]
    isReady

  if isFutReady():
    return

  ## 1. Process all the children of the current tasks.
  ##    This ensures that we can give control back ASAP.
  debug: log("Worker %2d: sync 1 - searching task from local queue\n", ctx.id)
  while (let task = ctx.taskqueue[].pop(); not task.isNil):
    if task.parent != ctx.currentTask:
      debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
      ctx.schedule(task, forceWake = true) # reschedule task and wake a sibling to take it over.
      break
    debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
    ctx.run(task)
    if isFutReady():
      debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
      return

  ## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
  ##    So the task is bottlenecked by dependencies in other threads,
  ##    hence we abandon our enqueued work and steal in the others' queues
  ##    in hope it advances our awaited task. This prioritizes latency over throughput.
  ##
  ##    See also
  ##    - Proactive work-stealing for futures
  ##      Kyle Singer, Yifan Xu, I-Ting Angelina Lee, 2019
  ##      https://dl.acm.org/doi/10.1145/3293883.3295735
  debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
  while not isFutReady():
    if (let leapTask = ctx.tryLeapfrog(fv.task); not leapTask.isNil):
      # We stole a task generated by the task we are awaiting.
      debug: log("Worker %2d: sync 2.1 - leapfrog task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, leapTask, leapTask.parent, ctx.currentTask)
      ctx.run(leapTask)
    elif (let stolenTask = ctx.tryStealOne(); not stolenTask.isNil):
      # We stole a task, we hope we advance our awaited task.
      debug: log("Worker %2d: sync 2.2 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      ctx.run(stolenTask)
    elif (let ownTask = ctx.taskqueue[].pop(); not ownTask.isNil):
      # We advance our own queue, this increases global throughput but may impact latency on the awaited task.
      #
      # Note: for a scheduler to be optimal (i.e. within 2x than ideal) it should be greedy
      #       so all workers should be working. This is a difficult tradeoff.
      debug: log("Worker %2d: sync 2.3 - couldn't steal, running own task\n", ctx.id)
      ctx.run(ownTask)
    else:
      # Nothing to do, we park.
      # Note: On today's hyperthreaded systems, it might be more efficient to always park
      #       instead of working on unrelated tasks in our task queue, despite making the scheduler non-greedy.
      #       The actual hardware resources are 2x less than the actual number of cores
      ctx.localBackoff.prepareToPark()

      # Publish our backoff handle in the task; if the task completed in the
      # meantime (waiter already set to ReadyFuture) the CAS fails and we skip parking.
      var expected = (ptr EventNotifier)(nil)
      if compareExchange(fv.task.waiter, expected, desired = ctx.localBackoff.addr, moAcquireRelease):
        ctx.localBackoff.park()
proc syncAll*(tp: Threadpool) {.raises: [Exception].} =
  ## Blocks until all pending tasks are completed.
  ## This MUST only be called from
  ## the root scope that created the threadpool.
  template ctx: untyped = workerContext

  debugTermination:
    log(">>> Worker %2d enters barrier <<<\n", ctx.id)

  preCondition: ctx.id == 0
  preCondition: ctx.currentTask.isRootTask()

  # Empty all tasks
  var foreignThreadsParked = false
  while not foreignThreadsParked:
    # 1. Empty local tasks
    debug: log("Worker %2d: syncAll 1 - searching task from local queue\n", ctx.id)
    while (let task = ctx.taskqueue[].pop(); not task.isNil):
      debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
      ctx.run(task)

    if tp.numThreads == 1 or foreignThreadsParked:
      break

    # 2. Help other threads
    debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
    let stolenTask = ctx.tryStealAdaptative()
    if not stolenTask.isNil:
      # 2.1 We stole some task
      debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      ctx.run(stolenTask)
    else:
      # 2.2 No task to steal
      if tp.globalBackoff.getNumWaiters() == tp.numThreads - 1:
        # 2.2.1 all threads besides the current are parked:
        #       no work remains anywhere, we can leave.
        debugTermination:
          log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
        foreignThreadsParked = true
      else:
        # 2.2.2 We don't park as there is no notif for task completion
        cpuRelax()

  debugTermination:
    log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
# Runtime
# ---------------------------------------------
proc new*(T: type Threadpool, numThreads = countProcessors()): T {.raises: [Exception].} =
  ## Initialize a threadpool that manages `numThreads` threads.
  ## Default to the number of logical processors available.
  ##
  ## The calling thread becomes worker 0 (the root worker);
  ## `numThreads - 1` extra threads are spawned.
  ## Returns only after every worker has reached the startup barrier.
  type TpObj = typeof(default(Threadpool)[]) # due to C import, we need a dynamic sizeof
  var tp = allocHeapUncheckedAlignedPtr(Threadpool, sizeof(TpObj), alignment = 64)
  # Synchronization primitives must be ready before any worker starts.
  tp.barrier.init(numThreads.uint32)
  tp.globalBackoff.initialize()
  tp.numThreads = numThreads.uint32
  # Per-worker storage, 64-byte aligned (cache line) to avoid false sharing.
  tp.workerQueues = allocHeapArrayAligned(Taskqueue, numThreads, alignment = 64)
  tp.workers = allocHeapArrayAligned(Thread[(Threadpool, WorkerID)], numThreads, alignment = 64)
  tp.workerSignals = allocHeapArrayAligned(Signal, numThreads, alignment = 64)
  # Setup master thread
  workerContext.id = 0
  workerContext.threadpool = tp
  # Start worker threads (slot 0 belongs to the calling thread)
  for i in 1 ..< numThreads:
    createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
  # Root worker
  setupWorker()
  # Root task, this is a sentinel task that is never called.
  workerContext.currentTask = RootTask
  # Wait for the child threads
  discard tp.barrier.wait()
  return tp
proc cleanup(tp: var Threadpool) {.raises: [OSError].} =
  ## Cleanup all resources allocated by the threadpool.
  ## Expects workers to have been signaled to terminate
  ## (see `shutdown`): joins them first, then frees shared memory.
  preCondition: workerContext.currentTask.isRootTask()
  # Every worker thread must have exited before its data is freed.
  for i in 1 ..< tp.numThreads:
    joinThread(tp.workers[i])
  # Release per-worker arrays, then the pool-wide primitives,
  # and finally the pool object itself (allocated in `new`).
  tp.workerSignals.freeHeapAligned()
  tp.workers.freeHeapAligned()
  tp.workerQueues.freeHeapAligned()
  tp.globalBackoff.`=destroy`()
  tp.barrier.delete()
  tp.freeHeapAligned()
proc shutdown*(tp: var Threadpool) {.raises:[Exception].} =
  ## Wait until all tasks are processed and then shutdown the threadpool.
  ## MUST be called from the root scope that created the threadpool.
  preCondition: workerContext.currentTask.isRootTask()
  # Drain all pending work before asking workers to stop.
  tp.syncAll()
  # Signal termination to all threads
  for i in 0 ..< tp.numThreads:
    tp.workerSignals[i].terminate.store(true, moRelaxed)
  # Wake parked workers so they can observe the terminate flag.
  tp.globalBackoff.wakeAll()
  # 1 matching barrier in workerEntryFn
  discard tp.barrier.wait()
  teardownWorker()
  tp.cleanup()
  # Delete dummy task
  workerContext.currentTask = nil
{.pop.} # raises:[]
# Task parallel API
# ---------------------------------------------
macro spawn*(tp: Threadpool, fnCall: typed): untyped =
  ## Schedule `fnCall` for asynchronous execution,
  ## potentially on another thread of the pool.
  ##
  ## When the called function returns a value, the spawn
  ## expression produces a Flowvar wrapping that future result:
  ## - `sync` blocks the current thread until the result is
  ##   available and extracts it,
  ## - `isReady` checks availability without blocking, so a
  ##   subsequent `sync` returns immediately.
  ##
  ## Tasks are processed approximately in Last-In-First-Out (LIFO) order.
  let ctxSym = bindSym"workerContext"
  let scheduleSym = bindSym"schedule"
  spawnImpl(tp, fnCall, ctxSym, scheduleSym)

View File

@ -68,7 +68,7 @@ func rotl(x: uint64, k: static int): uint64 {.inline.} =
template `^=`(x: var uint64, y: uint64) =
x = x xor y
func next(rng: var RngState): uint64 =
func next*(rng: var RngState): uint64 =
## Compute a random uint64 from the input state
## using xoshiro512** algorithm by Vigna et al
## State is updated.
@ -96,6 +96,12 @@ func random_unsafe*(rng: var RngState, maxExclusive: uint32): uint32 =
## Uses an unbiased generation method
## See Lemire's algorithm modified by Melissa O'Neill
## https://www.pcg-random.org/posts/bounded-rands.html
## Original:
## biased: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
## unbiased: https://arxiv.org/pdf/1805.10941.pdf
## Also:
## Barrett Reduction: https://en.wikipedia.org/wiki/Barrett_reduction
## http://www.acsel-lab.com/arithmetic/arith18/papers/ARITH18_Hasenplaugh.pdf
let max = maxExclusive
var x = uint32 rng.next()
var m = x.uint64 * max.uint64