Parallel for (#222)

* introduce reserve threads to minimize latency and maximize throughput when awaiting a future

* introduce a ceilDiv proc

* threadpool: implement parallel-for loops

* 10x perf improvement by not waking reserveBackoff on syncAll

* bench overhead: the new reserve system may introduce too much wakeup latency (2x slower) for fine-grained parallelism

* add parallelForStrided

* Threadpool: Implement parallel reductions

* refactor parallel loop codegen: introduce descriptor, parsing and codegen stages

* parallel strided, test transpose bench

* tight loop is faster when backoff is not inline

* no POSIX stuff on windows, larger types for histogram bench

* fix tests

* max RSS overflow?

* missed an undefined var

* exit histogram on 32-bit

* forgot to return early for 32-bit
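For context, a minimal sketch of the parallel-for usage this PR enables, modeled on Weave's loop API (the constructor arguments, import path and capture details here are illustrative assumptions, not necessarily the exact Constantine API):

import std/math
# import constantine/platforms/threadpool/threadpool  # assumed module path

proc demo() =
  var tp = Threadpool.new()             # assumed constructor: spawns the worker threads
  var data = newSeq[float64](1_000)
  let buf = cast[ptr UncheckedArray[float64]](data[0].addr)
  tp.parallelFor i in 0 ..< data.len:   # split the index range across workers
    captures: {buf}                     # captures are explicit and must be trivially copyable
    buf[i] = sqrt(float64 i)
  tp.syncAll()                          # barrier: wait for all outstanding tasks
  tp.shutdown()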
Mamy Ratsimbazafy 2023-02-24 09:47:36 +01:00 committed by GitHub
parent 8993789ddf
commit bf32c2d408
66 changed files with 16974 additions and 303 deletions

View File

@ -250,6 +250,10 @@ const testDescThreadpool: seq[string] = @[
# "constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim",
"constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim",
# "constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim", # Need timing not implemented on Windows
# "constantine/platforms/threadpool/benchmarks/black_scholes/threadpool_black_scholes.nim", # Need input file
"constantine/platforms/threadpool/benchmarks/matrix_transposition/threadpool_transposes.nim",
"constantine/platforms/threadpool/benchmarks/histogram_2D/threadpool_histogram.nim",
"constantine/platforms/threadpool/benchmarks/logsumexp/threadpool_logsumexp.nim",
]
const testDescMultithreadedCrypto: seq[string] = @[

View File

@ -26,11 +26,6 @@ import
# Helpers
# ----------------------------------------------------------------
func ceilDiv(a, b: uint): uint =
## ceil division
## ceil(a / b)
(a + b - 1) div b
proc copyFrom[M, N: static int](output: var array[M, byte], bi: array[N, byte], cur: var uint) =
static: doAssert M mod N == 0
for i in 0'u ..< N:
@ -113,7 +108,7 @@ func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
doAssert output.len mod 8 == 0 # By spec
doAssert output.len mod 32 == 0 # Assumed by copy optimization
let ell = ceilDiv(output.len.uint, DigestSize.uint)
let ell = output.len.ceilDiv_vartime(DigestSize)
var l_i_b_str0 {.noInit.}: array[3, byte]
l_i_b_str0.dumpRawInt(output.len.uint16, cursor = 0, bigEndian)
l_i_b_str0[2] = 0
@ -200,7 +195,7 @@ func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
## it is recommended to cache the reduced DST.
const
L = int ceilDiv(Field.C.getCurveBitwidth() + k, 8)
L = ceilDiv_vartime(Field.C.getCurveBitwidth() + k, 8)
m = block:
when Field is Fp: 1
elif Field is Fp2: 2

View File

@ -142,7 +142,7 @@ func powMont*[mBits, eBits: static int](
##
## This is constant-time: the window optimization does
## not reveal the exponent bits or hamming weight
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte]
expBE.marshal(exponent, bigEndian)
powMont(a, expBE, M, one, negInvModWord, windowSize, spareBits)
@ -165,7 +165,7 @@ func powMontUnsafeExponent*[mBits, eBits: static int](
##
## This uses fixed window optimization
## A window size in the range [1, 5] must be chosen
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte]
expBE.marshal(exponent, bigEndian)
powMontUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)

View File

@ -591,7 +591,6 @@ macro addchain*(fn: untyped): untyped =
body.add s
result[^1] = body
# echo result.toStrLit()
# ############################################################
#

View File

@ -191,7 +191,7 @@ proc partitionDivsteps(bits, wordBitWidth: int): tuple[totalIters, numChunks, ch
# For any input, for gcd(f, g) with 0 <= g <= f <= Modulus with hddivstep variant (half-delta divstep)
# (inversion g == 1)
(45907*bits + 26313) div 19929
let numChunks = (totalIters + wordBitWidth-1) div wordBitWidth
let numChunks = totalIters.ceilDiv_vartime(wordBitWidth)
let chunkSize = totalIters div numChunks
let cutoff = totalIters mod numChunks
return (totalIters, numChunks, chunkSize, cutoff)
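As a worked example of this partitioning, assuming bits = 255 and wordBitWidth = 64: totalIters = (45907*255 + 26313) div 19929 = 588, numChunks = ceilDiv_vartime(588, 64) = 10, chunkSize = 588 div 10 = 58 and cutoff = 588 mod 10 = 8, i.e. the 588 halving iterations are spread over eight chunks of 59 and two chunks of 58 divsteps.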
@ -444,7 +444,7 @@ func invmod*(
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
@ -471,7 +471,7 @@ func invmod*(
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
@ -647,7 +647,7 @@ func legendre*(a, M: Limbs, bits: static int): SecretWord =
## ≡ 0 (mod p), iff a is 0
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
@ -667,7 +667,7 @@ func legendre*(a: Limbs, M: static Limbs, bits: static int): SecretWord =
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
@ -852,7 +852,7 @@ func invmod_vartime*(
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
@ -879,7 +879,7 @@ func invmod_vartime*(
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)

View File

@ -116,5 +116,3 @@ macro genDerivedConstants*(mode: static DerivedConstantMode): untyped =
M
)
)
# echo result.toStrLit()

View File

@ -95,7 +95,7 @@ proc genCurveConstants(defs: seq[CurveParams]): NimNode =
exported($curve & "_sexticTwist"),
newLit curveDef.sexticTwist
)
if curveDef.eq_form == TwistedEdwards and
curveDef.coef_A.kind != NoCoef and curveDef.coef_D.kind != NoCoef:
curveEllipticStmts.add newConstStmt(
@ -118,8 +118,6 @@ proc genCurveConstants(defs: seq[CurveParams]): NimNode =
result.add curveEllipticStmts
# echo result.toStrLit()
macro setupCurves(): untyped =
result = genCurveConstants(curvesDefinitions)

View File

@ -321,8 +321,6 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode =
exported("CurveOrderBitWidth"), MapCurveOrderBitWidth
)
# echo result.toStrLit()
macro declareCurves*(curves: untyped): untyped =
## Parse curve configuration and generates
##

View File

@ -389,7 +389,7 @@ func primePlus1div2*(P: BigInt): BigInt =
func primeMinus3div4_BE*[bits: static int](
P: BigInt[bits]
): array[(bits+7) div 8, byte] {.noInit.} =
): array[bits.ceilDiv_vartime(8), byte] {.noInit.} =
## For an input prime `p`, compute (p-3)/4
## and return the result as a canonical byte array / octet string
## For use to check if a number is a square (quadratic residue)
@ -408,7 +408,7 @@ func primeMinus3div4_BE*[bits: static int](
func primeMinus5div8_BE*[bits: static int](
P: BigInt[bits]
): array[(bits+7) div 8, byte] {.noInit.} =
): array[bits.ceilDiv_vartime(8), byte] {.noInit.} =
## For an input prime `p`, compute (p-5)/8
## and return the result as a canonical byte array / octet string
## For use to check if a number is a square (quadratic residue)

View File

@ -11,7 +11,7 @@ import ../../platforms/abstractions
func wordsRequired*(bits: int): int {.compileTime.} =
## Compute the number of limbs required
# from the **announced** bit length
(bits + WordBitWidth - 1) div WordBitWidth
bits.ceilDiv_vartime(WordBitWidth)
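For example, on a 64-bit target a BigInt[255] needs ceilDiv_vartime(255, 64) = 4 limbs; truncating division (255 div 64 = 3) would under-allocate.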
type
BigInt*[bits: static int] = object

View File

@ -61,7 +61,7 @@ func decomposeEndo*[M, scalBits, L: static int](
static: doAssert scalBits >= L, "Cannot decompose a scalar smaller than a mini-scalar or the decomposition coefficient"
# Equal when no window or no negative handling, greater otherwise
static: doAssert L >= (scalBits + M - 1) div M + 1
static: doAssert L >= scalBits.ceilDiv_vartime(M) + 1
const w = F.C.getCurveOrderBitwidth().wordsRequired()
when M == 2:
@ -129,7 +129,7 @@ func decomposeEndo*[M, scalBits, L: static int](
# (For example generating a public-key)
type
Recoded[LengthInDigits: static int] = distinct array[(LengthInDigits + 7) div 8, byte]
Recoded[LengthInDigits: static int] = distinct array[LengthInDigits.ceilDiv_vartime(8), byte]
GLV_SAC[M, LengthInDigits: static int] = array[M, Recoded[LengthInDigits]]
## GLV-Based Sign-Aligned-Column representation
## see Faz-Hernandez, 2013
@ -319,7 +319,7 @@ func scalarMulEndo*[scalBits; EC](
{.error: "Unconfigured".}
# 2. Decompose scalar into mini-scalars
const L = (scalBits + M - 1) div M + 1 # Alternatively, negative can be handled with an extra "+1"
const L = scalBits.ceilDiv_vartime(M) + 1 # Alternatively, negative can be handled with an extra "+1"
var miniScalars {.noInit.}: array[M, BigInt[L]]
var negatePoints {.noInit.}: array[M, SecretBool]
miniScalars.decomposeEndo(negatePoints, scalar, P.F)
@ -473,7 +473,7 @@ func computeRecodedLength(bitWidth, window: int): int =
# Strangely in the paper this doesn't depend
# "m", the GLV decomposition dimension.
# lw = ⌈log2 r/w⌉+1 (optionally a second "+1" to handle negative mini scalars)
let lw = (bitWidth + window - 1) div window + 1
let lw = bitWidth.ceilDiv_vartime(window) + 1
result = (lw mod window) + lw
func scalarMulGLV_m2w2*[scalBits; EC](

View File

@ -44,7 +44,7 @@ func multiScalarMulImpl_reference_vartime[F, G; bits: static int](
# Prologue
# --------
const numBuckets = 1 shl c - 1 # bucket 0 is unused
const numWindows = (bits + c - 1) div c
const numWindows = bits.ceilDiv_vartime(c)
type EC = typeof(r)
let miniMSMs = allocHeapArray(EC, numWindows)
@ -376,7 +376,7 @@ func multiScalarMul_vartime*[bits: static int, F, G](
elif F is Fp2: 4
else: {.error: "Unconfigured".}
const L = (bits + M - 1) div M + 1
const L = bits.ceilDiv_vartime(M) + 1
let splitCoefs = allocHeapArray(array[M, BigInt[L]], N)
let endoBasis = allocHeapArray(array[M, ECP_ShortW_Aff[F, G]], N)

View File

@ -599,7 +599,7 @@ when isMainModule:
let c = inputSize.bestBucketBitSize(255, useSignedBuckets = true, useManualTuning = false)
let twoPow = "2^"
let numNZBuckets = 1 shl (c-1)
let collisionMapSize = ((1 shl (c-1))+63) div 64 * 8 # Stored in BigInt[1 shl (c-1)]
let collisionMapSize = ceilDiv_vartime(1 shl (c-1), 64) * 8 # Stored in BigInt[1 shl (c-1)]
let queueSize = 4*c*c - 16*c - 128
let numCollisions = float(inputSize*queueSize) / float(numNZBuckets)
let collisionPercentage = numCollisions / float(inputSize) * 100
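For example, at c = 16 there are 2^15 = 32768 non-zero buckets, so the collision map occupies ceilDiv_vartime(32768, 64) * 8 = 4096 bytes.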

View File

@ -223,8 +223,8 @@ func scalarMulGeneric*[EC](P: var EC, scalar: BigInt, window: static int = 5) =
## the scalar multiplication.
var
scratchSpace: array[1 shl window, EC]
scalarCanonicalBE: array[(scalar.bits+7) div 8, byte] # canonical big endian representation
scalarCanonicalBE.marshal(scalar, bigEndian) # Export is constant-time
scalarCanonicalBE: array[scalar.bits.ceilDiv_vartime(8), byte] # canonical big endian representation
scalarCanonicalBE.marshal(scalar, bigEndian) # Export is constant-time
P.scalarMulGeneric(scalarCanonicalBE, scratchSpace)
func scalarMul*[EC](

View File

@ -38,7 +38,7 @@ func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime
## This MUST NOT be used with secret data.
##
## This is highly VULNERABLE to timing attacks and power analysis attacks.
var scalarCanonical: array[(scalar.bits+7) div 8, byte]
var scalarCanonical: array[scalar.bits.ceilDiv_vartime(8), byte]
scalarCanonical.marshal(scalar, bigEndian)
var Paff {.noinit.}: affine(EC)

View File

@ -45,7 +45,7 @@ func computeBalancedChunks(start, stopEx, minChunkSize, maxChunkSize, targetNumC
cutoff = totalIters mod numChunks
elif baseChunkSize > maxChunkSize or (baseChunkSize == maxChunkSize and cutoff != 0):
# After cutoff, we do baseChunkSize+1, and would run afoul of the maxChunkSize constraint (unless no remainder), hence ceildiv
numChunks = (totalIters + maxChunkSize - 1) div maxChunkSize # ceildiv
numChunks = totalIters.ceilDiv_vartime(maxChunkSize)
baseChunkSize = totalIters div numChunks
cutoff = totalIters mod numChunks
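For instance, with totalIters = 100 and maxChunkSize = 30, this branch yields numChunks = ceilDiv_vartime(100, 30) = 4, baseChunkSize = 25 and cutoff = 0: four chunks of 25 iterations, none exceeding the maximum, whereas a plain 100 div 30 = 3 chunks would have forced chunks of 33-34 iterations.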

View File

@ -183,6 +183,6 @@ func powUnsafeExponent*[F; bits: static int](
## - memory access analysis
## - power analysis
## - timing analysis
var expBE {.noInit.}: array[(bits + 7) div 8, byte]
var expBE {.noInit.}: array[bits.ceilDiv_vartime(8), byte]
expBE.marshal(exponent, bigEndian)
a.powUnsafeExponent(expBE, window)

View File

@ -355,7 +355,7 @@ func marshal*(
## or zero-padded right for little-endian.
## I.e least significant bit is aligned to buffer boundary
debug:
doAssert dst.len >= (BigInt.bits + 7) div 8, block:
doAssert dst.len >= BigInt.bits.ceilDiv_vartime(8), block:
"BigInt -> Raw int conversion: destination buffer is too small\n" &
" bits: " & $BigInt.bits & "\n" &
" bytes allocated: " & $dst.len & '\n'
@ -389,7 +389,7 @@ func fromHex*(a: var BigInt, s: string) =
## Can work at compile-time to declare curve moduli from their hex strings
# 1. Convert to canonical uint
const canonLen = (BigInt.bits + 8 - 1) div 8
const canonLen = BigInt.bits.ceilDiv_vartime(8)
var bytes: array[canonLen, byte]
bytes.paddedFromHex(s, bigEndian)
@ -428,7 +428,7 @@ func appendHex*(dst: var string, big: BigInt, order: static Endianness = bigEndi
## This function may allocate.
# 1. Convert Big Int to canonical uint
const canonLen = (big.bits + 8 - 1) div 8
const canonLen = big.bits.ceilDiv_vartime(8)
var bytes: array[canonLen, byte]
marshal(bytes, big, order)

View File

@ -67,7 +67,7 @@ func roundNextMultipleOf(x: int, n: static int): int {.inline.} =
# n is a power of 2. (If compiler cannot prove that x>0 it does not make the optim)
result = (x + n - 1) and not(n - 1)
else:
result = ((x + n - 1) div n) * n
result = x.ceilDiv_vartime(n) * n
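For example, roundNextMultipleOf(13, 8) takes the power-of-two fast path, (13 + 7) and not 7 = 16, while roundNextMultipleOf(13, 6) takes the general path, ceilDiv_vartime(13, 6) * 6 = 18.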
# Stack allocation
# ----------------------------------------------------------------------------------

View File

@ -0,0 +1,77 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/macros
proc rebuildUntypedAst*(ast: NimNode, dropRootStmtList = false): NimNode =
## In some cases (generics or static proc) Nim gives us
## typed NimNode which are hard to process.
## This rebuilds an untyped AST.
##
## Additionally this allows dropping the root StmtList that
## may wrap the typed AST from early symbol resolution
proc rebuild(node: NimNode): NimNode =
proc defaultMultipleChildren(node: NimNode): NimNode =
var rTree = node.kind.newTree()
for child in node:
rTree.add rebuild(child)
return rTree
case node.kind:
of {nnkIdent, nnkSym}:
return ident($node)
of nnkEmpty:
return node
of nnkLiterals:
return node
of nnkHiddenStdConv:
if node[1].kind == nnkIntLit:
return node[1]
else:
expectKind(node[1], nnkSym)
return ident($node[1])
of nnkConv: # type conversion needs to be replaced by a function call in untyped AST
var rTree = nnkCall.newTree()
for child in node:
rTree.add rebuild(child)
return rTree
of {nnkCall, nnkInfix, nnkPrefix}:
if node[0].kind == nnkOpenSymChoice:
if node[0][0].eqIdent"contains":
var rTree = nnkInfix.newTree()
rTree.add ident"in"
rTree.add rebuild(node[2])
rTree.add rebuild(node[1])
return rTree
else:
var rTree = node.kind.newTree()
rTree.add rebuild(node[0][0])
for i in 1 ..< node.len:
rTree.add rebuild(node[i])
return rTree
elif node[0].kind == nnkClosedSymChoice:
if node[0][0].eqIdent"addr":
node.expectLen(1)
return nnkAddr.newTree(rebuild(node[1]))
else:
var rTree = node.kind.newTree()
rTree.add rebuild(node[0][0])
for i in 1 ..< node.len:
rTree.add rebuild(node[i])
return rTree
else:
return defaultMultipleChildren(node)
of nnkClosedSymChoice:
return rebuild(node[0])
else:
return defaultMultipleChildren(node)
if dropRootStmtList and ast.kind == nnkStmtList:
return rebuild(ast[0])
else:
result = rebuild(ast)
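A hypothetical usage sketch (the macro name and body are illustrative): a typed macro parameter arrives with symbols and hidden conversions already resolved, which breaks splicing into freshly generated code; rebuilding it as plain untyped idents lets the result be type-checked again in its new context:

import std/macros
# import ./ast_rebuilder   # this module

macro requote(body: typed): untyped =
  ## Re-emit a typed body as plain untyped AST so it can be
  ## spliced into generated code and type-checked again there.
  result = rebuildUntypedAst(body, dropRootStmtList = true)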

View File

@ -31,12 +31,12 @@ when GCC_Compatible:
0
else:
when sizeof(n) == 8:
cint(64) - builtin_clzll(n)
cint(63) - builtin_clzll(n)
else:
cint(31) - builtin_clz(n.uint32)
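The 64 → 63 change above fixes an off-by-one: for a 64-bit n, log2(n) = 63 - clz(n), so n = 1 (clz = 63) gives 0 and n = 2^63 (clz = 0) gives 63, whereas the old 64 - clz returned 1 and 64. The 32-bit branch, 31 - clz, was already correct.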
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zeros
## Compute the number of trailing zeros
## in the bit representation of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
@ -72,7 +72,7 @@ elif defined(icc):
if fnc(index.addr, v) == 0:
return default
return index.int
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
@ -82,7 +82,7 @@ elif defined(icc):
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, n.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:
@ -116,7 +116,7 @@ elif defined(vcc):
if fnc(index.addr, v) == 0:
return 0
return index.int
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
@ -126,7 +126,7 @@ elif defined(vcc):
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, n.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:

View File

@ -7,6 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/macros
import ../../ast_rebuilder
# ############################################################
#
@ -33,33 +34,6 @@ func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
# Macros
# ------------------------------------------------------------
proc replaceSymsByIdents*(ast: NimNode): NimNode =
proc inspect(node: NimNode): NimNode =
case node.kind:
of {nnkIdent, nnkSym}:
return ident($node)
of nnkEmpty:
return node
of nnkLiterals:
return node
of nnkHiddenStdConv:
if node[1].kind == nnkIntLit:
return node[1]
else:
expectKind(node[1], nnkSym)
return ident($node[1])
of nnkConv: # type conversion needs to be replaced by a function call in untyped AST
var rTree = nnkCall.newTree()
for child in node:
rTree.add inspect(child)
return rTree
else:
var rTree = node.kind.newTree()
for child in node:
rTree.add inspect(child)
return rTree
result = inspect(ast)
macro replacePragmasByInline(procAst: typed): untyped =
## Replace pragmas by the inline pragma
## We need a separate "typed" macro
@ -76,7 +50,7 @@ macro replacePragmasByInline(procAst: typed): untyped =
result.add newProc(
name = procAst.name,
params = params,
body = procAst.body.replaceSymsByIdents(),
body = procAst.body.rebuildUntypedAst(),
procType = nnkProcDef,
pragmas = nnkPragma.newTree(ident"inline", ident"nimcall")
)
@ -84,19 +58,19 @@ macro replacePragmasByInline(procAst: typed): untyped =
result.add nnkPragma.newTree(ident"pop")
macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
## Wraps pointer+len library calls in properly typed and converted openArray calls
## Wraps pointer+len library calls in properly typed and converted openArray calls
##
## ```
## {.push cdecl.}
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
## {.pop.}
## ```
##
##
## is transformed into
##
##
## ```
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
##
##
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
## ```

View File

@ -9,7 +9,7 @@
import
../../math/config/[curves, precompute],
../../math/io/io_bigints,
../primitives, ../bithacks, ../endians,
../primitives, ../bithacks, ../endians, ../codecs,
./llvm
# ############################################################

View File

@ -52,8 +52,25 @@ template debug*(body: untyped): untyped =
when defined(debugConstantine):
body
func unreachable*() {.noReturn.} =
proc builtin_unreachable(){.nodecl, importc: "__builtin_unreachable".}
func unreachable*() {.noReturn, inline.} =
doAssert false, "Unreachable"
when GCC_Compatible:
builtin_unreachable()
# ############################################################
#
# Arithmetic
#
# ############################################################
func ceilDiv_vartime*(a, b: auto): auto {.inline.} =
## ceil division, to be used only on length or at compile-time
## ceil(a / b)
# "LengthInDigits: static int" doesn't match "int"
# if "SomeInteger" is used instead of "autoi"
(a + b - 1) div b
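For example, ceilDiv_vartime(255, 64) == 4 while 255 div 64 == 3; the _vartime suffix signals that the division is not constant-time, hence the docstring's restriction to lengths and compile-time use.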
# ############################################################
#

View File

@ -32,16 +32,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
for i in start ..< stopEx:
result.add nnkBlockStmt.newTree(
ident("unrolledIter_" & $idx & $i),
body.replaceNodes(idx, newLit i)
)
body.replaceNodes(idx, newLit i))
macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped =
result = newStmtList()
for i in countdown(start, stopIncl):
result.add nnkBlockStmt.newTree(
ident("unrolledIter_" & $idx & $i),
body.replaceNodes(idx, newLit i)
)
body.replaceNodes(idx, newLit i))
{.experimental: "dynamicBindSym".}
@ -62,5 +60,4 @@ macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untype
for choice in choices:
result.add nnkBlockStmt.newTree(
ident($ident & "_" & $choice.intVal),
body.replaceNodes(ident, choice)
)
body.replaceNodes(ident, choice))

View File

@ -0,0 +1,9 @@
# Black & Scholes European Option Pricing
https://www.investopedia.com/terms/b/blackscholes.asp
Benchmarks from the PARSEC benchmark suite
https://parsec.cs.princeton.edu
Reference C implementation by Intel

View File

@ -0,0 +1,67 @@
Name: Black Scholes
Description: The Black-Scholes equation is a differential equation that
describes how, under a certain set of assumptions, the value of an option
changes as the price of the underlying asset changes.
The formula for a put option is similar. The cumulative normal distribution
function, CND(x), gives the probability that a normally distributed random
variable will have a value less than x. There is no closed-form expression for
this function, and as such it must be evaluated numerically. Alternatively,
the values of this function may be pre-computed and hard-coded in a table; in
this case, they can be obtained at runtime using table lookup. We compare both
of these approaches in our work. The other parameters are as follows:
S the underlying asset's current price, X the strike price, T the time to the
expiration date, r the risk-less rate of return, and v the stock's volatility.
Based on this formula, one can compute the option price analytically from
the five input parameters. Using this analytical approach to price options,
the limiting factor lies in the amount of floating-point calculation a
processor can perform.
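For reference, the closed-form price of a European call that the text alludes to (standard Black-Scholes, and what BlkSchlsEqEuroNoDiv below computes) is:

$$C = S\,N(d_1) - X e^{-rT} N(d_2), \qquad d_1 = \frac{\ln(S/X) + (r + v^2/2)\,T}{v\sqrt{T}}, \qquad d_2 = d_1 - v\sqrt{T}$$

with the put obtained from the same quantities as $P = X e^{-rT} N(-d_2) - S\,N(-d_1)$.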
Parallelization: Our parallelization algorithm is very simple: we simply price
multiple options in parallel using the Black-Scholes formula. Each thread prices an
individual option. In practice, financial houses price 10s to 100s of thousands of options using Black-Scholes.
=======================================
Programming Languages & Libraries:
C/C++ and Pthread is used to implement this benchmark.
=======================================
System requirements:
1) Intel(R) C++ Compiler: version 9.0 or higher
2) GNU gcc/g++: version 3.3 or higher
3) sed: version 4.0.9 or higher recommended.
The minimum required memory size is 140 MBytes.
=======================================
Input/Output:
The input data file of this benchmark includes an array of data of
options.
The output benchmark will output the price of the options based on the five
input parameters in the dataset file.
=======================================
Characteristics:
(1) Hotspot
Hotspot of the benchmark includes computing the price of options using the
Black-Scholes formula and the cumulative normal distribution function.
They are implemented in BlkSchlsEqEuroNoDiv and CNDF in "bs.c" respectively.
=======================================
Revision History
Date: Person-making-revision brief-description-of-revision
=======================================
Author: Victor Lee, Mikhail Smelyanskiy
Acknowledgements:
References:
[Black73] Black, Fischer, and M. Scholes. The Pricing of Options and Corporate Liabilities. Journal of Political Economy, 81:637--659, May--June 1973.

View File

@ -0,0 +1,509 @@
// Copyright (c) 2007 Intel Corp.
// Black-Scholes
// Analytical method for calculating European Options
//
//
// Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
// Hall, John C. Hull,
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#ifdef ENABLE_PARSEC_HOOKS
#include <hooks.h>
#endif
// Multi-threaded pthreads header
#ifdef ENABLE_THREADS
// Add the following line so that icc 9.0 is compatible with pthread lib.
#define __thread __threadp
MAIN_ENV
#undef __thread
#endif
// Multi-threaded OpenMP header
#ifdef ENABLE_OPENMP
#include <omp.h>
#endif
#ifdef ENABLE_TBB
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
using namespace std;
using namespace tbb;
#endif //ENABLE_TBB
// Multi-threaded header for Windows
#ifdef WIN32
#pragma warning(disable : 4305)
#pragma warning(disable : 4244)
#include <windows.h>
#endif
//Precision to use for calculations
#define fptype float
#define NUM_RUNS 100
typedef struct OptionData_ {
fptype s; // spot price
fptype strike; // strike price
fptype r; // risk-free interest rate
fptype divq; // dividend rate
fptype v; // volatility
fptype t; // time to maturity or option expiration in years
// (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
char OptionType; // Option type. "P"=PUT, "C"=CALL
fptype divs; // dividend vals (not used in this test)
fptype DGrefval; // DerivaGem Reference Value
} OptionData;
OptionData *data;
fptype *prices;
int numOptions;
int * otype;
fptype * sptprice;
fptype * strike;
fptype * rate;
fptype * volatility;
fptype * otime;
int numError = 0;
int nThreads;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Cumulative Normal Distribution Function
// See Hull, Section 11.8, P.243-244
#define inv_sqrt_2xPI 0.39894228040143270286
fptype CNDF ( fptype InputX )
{
int sign;
fptype OutputX;
fptype xInput;
fptype xNPrimeofX;
fptype expValues;
fptype xK2;
fptype xK2_2, xK2_3;
fptype xK2_4, xK2_5;
fptype xLocal, xLocal_1;
fptype xLocal_2, xLocal_3;
// Check for negative value of InputX
if (InputX < 0.0) {
InputX = -InputX;
sign = 1;
} else
sign = 0;
xInput = InputX;
// Compute NPrimeX term common to both four & six decimal accuracy calcs
expValues = exp(-0.5f * InputX * InputX);
xNPrimeofX = expValues;
xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI;
xK2 = 0.2316419 * xInput;
xK2 = 1.0 + xK2;
xK2 = 1.0 / xK2;
xK2_2 = xK2 * xK2;
xK2_3 = xK2_2 * xK2;
xK2_4 = xK2_3 * xK2;
xK2_5 = xK2_4 * xK2;
xLocal_1 = xK2 * 0.319381530;
xLocal_2 = xK2_2 * (-0.356563782);
xLocal_3 = xK2_3 * 1.781477937;
xLocal_2 = xLocal_2 + xLocal_3;
xLocal_3 = xK2_4 * (-1.821255978);
xLocal_2 = xLocal_2 + xLocal_3;
xLocal_3 = xK2_5 * 1.330274429;
xLocal_2 = xLocal_2 + xLocal_3;
xLocal_1 = xLocal_2 + xLocal_1;
xLocal = xLocal_1 * xNPrimeofX;
xLocal = 1.0 - xLocal;
OutputX = xLocal;
if (sign) {
OutputX = 1.0 - OutputX;
}
return OutputX;
}
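The polynomial above is the classic Abramowitz & Stegun approximation 26.2.17 of the normal CDF. With $k = 1/(1 + 0.2316419\,x)$:

$$N(x) \approx 1 - \frac{e^{-x^2/2}}{\sqrt{2\pi}} \left(a_1 k + a_2 k^2 + a_3 k^3 + a_4 k^4 + a_5 k^5\right)$$

where $a_1 = 0.319381530$, $a_2 = -0.356563782$, $a_3 = 1.781477937$, $a_4 = -1.821255978$, $a_5 = 1.330274429$ (absolute error below $7.5 \times 10^{-8}$), and inv_sqrt_2xPI is $1/\sqrt{2\pi}$. Negative inputs use the symmetry $N(-x) = 1 - N(x)$, which is the sign handling at the top of the function.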
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
fptype BlkSchlsEqEuroNoDiv( fptype sptprice,
fptype strike, fptype rate, fptype volatility,
fptype time, int otype, float timet )
{
fptype OptionPrice;
// local private working variables for the calculation
fptype xStockPrice;
fptype xStrikePrice;
fptype xRiskFreeRate;
fptype xVolatility;
fptype xTime;
fptype xSqrtTime;
fptype logValues;
fptype xLogTerm;
fptype xD1;
fptype xD2;
fptype xPowerTerm;
fptype xDen;
fptype d1;
fptype d2;
fptype FutureValueX;
fptype NofXd1;
fptype NofXd2;
fptype NegNofXd1;
fptype NegNofXd2;
xStockPrice = sptprice;
xStrikePrice = strike;
xRiskFreeRate = rate;
xVolatility = volatility;
xTime = time;
xSqrtTime = sqrt(xTime);
logValues = log( sptprice / strike );
xLogTerm = logValues;
xPowerTerm = xVolatility * xVolatility;
xPowerTerm = xPowerTerm * 0.5;
xD1 = xRiskFreeRate + xPowerTerm;
xD1 = xD1 * xTime;
xD1 = xD1 + xLogTerm;
xDen = xVolatility * xSqrtTime;
xD1 = xD1 / xDen;
xD2 = xD1 - xDen;
d1 = xD1;
d2 = xD2;
NofXd1 = CNDF( d1 );
NofXd2 = CNDF( d2 );
FutureValueX = strike * ( exp( -(rate)*(time) ) );
if (otype == 0) {
OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2);
} else {
NegNofXd1 = (1.0 - NofXd1);
NegNofXd2 = (1.0 - NofXd2);
OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1);
}
return OptionPrice;
}
#ifdef ENABLE_TBB
struct mainWork {
mainWork() {}
mainWork(mainWork &w, tbb::split) {}
void operator()(const tbb::blocked_range<int> &range) const {
fptype price;
int begin = range.begin();
int end = range.end();
for (int i=begin; i!=end; i++) {
/* Calling main function to calculate option value based on
* Black & Scholes's equation.
*/
price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],
rate[i], volatility[i], otime[i],
otype[i], 0);
prices[i] = price;
#ifdef ERR_CHK
fptype priceDelta = data[i].DGrefval - price;
if( fabs(priceDelta) >= 1e-5 ){
fprintf(stderr,"Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i, price, data[i].DGrefval, priceDelta);
numError ++;
}
#endif
}
}
};
#endif // ENABLE_TBB
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TBB
int bs_thread(void *tid_ptr) {
int j;
tbb::affinity_partitioner a;
mainWork doall;
for (j=0; j<NUM_RUNS; j++) {
tbb::parallel_for(tbb::blocked_range<int>(0, numOptions), doall, a);
}
return 0;
}
#else // !ENABLE_TBB
#ifdef WIN32
DWORD WINAPI bs_thread(LPVOID tid_ptr){
#else
int bs_thread(void *tid_ptr) {
#endif
int i, j;
fptype price;
fptype priceDelta;
int tid = *(int *)tid_ptr;
int start = tid * (numOptions / nThreads);
int end = start + (numOptions / nThreads);
for (j=0; j<NUM_RUNS; j++) {
#ifdef ENABLE_OPENMP
#pragma omp parallel for private(i, price, priceDelta)
for (i=0; i<numOptions; i++) {
#else //ENABLE_OPENMP
for (i=start; i<end; i++) {
#endif //ENABLE_OPENMP
/* Calling main function to calculate option value based on
* Black & Scholes's equation.
*/
price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],
rate[i], volatility[i], otime[i],
otype[i], 0);
prices[i] = price;
#ifdef ERR_CHK
priceDelta = data[i].DGrefval - price;
if( fabs(priceDelta) >= 1e-4 ){
printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i, price, data[i].DGrefval, priceDelta);
numError ++;
}
#endif
}
}
return 0;
}
#endif //ENABLE_TBB
int main (int argc, char **argv)
{
FILE *file;
int i;
int loopnum;
fptype * buffer;
int * buffer2;
int rv;
#ifdef PARSEC_VERSION
#define __PARSEC_STRING(x) #x
#define __PARSEC_XSTRING(x) __PARSEC_STRING(x)
printf("PARSEC Benchmark Suite Version "__PARSEC_XSTRING(PARSEC_VERSION)"\n");
fflush(NULL);
#else
printf("PARSEC Benchmark Suite\n");
fflush(NULL);
#endif //PARSEC_VERSION
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_begin(__parsec_blackscholes);
#endif
if (argc != 4)
{
printf("Usage:\n\t%s <nthreads> <inputFile> <outputFile>\n", argv[0]);
exit(1);
}
nThreads = atoi(argv[1]);
char *inputFile = argv[2];
char *outputFile = argv[3];
//Read input data from file
file = fopen(inputFile, "r");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", inputFile);
exit(1);
}
rv = fscanf(file, "%i", &numOptions);
if(rv != 1) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
if(nThreads > numOptions) {
printf("WARNING: Not enough work, reducing number of threads to match number of options.\n");
nThreads = numOptions;
}
#if !defined(ENABLE_THREADS) && !defined(ENABLE_OPENMP) && !defined(ENABLE_TBB)
if(nThreads != 1) {
printf("Error: <nthreads> must be 1 (serial version)\n");
exit(1);
}
#endif
// alloc spaces for the option data
data = (OptionData*)malloc(numOptions*sizeof(OptionData));
prices = (fptype*)malloc(numOptions*sizeof(fptype));
for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
{
rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);
if(rv != 9) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", inputFile);
exit(1);
}
#ifdef ENABLE_THREADS
MAIN_INITENV(,8000000,nThreads);
#endif
printf("Num of Options: %d\n", numOptions);
printf("Num of Runs: %d\n", NUM_RUNS);
#define PAD 256
#define LINESIZE 64
buffer = (fptype *) malloc(5 * numOptions * sizeof(fptype) + PAD);
sptprice = (fptype *) (((unsigned long long)buffer + PAD) & ~(LINESIZE - 1));
strike = sptprice + numOptions;
rate = strike + numOptions;
volatility = rate + numOptions;
otime = volatility + numOptions;
buffer2 = (int *) malloc(numOptions * sizeof(fptype) + PAD);
otype = (int *) (((unsigned long long)buffer2 + PAD) & ~(LINESIZE - 1));
for (i=0; i<numOptions; i++) {
otype[i] = (data[i].OptionType == 'P') ? 1 : 0;
sptprice[i] = data[i].s;
strike[i] = data[i].strike;
rate[i] = data[i].r;
volatility[i] = data[i].v;
otime[i] = data[i].t;
}
printf("Size of data: %d\n", numOptions * (sizeof(OptionData) + sizeof(int)));
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_begin();
#endif
#ifdef ENABLE_THREADS
#ifdef WIN32
HANDLE *threads;
int *nums;
threads = (HANDLE *) malloc (nThreads * sizeof(HANDLE));
nums = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
nums[i] = i;
threads[i] = CreateThread(0, 0, bs_thread, &nums[i], 0, 0);
}
WaitForMultipleObjects(nThreads, threads, TRUE, INFINITE);
free(threads);
free(nums);
#else
int *tids;
tids = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
tids[i]=i;
CREATE_WITH_ARG(bs_thread, &tids[i]);
}
WAIT_FOR_END(nThreads);
free(tids);
#endif //WIN32
#else //ENABLE_THREADS
#ifdef ENABLE_OPENMP
{
int tid=0;
omp_set_num_threads(nThreads);
bs_thread(&tid);
}
#else //ENABLE_OPENMP
#ifdef ENABLE_TBB
tbb::task_scheduler_init init(nThreads);
int tid=0;
bs_thread(&tid);
#else //ENABLE_TBB
//serial version
int tid=0;
bs_thread(&tid);
#endif //ENABLE_TBB
#endif //ENABLE_OPENMP
#endif //ENABLE_THREADS
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_end();
#endif
//Write prices to output file
file = fopen(outputFile, "w");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", outputFile);
exit(1);
}
rv = fprintf(file, "%i\n", numOptions);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
for(i=0; i<numOptions; i++) {
rv = fprintf(file, "%.18f\n", prices[i]);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", outputFile);
exit(1);
}
#ifdef ERR_CHK
printf("Num Errors: %d\n", numError);
#endif
free(data);
free(prices);
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_end();
#endif
return 0;
}

View File

@ -0,0 +1,598 @@
// Copyright (c) 2007 Intel Corp.
// Black-Scholes
// Analytical method for calculating European Options
//
//
// Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
// Hall, John C. Hull,
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#ifndef WIN32
#include <pmmintrin.h>
#else
#include <xmmintrin.h>
#endif
#ifdef ENABLE_PARSEC_HOOKS
#include <hooks.h>
#endif
// Multi-threaded pthreads header
#ifdef ENABLE_THREADS
// Add the following line so that icc 9.0 is compatible with pthread lib.
#define __thread __threadp
MAIN_ENV
#undef __thread
#endif
// Multi-threaded OpenMP header
#ifdef ENABLE_OPENMP
#include <omp.h>
#endif
#ifdef ENABLE_TBB
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
using namespace std;
using namespace tbb;
#endif //ENABLE_TBB
// Multi-threaded header for Windows
#ifdef WIN32
#pragma warning(disable : 4305)
#pragma warning(disable : 4244)
#include <windows.h>
#endif
#ifdef __GNUC__
#define _MM_ALIGN16 __attribute__((aligned (16)))
#define MUSTINLINE __attribute__((always_inline))
#else
#define MUSTINLINE __forceinline
#endif
// NCO = Number of Concurrent Options = SIMD Width
// NCO is currently set in the Makefile.
#ifndef NCO
#error NCO must be defined.
#endif
#if (NCO==2)
#define fptype double
#define SIMD_WIDTH 2
#define _MMR __m128d
#define _MM_LOAD _mm_load_pd
#define _MM_STORE _mm_store_pd
#define _MM_MUL _mm_mul_pd
#define _MM_ADD _mm_add_pd
#define _MM_SUB _mm_sub_pd
#define _MM_DIV _mm_div_pd
#define _MM_SQRT _mm_sqrt_pd
#define _MM_SET(A) _mm_set_pd(A,A)
#define _MM_SETR _mm_set_pd
#endif
#if (NCO==4)
#define fptype float
#define SIMD_WIDTH 4
#define _MMR __m128
#define _MM_LOAD _mm_load_ps
#define _MM_STORE _mm_store_ps
#define _MM_MUL _mm_mul_ps
#define _MM_ADD _mm_add_ps
#define _MM_SUB _mm_sub_ps
#define _MM_DIV _mm_div_ps
#define _MM_SQRT _mm_sqrt_ps
#define _MM_SET(A) _mm_set_ps(A,A,A,A)
#define _MM_SETR _mm_set_ps
#endif
#define NUM_RUNS 100
typedef struct OptionData_ {
fptype s; // spot price
fptype strike; // strike price
fptype r; // risk-free interest rate
fptype divq; // dividend rate
fptype v; // volatility
fptype t; // time to maturity or option expiration in years
// (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
char OptionType; // Option type. "P"=PUT, "C"=CALL
fptype divs; // dividend vals (not used in this test)
fptype DGrefval; // DerivaGem Reference Value
} OptionData;
_MM_ALIGN16 OptionData* data;
_MM_ALIGN16 fptype* prices;
int numOptions;
int * otype;
fptype * sptprice;
fptype * strike;
fptype * rate;
fptype * volatility;
fptype * otime;
int numError = 0;
int nThreads;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Cumulative Normal Distribution Function
// See Hull, Section 11.8, P.243-244
#define inv_sqrt_2xPI 0.39894228040143270286
MUSTINLINE void CNDF ( fptype * OutputX, fptype * InputX )
{
int sign[SIMD_WIDTH];
int i;
_MMR xInput;
_MMR xNPrimeofX;
_MM_ALIGN16 fptype expValues[SIMD_WIDTH];
_MMR xK2;
_MMR xK2_2, xK2_3, xK2_4, xK2_5;
_MMR xLocal, xLocal_1, xLocal_2, xLocal_3;
for (i=0; i<SIMD_WIDTH; i++) {
// Check for negative value of InputX
if (InputX[i] < 0.0) {
InputX[i] = -InputX[i];
sign[i] = 1;
} else
sign[i] = 0;
}
// printf("InputX[0]=%lf\n", InputX[0]);
// printf("InputX[1]=%lf\n", InputX[1]);
xInput = _MM_LOAD(InputX);
// local vars
// Compute NPrimeX term common to both four & six decimal accuracy calcs
for (i=0; i<SIMD_WIDTH; i++) {
expValues[i] = exp(-0.5f * InputX[i] * InputX[i]);
// printf("exp[%d]: %f\n", i, expValues[i]);
}
xNPrimeofX = _MM_LOAD(expValues);
xNPrimeofX = _MM_MUL(xNPrimeofX, _MM_SET(inv_sqrt_2xPI));
xK2 = _MM_MUL(_MM_SET(0.2316419), xInput);
xK2 = _MM_ADD(xK2, _MM_SET(1.0));
xK2 = _MM_DIV(_MM_SET(1.0), xK2);
// xK2 = _mm_rcp_pd(xK2); // No rcp function for double-precision
xK2_2 = _MM_MUL(xK2, xK2);
xK2_3 = _MM_MUL(xK2_2, xK2);
xK2_4 = _MM_MUL(xK2_3, xK2);
xK2_5 = _MM_MUL(xK2_4, xK2);
xLocal_1 = _MM_MUL(xK2, _MM_SET(0.319381530));
xLocal_2 = _MM_MUL(xK2_2, _MM_SET(-0.356563782));
xLocal_3 = _MM_MUL(xK2_3, _MM_SET(1.781477937));
xLocal_2 = _MM_ADD(xLocal_2, xLocal_3);
xLocal_3 = _MM_MUL(xK2_4, _MM_SET(-1.821255978));
xLocal_2 = _MM_ADD(xLocal_2, xLocal_3);
xLocal_3 = _MM_MUL(xK2_5, _MM_SET(1.330274429));
xLocal_2 = _MM_ADD(xLocal_2, xLocal_3);
xLocal_1 = _MM_ADD(xLocal_2, xLocal_1);
xLocal = _MM_MUL(xLocal_1, xNPrimeofX);
xLocal = _MM_SUB(_MM_SET(1.0), xLocal);
_MM_STORE(OutputX, xLocal);
// _mm_storel_pd(&OutputX[0], xLocal);
// _mm_storeh_pd(&OutputX[1], xLocal);
for (i=0; i<SIMD_WIDTH; i++) {
if (sign[i]) {
OutputX[i] = (1.0 - OutputX[i]);
}
}
}
// For debugging
void print_xmm(_MMR in, char* s) {
int i;
_MM_ALIGN16 fptype val[SIMD_WIDTH];
_MM_STORE(val, in);
printf("%s: ", s);
for (i=0; i<SIMD_WIDTH; i++) {
printf("%f ", val[i]);
}
printf("\n");
}
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
void BlkSchlsEqEuroNoDiv (fptype * OptionPrice, int numOptions, fptype * sptprice,
fptype * strike, fptype * rate, fptype * volatility,
fptype * time, int * otype, float timet)
{
int i;
// local private working variables for the calculation
_MMR xStockPrice;
_MMR xStrikePrice;
_MMR xRiskFreeRate;
_MMR xVolatility;
_MMR xTime;
_MMR xSqrtTime;
_MM_ALIGN16 fptype logValues[NCO];
_MMR xLogTerm;
_MMR xD1, xD2;
_MMR xPowerTerm;
_MMR xDen;
_MM_ALIGN16 fptype d1[SIMD_WIDTH];
_MM_ALIGN16 fptype d2[SIMD_WIDTH];
_MM_ALIGN16 fptype FutureValueX[SIMD_WIDTH];
_MM_ALIGN16 fptype NofXd1[SIMD_WIDTH];
_MM_ALIGN16 fptype NofXd2[SIMD_WIDTH];
_MM_ALIGN16 fptype NegNofXd1[SIMD_WIDTH];
_MM_ALIGN16 fptype NegNofXd2[SIMD_WIDTH];
xStockPrice = _MM_LOAD(sptprice);
xStrikePrice = _MM_LOAD(strike);
xRiskFreeRate = _MM_LOAD(rate);
xVolatility = _MM_LOAD(volatility);
xTime = _MM_LOAD(time);
xSqrtTime = _MM_SQRT(xTime);
for(i=0; i<SIMD_WIDTH;i ++) {
logValues[i] = log(sptprice[i] / strike[i]);
}
xLogTerm = _MM_LOAD(logValues);
xPowerTerm = _MM_MUL(xVolatility, xVolatility);
xPowerTerm = _MM_MUL(xPowerTerm, _MM_SET(0.5));
xD1 = _MM_ADD(xRiskFreeRate, xPowerTerm);
xD1 = _MM_MUL(xD1, xTime);
xD1 = _MM_ADD(xD1, xLogTerm);
xDen = _MM_MUL(xVolatility, xSqrtTime);
xD1 = _MM_DIV(xD1, xDen);
xD2 = _MM_SUB(xD1, xDen);
_MM_STORE(d1, xD1);
_MM_STORE(d2, xD2);
CNDF( NofXd1, d1 );
CNDF( NofXd2, d2 );
for (i=0; i<SIMD_WIDTH; i++) {
FutureValueX[i] = strike[i] * (exp(-(rate[i])*(time[i])));
// printf("FV=%lf\n", FutureValueX[i]);
// NofXd1[i] = NofX(d1[i]);
// NofXd2[i] = NofX(d2[i]);
// printf("NofXd1=%lf\n", NofXd1[i]);
// printf("NofXd2=%lf\n", NofXd2[i]);
if (otype[i] == 0) {
OptionPrice[i] = (sptprice[i] * NofXd1[i]) - (FutureValueX[i] * NofXd2[i]);
}
else {
NegNofXd1[i] = (1.0 - (NofXd1[i]));
NegNofXd2[i] = (1.0 - (NofXd2[i]));
OptionPrice[i] = (FutureValueX[i] * NegNofXd2[i]) - (sptprice[i] * NegNofXd1[i]);
}
// printf("OptionPrice[0] = %lf\n", OptionPrice[i]);
}
}
#ifdef ENABLE_TBB
struct mainWork {
mainWork(){}
mainWork(mainWork &w, tbb::split){}
void operator()(const tbb::blocked_range<int> &range) const {
fptype price[NCO];
fptype priceDelta;
int begin = range.begin();
int end = range.end();
for (int i=begin; i!=end; i+=NCO) {
/* Calling main function to calculate option value based on
* Black & Scholes's equation.
*/
BlkSchlsEqEuroNoDiv( price, NCO, &(sptprice[i]), &(strike[i]),
&(rate[i]), &(volatility[i]), &(otime[i]),
&(otype[i]), 0);
for (int k=0; k<NCO; k++) {
prices[i+k] = price[k];
#ifdef ERR_CHK
priceDelta = data[i+k].DGrefval - price[k];
if( fabs(priceDelta) >= 1e-5 ){
fprintf(stderr,"Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i+k, price[k], data[i+k].DGrefval, priceDelta);
numError ++;
}
#endif
}
}
}
};
#endif // ENABLE_TBB
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TBB
int bs_thread(void *tid_ptr) {
int j;
tbb::affinity_partitioner a;
mainWork doall;
for (j=0; j<NUM_RUNS; j++) {
tbb::parallel_for(tbb::blocked_range<int>(0, numOptions), doall, a);
}
return 0;
}
#else // !ENABLE_TBB
#ifdef WIN32
DWORD WINAPI bs_thread(LPVOID tid_ptr){
#else
int bs_thread(void *tid_ptr) {
#endif
int i, j, k;
fptype price[NCO];
fptype priceDelta;
int tid = *(int *)tid_ptr;
int start = tid * (numOptions / nThreads);
int end = start + (numOptions / nThreads);
for (j=0; j<NUM_RUNS; j++) {
#ifdef ENABLE_OPENMP
#pragma omp parallel for private(i, price, priceDelta)
for (i=0; i<numOptions; i += NCO) {
#else //ENABLE_OPENMP
for (i=start; i<end; i += NCO) {
#endif //ENABLE_OPENMP
// Calling main function to calculate option value based on Black & Scholes's
// equation.
BlkSchlsEqEuroNoDiv(price, NCO, &(sptprice[i]), &(strike[i]),
&(rate[i]), &(volatility[i]), &(otime[i]), &(otype[i]), 0);
for (k=0; k<NCO; k++) {
prices[i+k] = price[k];
}
#ifdef ERR_CHK
for (k=0; k<NCO; k++) {
priceDelta = data[i+k].DGrefval - price[k];
if (fabs(priceDelta) >= 1e-4) {
printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i + k, price[k], data[i+k].DGrefval, priceDelta);
numError ++;
}
}
#endif
}
}
return 0;
}
#endif //ENABLE_TBB
int main (int argc, char **argv)
{
FILE *file;
int i;
int loopnum;
fptype * buffer;
int * buffer2;
int rv;
#ifdef PARSEC_VERSION
#define __PARSEC_STRING(x) #x
#define __PARSEC_XSTRING(x) __PARSEC_STRING(x)
printf("PARSEC Benchmark Suite Version "__PARSEC_XSTRING(PARSEC_VERSION)"\n");
fflush(NULL);
#else
printf("PARSEC Benchmark Suite\n");
fflush(NULL);
#endif //PARSEC_VERSION
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_begin(__parsec_blackscholes);
#endif
if (argc != 4)
{
printf("Usage:\n\t%s <nthreads> <inputFile> <outputFile>\n", argv[0]);
exit(1);
}
nThreads = atoi(argv[1]);
char *inputFile = argv[2];
char *outputFile = argv[3];
//Read input data from file
file = fopen(inputFile, "r");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", inputFile);
exit(1);
}
rv = fscanf(file, "%i", &numOptions);
if(rv != 1) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
if(NCO > numOptions) {
printf("ERROR: Not enough work for SIMD operation.\n");
fclose(file);
exit(1);
}
if(nThreads > numOptions/NCO) {
printf("WARNING: Not enough work, reducing number of threads to match number of SIMD options packets.\n");
nThreads = numOptions/NCO;
}
#if !defined(ENABLE_THREADS) && !defined(ENABLE_OPENMP) && !defined(ENABLE_TBB)
if(nThreads != 1) {
printf("Error: <nthreads> must be 1 (serial version)\n");
exit(1);
}
#endif
data = (OptionData*)malloc(numOptions*sizeof(OptionData));
prices = (fptype*)malloc(numOptions*sizeof(fptype));
for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
{
rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);
if(rv != 9) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", inputFile);
exit(1);
}
#ifdef ENABLE_THREADS
MAIN_INITENV(,8000000,nThreads);
#endif
printf("Num of Options: %d\n", numOptions);
printf("Num of Runs: %d\n", NUM_RUNS);
#define PAD 256
#define LINESIZE 64
buffer = (fptype *) malloc(5 * numOptions * sizeof(fptype) + PAD);
sptprice = (fptype *) (((unsigned long long)buffer + PAD) & ~(LINESIZE - 1));
strike = sptprice + numOptions;
rate = strike + numOptions;
volatility = rate + numOptions;
otime = volatility + numOptions;
buffer2 = (int *) malloc(numOptions * sizeof(fptype) + PAD);
otype = (int *) (((unsigned long long)buffer2 + PAD) & ~(LINESIZE - 1));
for (i=0; i<numOptions; i++) {
otype[i] = (data[i].OptionType == 'P') ? 1 : 0;
sptprice[i] = data[i].s;
strike[i] = data[i].strike;
rate[i] = data[i].r;
volatility[i] = data[i].v;
otime[i] = data[i].t;
}
printf("Size of data: %d\n", numOptions * (sizeof(OptionData) + sizeof(int)));
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_begin();
#endif
#ifdef ENABLE_THREADS
#ifdef WIN32
HANDLE *threads;
int *nums;
threads = (HANDLE *) malloc (nThreads * sizeof(HANDLE));
nums = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
nums[i] = i;
threads[i] = CreateThread(0, 0, bs_thread, &nums[i], 0, 0);
}
WaitForMultipleObjects(nThreads, threads, TRUE, INFINITE);
free(threads);
free(nums);
#else
int *tids;
tids = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
tids[i]=i;
CREATE_WITH_ARG(bs_thread, &tids[i]);
}
WAIT_FOR_END(nThreads);
free(tids);
#endif //WIN32
#else //ENABLE_THREADS
#ifdef ENABLE_OPENMP
{
int tid=0;
omp_set_num_threads(nThreads);
bs_thread(&tid);
}
#else //ENABLE_OPENMP
#ifdef ENABLE_TBB
tbb::task_scheduler_init init(nThreads);
int tid=0;
bs_thread(&tid);
#else //ENABLE_TBB
//serial version
int tid=0;
bs_thread(&tid);
#endif //ENABLE_TBB
#endif //ENABLE_OPENMP
#endif //ENABLE_THREADS
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_end();
#endif
//Write prices to output file
file = fopen(outputFile, "w");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", outputFile);
exit(1);
}
rv = fprintf(file, "%i\n", numOptions);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
for(i=0; i<numOptions; i++) {
rv = fprintf(file, "%.18f\n", prices[i]);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", outputFile);
exit(1);
}
#ifdef ERR_CHK
printf("Num Errors: %d\n", numError);
#endif
free(data);
free(prices);
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_end();
#endif
return 0;
}

View File

@ -0,0 +1,122 @@
define(EXTERN_ENV,
`
#include <time.h>
extern pthread_t _M4_threads[MAX_THREADS];
extern pthread_mutexattr_t _M4_normalMutexAttr;
extern int _M4_numThreads;
')
define(MAIN_ENV,
`
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 700
#endif
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#ifndef __USE_XOPEN2K
#define __USE_XOPEN2K
#endif
#ifndef __USE_UNIX98
#define __USE_UNIX98
#endif
#include <pthread.h>
#include <time.h>
#define MAX_THREADS 128
pthread_t _M4_threadsTable[MAX_THREADS];
int _M4_threadsTableAllocated[MAX_THREADS];
pthread_mutexattr_t _M4_normalMutexAttr;
int _M4_numThreads = MAX_THREADS;
')
define(MAIN_INITENV, `
pthread_mutexattr_init( &_M4_normalMutexAttr);
// pthread_mutexattr_settype( &_M4_normalMutexAttr, PTHREAD_MUTEX_NORMAL);
_M4_numThreads = $3;
{
int _M4_i;
for ( _M4_i = 0; _M4_i < MAX_THREADS; _M4_i++) {
_M4_threadsTableAllocated[_M4_i] = 0;
}
}
')
define(MAIN_END, `')
define(CREATE_WITH_ARG, `
{
int _M4_i;
for ( _M4_i = 0; _M4_i < MAX_THREADS; _M4_i++) {
if ( _M4_threadsTableAllocated[_M4_i] == 0) break;
}
pthread_create(&_M4_threadsTable[_M4_i],NULL,(void *(*)(void *))$1,(void *)$2);
_M4_threadsTableAllocated[_M4_i] = 1;
}
')
define(CREATE, `CREATE_WITH_ARG($1,NULL);')
define(SELF, `( long)pthread_self()')
define(BARDEC, `pthread_barrier_t $1;')
define(BARINIT, `pthread_barrier_init(&($1),NULL,_M4_numThreads);')
define(BARRIER, `pthread_barrier_wait(&($1))')
define(LOCKDEC, `pthread_mutex_t $1;')
define(LOCKINIT, `pthread_mutex_init(&($1), &_M4_normalMutexAttr);')
define(LOCK, `pthread_mutex_lock(&($1));')
define(UNLOCK, `pthread_mutex_unlock(&($1));')
define(LOCKRDEC, `pthread_mutex_t $1;')
define(LOCKRINIT, `pthread_mutex_init(&($1), NULL);')
define(LOCKR, `pthread_mutex_lock(&($1));')
define(UNLOCKR, `pthread_mutex_unlock(&($1));')
define(CVDEC, `pthread_cond_t $1;')
define(CVINIT, `pthread_cond_init(&$1,NULL);')
define(CVWAIT, `pthread_cond_wait(&$1,&$2);')
define(CVWAITREL, `pthread_cond_wait(&$1,&$2); pthread_mutex_unlock(&$2);')
define(CVSIGNALALL, `pthread_cond_broadcast(&$1);')
define(CVSIGNALONE, `pthread_cond_signal(&$1);')
define(ACQUIRE, `pthread_mutex_lock(&($1));')
define(RELEASE, `pthread_mutex_unlock(&($1));')
define(ALOCKDEC, `pthread_mutex_t ($1[$2]);')
define(ALOCKINIT, `{
int _M4_loop_j;
for(_M4_loop_j=0; _M4_loop_j < $2; _M4_loop_j++){
pthread_mutex_init((pthread_mutex_t*)&($1[_M4_loop_j]), NULL);
}
}')
define(ALOCK, `pthread_mutex_lock((pthread_mutex_t*)&($1[$2]));')
define(AULOCK, `pthread_mutex_unlock((pthread_mutex_t*)&($1[$2]));')
define(AACQUIRE, `pthread_mutex_lock(&($1[$2]));')
define(ARELEASE, `pthread_mutex_unlock(&($1[$2]));')
define(WAIT_FOR_END, `
{
int _M4_i;
void *_M4_ret;
for ( _M4_i = 0; _M4_i < MAX_THREADS;_M4_i++) {
if ( _M4_threadsTableAllocated[_M4_i] == 0) break;
pthread_join( _M4_threadsTable[_M4_i], &_M4_ret);
}
}
')
define(JOIN, `{pthread_join( _M4_threadsTable[($1)], NULL );}')
define(CLOCK, `{long time(); ($1) = time(0);}')
define(GET_PID, `$1 = pthread_self();')
define(AUG_ON, `')
define(AUG_OFF, `')
define(G_MALLOC, `malloc($1);')
define(MALLOC, `malloc($1);')

View File

@ -0,0 +1,85 @@
//Copyright (c) 2009 Princeton University
//Written by Christian Bienia
//Generate input files for blackscholes benchmark
#include <stdio.h>
#include <stdlib.h>
//Precision to use
#define fptype double
typedef struct OptionData_ {
fptype s; // spot price
fptype strike; // strike price
fptype r; // risk-free interest rate
fptype divq; // dividend rate
fptype v; // volatility
fptype t; // time to maturity or option expiration in years
// (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
const char *OptionType; // Option type. "P"=PUT, "C"=CALL
fptype divs; // dividend vals (not used in this test)
fptype DGrefval; // DerivaGem Reference Value
} OptionData;
//Total number of options in optionData.txt
#define MAX_OPTIONS 1000
OptionData data_init[] = {
#include "optionData.txt"
};
int main (int argc, char **argv) {
int numOptions;
char *fileName;
int rv;
int i;
if (argc != 3) {
printf("Usage:\n\t%s <numOptions> <fileName>\n", argv[0]);
exit(1);
}
numOptions = atoi(argv[1]);
fileName = argv[2];
if(numOptions < 1) {
printf("ERROR: Number of options must at least be 1.\n");
exit(1);
}
FILE *file;
file = fopen(fileName, "w");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", fileName);
exit(1);
}
//write number of options
rv = fprintf(file, "%i\n", numOptions);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", fileName);
fclose(file);
exit(1);
}
//write values for options
for(i=0; i<numOptions; i++) {
//NOTE: DG RefValues specified exceed double precision, output will deviate
rv = fprintf(file, "%.2f %.2f %.4f %.2f %.2f %.2f %c %.2f %.18f\n", data_init[i % MAX_OPTIONS].s, data_init[i % MAX_OPTIONS].strike, data_init[i % MAX_OPTIONS].r, data_init[i % MAX_OPTIONS].divq, data_init[i % MAX_OPTIONS].v, data_init[i % MAX_OPTIONS].t, data_init[i % MAX_OPTIONS].OptionType[0], data_init[i % MAX_OPTIONS].divs, data_init[i % MAX_OPTIONS].DGrefval);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", fileName);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", fileName);
exit(1);
}
return 0;
}

View File

@ -0,0 +1,366 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Reference implementation from
# // Copyright (c) 2007 Intel Corp.
#
# // Black-Scholes
# // Analytical method for calculating European Options
# //
# //
# // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
# // Hall, John C. Hull,
#
# File manipulation routines ported from C++-Taskflow
import
# Stdlib
strformat, os, strutils, math, system/ansi_c,
cpuinfo, streams, strscans,
# bench
../wtime, ../resources
# Types
# --------------------------------
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}
type
OptionKind = enum
Put
Call
OptionData[T: SomeFloat] = object
spot: T # Spot price
strike: T # Strike price
riskfree: T # risk-free rate
divrate: T # dividend rate
vol: T # volatility
expiry: T # time to maturity or option expiration in years
# (1 year = 1.0, 6 months = 0.5)
kind: OptionKind
divvals: T # Dividend values (not used in this test)
dgrefval: T # DerivaGem reference value
Context[T: SomeFloat] = object
data: ptr UncheckedArray[OptionData[T]]
prices: ptr UncheckedArray[T]
numOptions: int
numRuns: int
otype: ptr UncheckedArray[OptionKind]
spot: ptr UncheckedArray[T]
strike: ptr UncheckedArray[T]
riskFreeRate: ptr UncheckedArray[T]
volatility: ptr UncheckedArray[T]
expiry: ptr UncheckedArray[T]
numErrors: int
# Helpers
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
proc initialize[T](ctx: var Context[T], numOptions: int) =
ctx.numOptions = numOptions
ctx.data = wv_alloc(OptionData[T], numOptions)
ctx.prices = wv_alloc(T, numOptions)
ctx.otype = wv_alloc(OptionKind, numOptions)
ctx.spot = wv_alloc(T, numOptions)
ctx.strike = wv_alloc(T, numOptions)
ctx.riskFreeRate = wv_alloc(T, numOptions)
ctx.volatility = wv_alloc(T, numOptions)
ctx.expiry = wv_alloc(T, numOptions)
proc delete[T](ctx: sink Context[T]) =
wv_free ctx.data
wv_free ctx.prices
wv_free ctx.otype
wv_free ctx.spot
wv_free ctx.strike
wv_free ctx.riskFreeRate
wv_free ctx.volatility
wv_free ctx.expiry
# Cumulative Normal Distribution Function
# ---------------------------------------------------
# See Hull, Section 11.8, P.243-244
const InvSqrt2xPI = 0.39894228040143270286
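# The polynomial below is the Abramowitz & Stegun 26.2.17 approximation (|error| < 7.5e-8);
# InvSqrt2xPI is 1/sqrt(2*π), the normalization constant of the standard normal PDF.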
func cumulNormalDist[T: SomeFloat](inputX: T): T =
# Check for negative value of inputX
var isNegative = false
var inputX = inputX
if inputX < 0.T:
inputX = -inputX
isNegative = true
let xInput = inputX
# Compute NPrimeX term common to both four & six decimal accuracy calcs
let expValues = exp(-0.5.T * inputX * inputX)
let xNPrimeofX = expValues * InvSqrt2xPI
let
xK2 = 1.0 / (1.0 + 0.2316419*xInput)
xK2_2 = xK2 * xK2
xK2_3 = xK2_2 * xK2
xK2_4 = xK2_3 * xK2
xK2_5 = xK2_4 * xK2
var
xLocal_1 = xK2 * 0.319381530
xLocal_2 = xK2_2 * -0.356563782
xLocal_3 = xK2_3 * 1.781477937
xLocal_2 += xLocal_3
xLocal_3 = xK2_4 * -1.821255978
xLocal_2 += xLocal_3
xLocal_3 = xK2_5 * 1.330274429
xLocal_2 += xLocal_3
xLocal_1 += xLocal_2
result = 1.T - xLocal_1 * xNPrimeofX
if isNegative:
result = 1.T - result
# Black & Scholes
# ---------------------------------------------------
func blackScholesEqEuroNoDiv[T](
spot, strike, riskFreeRate, volatility, expiry: T,
otype: OptionKind,
timet: float32
): T =
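# Black-Scholes for a European option without dividends:
#   d1 = (ln(S/K) + (r + σ²/2)·t) / (σ·√t),  d2 = d1 - σ·√t
#   Call = S·N(d1) - K·e^(-r·t)·N(d2), Put via the symmetry N(-x) = 1 - N(x)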
var xD1 = riskFreeRate + 0.5.T * volatility * volatility
xD1 *= expiry
xD1 += ln(spot / strike)
var xDen = volatility * sqrt(expiry)
xD1 /= xDen
let xD2 = xD1 - xDen
let nofXd1 = cumulNormalDist(xD1)
let nofXd2 = cumulNormalDist(xD2)
let futureValueX = strike * exp(-riskFreeRate*expiry)
if otype == Call:
result = spot * nofXd1 - futureValueX * nofXd2
else:
let negNofxd1 = 1.T - nofXd1
let negNofxd2 = 1.T - nofXd2
result = futureValueX * negNofxd2 - spot * negNofXd1
func checkErrors[T](ctx: var Context[T], i: int, price: T) =
let priceDelta = ctx.data[i].dgrefval - price
if abs(priceDelta) >= 1e-4:
c_printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i.int, price, ctx.data[i].dgrefval, priceDelta
)
ctx.numErrors += 1
func blackScholesSequential(ctx: var Context) =
for j in 0 ..< ctx.numRuns:
for i in 0 ..< ctx.numOptions:
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx, i, price)
proc blackScholesOpenMP(ctx: ptr Context) =
# Stacktraces will create issues between thread-local GC and OpenMP
{.push stackTrace: off.}
for j in 0 ..< ctx.numRuns:
for i in 0||(ctx.numOptions-1):
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx[], i, price)
{.pop.}
proc dump[T](ctx: Context[T], file: string) =
let stream = openFileStream(file, fmWrite)
defer: stream.close()
stream.write($ctx.numOptions)
stream.write("\n")
for i in 0 ..< ctx.numOptions:
stream.write($ctx.prices[i])
stream.write("\n")
proc parseOptions[T](ctx: var Context[T], optFile: string) =
let stream = openFileStream(optFile, fmRead)
defer: stream.close()
var line: string
discard stream.readLine(line)
# Allocate the buffers
# Not sure why the original bench uses a struct of arrays
let numOptions = line.parseInt()
ctx.initialize(numOptions)
echo "Reading ", numOptions, " options"
# For parsing Nim uses float64 by default
var
spot, strike, riskfree, divrate, vol, expiry: float64
optKind: string
divvals, dgrefval: float64
for i in 0 ..< ctx.numOptions:
discard stream.readLine(line)
let isLineParsed = scanf(line, "$f $f $f $f $f $f $w $f $f",
spot, strike, riskFree,
divrate, vol, expiry,
optKind, divvals, dgrefval
)
doAssert isLineParsed
ctx.data[i].spot = spot.T
ctx.data[i].strike = strike.T
ctx.data[i].riskfree = riskfree.T
ctx.data[i].divrate = divrate.T
ctx.data[i].vol = vol.T
ctx.data[i].expiry = expiry.T
ctx.data[i].divvals = divvals.T
ctx.data[i].dgrefval = dgrefval.T
if optKind == "C":
ctx.data[i].kind = Call
elif optKind == "P":
ctx.data[i].kind = Put
else:
raise newException(ValueError, "Invalid option kind: \"" & optKind & '"')
ctx.otype[i] = ctx.data[i].kind
ctx.spot[i] = ctx.data[i].spot
ctx.strike[i] = ctx.data[i].strike
ctx.riskFreeRate[i] = ctx.data[i].riskfree
ctx.volatility[i] = ctx.data[i].vol
ctx.expiry[i] = ctx.data[i].expiry
proc main() =
var
input = ""
output = ""
numRounds = 2000
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 0
elif paramCount() == 1:
input = paramStr(1)
elif paramCount() == 2:
input = paramStr(1)
output = paramStr(2)
elif paramCount() == 3:
input = paramStr(1)
output = paramStr(2)
numRounds = paramStr(3).parseInt()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 1
var ctx: Context[float32]
ctx.numRuns = numRounds
ctx.parseOptions(input)
var nthreads: int
if existsEnv"OMP_NUM_THREADS":
nthreads = getEnv"OMP_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
let start = wtime_msec()
blackScholesOpenMP(ctx.addr)
let stop = wtime_msec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
echo "--------------------------------------------------------------------------"
echo "Scheduler: OpenMP (Nim)"
echo "Benchmark: Black & Scholes Option Pricing"
echo "Threads: ", nthreads
echo "Time(ms) ", round(stop - start, 3)
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
echo "# of rounds: ", numRounds
echo "# of options: ", ctx.numOptions
if output != "":
echo "\nDumping prices to \"", output, '"'
dump(ctx, output)
delete(ctx)
quit 0
main()

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,372 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Reference implementation from
# // Copyright (c) 2007 Intel Corp.
#
# // Black-Scholes
# // Analytical method for calculating European Options
# //
# //
# // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
# // Hall, John C. Hull,
#
# File manipulation routines ported from C++-Taskflow
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo, streams, strscans],
# Constantine
../../threadpool
when defined(linux):
# bench
import ../wtime, ../resources
# Types
# --------------------------------
type
OptionKind = enum
Put
Call
OptionData[T: SomeFloat] = object
spot: T # Spot price
strike: T # Strike price
riskfree: T # risk-free rate
divrate: T # dividend rate
vol: T # volatility
expiry: T # time to maturity or option expiration in years
# (1 year = 1.0, 6 months = 0.5)
kind: OptionKind
divvals: T # Dividend values (not used in this test)
dgrefval: T # DerivaGem reference value
Context[T: SomeFloat] = object
data: ptr UncheckedArray[OptionData[T]]
prices: ptr UncheckedArray[T]
numOptions: int
numRuns: int
otype: ptr UncheckedArray[OptionKind]
spot: ptr UncheckedArray[T]
strike: ptr UncheckedArray[T]
riskFreeRate: ptr UncheckedArray[T]
volatility: ptr UncheckedArray[T]
expiry: ptr UncheckedArray[T]
numErrors: int
# Helpers
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
proc initialize[T](ctx: var Context[T], numOptions: int) =
ctx.numOptions = numOptions
ctx.data = wv_alloc(OptionData[T], numOptions)
ctx.prices = wv_alloc(T, numOptions)
ctx.otype = wv_alloc(OptionKind, numOptions)
ctx.spot = wv_alloc(T, numOptions)
ctx.strike = wv_alloc(T, numOptions)
ctx.riskFreeRate = wv_alloc(T, numOptions)
ctx.volatility = wv_alloc(T, numOptions)
ctx.expiry = wv_alloc(T, numOptions)
proc delete[T](ctx: sink Context[T]) =
wv_free ctx.data
wv_free ctx.prices
wv_free ctx.otype
wv_free ctx.spot
wv_free ctx.strike
wv_free ctx.riskFreeRate
wv_free ctx.volatility
wv_free ctx.expiry
# Cumulative Normal Distribution Function
# ---------------------------------------------------
# See Hull, Section 11.8, P.243-244
const InvSqrt2xPI = 0.39894228040143270286
func cumulNormalDist[T: SomeFloat](inputX: T): T =
# Check for negative value of inputX
var isNegative = false
var inputX = inputX
if inputX < 0.T:
inputX = -inputX
isNegative = true
let xInput = inputX
# Compute NPrimeX term common to both four & six decimal accuracy calcs
let expValues = exp(-0.5.T * inputX * inputX)
let xNPrimeofX = expValues * InvSqrt2xPI
let
xK2 = 1.0 / (1.0 + 0.2316419*xInput)
xK2_2 = xK2 * xK2
xK2_3 = xK2_2 * xK2
xK2_4 = xK2_3 * xK2
xK2_5 = xK2_4 * xK2
var
xLocal_1 = xK2 * 0.319381530
xLocal_2 = xK2_2 * -0.356563782
xLocal_3 = xK2_3 * 1.781477937
xLocal_2 += xLocal_3
xLocal_3 = xK2_4 * -1.821255978
xLocal_2 += xLocal_3
xLocal_3 = xK2_5 * 1.330274429
xLocal_2 += xLocal_3
xLocal_1 += xLocal_2
result = 1.T - xLocal_1 * xNPrimeofX
if isNegative:
result = 1.T - result
# Black & Scholes
# ---------------------------------------------------
func blackScholesEqEuroNoDiv[T](
spot, strike, riskFreeRate, volatility, expiry: T,
otype: OptionKind,
timet: float32
): T =
var xD1 = riskFreeRate + 0.5.T * volatility * volatility
xD1 *= expiry
xD1 += ln(spot / strike)
var xDen = volatility * sqrt(expiry)
xD1 /= xDen
let xD2 = xD1 - xDen
let nofXd1 = cumulNormalDist(xD1)
let nofXd2 = cumulNormalDist(xD2)
let futureValueX = strike * exp(-riskFreeRate*expiry)
if otype == Call:
result = spot * nofXd1 - futureValueX * nofXd2
else:
let negNofxd1 = 1.T - nofXd1
let negNofxd2 = 1.T - nofXd2
result = futureValueX * negNofxd2 - spot * negNofXd1
func checkErrors[T](ctx: var Context[T], i: int, price: T) =
let priceDelta = ctx.data[i].dgrefval - price
if abs(priceDelta) >= 1e-4:
c_printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i.int, price, ctx.data[i].dgrefval, priceDelta
)
ctx.numErrors += 1
func blackScholesSequential(ctx: var Context) =
for j in 0 ..< ctx.numRuns:
for i in 0 ..< ctx.numOptions:
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx, i, price)
proc blackScholesConstantine(tp: Threadpool, ctx: ptr Context) =
for j in 0 ..< ctx.numRuns:
tp.parallelFor i in 0 ..< ctx.numOptions:
captures: {ctx}
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx[], i, price)
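# Note: the per-run parallel loops are not synchronized individually;
# the single syncAll below awaits all numRuns x numOptions tasks.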
tp.syncAll()
proc dump[T](ctx: Context[T], file: string) =
let stream = openFileStream(file, fmWrite)
defer: stream.close()
stream.write($ctx.numOptions)
stream.write("\n")
for i in 0 ..< ctx.numOptions:
stream.write($ctx.prices[i])
stream.write("\n")
proc parseOptions[T](ctx: var Context[T], optFile: string) =
let stream = openFileStream(optFile, fmRead)
defer: stream.close()
var line: string
discard stream.readLine(line)
# Allocate the buffers
# Not sure why the original bench uses a struct of arrays
let numOptions = line.parseInt()
ctx.initialize(numOptions)
echo "Reading ", numOptions, " options"
# For parsing Nim uses float64 by default
var
spot, strike, riskfree, divrate, vol, expiry: float64
optKind: string
divvals, dgrefval: float64
for i in 0 ..< ctx.numOptions:
discard stream.readLine(line)
let isLineParsed = scanf(line, "$f $f $f $f $f $f $w $f $f",
spot, strike, riskFree,
divrate, vol, expiry,
optKind, divvals, dgrefval
)
doAssert isLineParsed
ctx.data[i].spot = spot.T
ctx.data[i].strike = strike.T
ctx.data[i].riskfree = riskfree.T
ctx.data[i].divrate = divrate.T
ctx.data[i].vol = vol.T
ctx.data[i].expiry = expiry.T
ctx.data[i].divvals = divvals.T
ctx.data[i].dgrefval = dgrefval.T
if optKind == "C":
ctx.data[i].kind = Call
elif optKind == "P":
ctx.data[i].kind = Put
else:
raise newException(ValueError, "Invalid option kind: \"" & optKind & '"')
ctx.otype[i] = ctx.data[i].kind
ctx.spot[i] = ctx.data[i].spot
ctx.strike[i] = ctx.data[i].strike
ctx.riskFreeRate[i] = ctx.data[i].riskfree
ctx.volatility[i] = ctx.data[i].vol
ctx.expiry[i] = ctx.data[i].expiry
proc main() =
var
input = ""
output = ""
numRounds = 2000
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 0
elif paramCount() == 1:
input = paramStr(1)
elif paramCount() == 2:
input = paramStr(1)
output = paramStr(2)
elif paramCount() == 3:
input = paramStr(1)
output = paramStr(2)
numRounds = paramStr(3).parseInt()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 1
var ctx: Context[float32]
ctx.numRuns = numRounds
ctx.parseOptions(input)
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
let start = wtime_msec()
var tp = Threadpool.new(numThreads = nthreads)
tp.blackScholesConstantine(ctx.addr)
tp.shutdown()
when not defined(windows):
let stop = wtime_msec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine"
echo "Benchmark: Black & Scholes Option Pricing (including init+shutdown)"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(ms) ", round(stop - start, 3)
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
echo "# of rounds: ", numRounds
echo "# of options: ", ctx.numOptions
if output != "":
echo "\nDumping prices to \"", output, '"'
dump(ctx, output)
delete(ctx)
quit 0
main()

View File

@ -10,7 +10,7 @@ import
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime
@ -63,22 +63,31 @@ proc main() =
# Staccato benches runtime init and exit as well
when not defined(windows):
let start = wtime_usec()
let startRuntime = wtime_usec()
tp = Threadpool.new()
when not defined(windows):
let startBench = wtime_usec()
answer = test(depth, breadth)
when not defined(windows):
let stopBench = wtime_usec()
tp.shutdown()
when not defined(windows):
let stop = wtime_usec()
let stopRuntime = wtime_usec()
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: dfs"
echo "Threads: ", nthreads
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: dfs"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(us) ", stop - start
echo "Output: ", answer
echo "Time(us) runtime: ", stopRuntime - startRuntime
echo "Time(us) bench: ", stopBench - startBench
echo "Output: ", answer
echo "--------------------------------------------------------------------------"
quit 0

View File

@ -0,0 +1,10 @@
# Histogram 2D
This is a very interesting benchmark for testing map-reduce style computation and nested parallelism.
It needs:
- 2 loops over a 2D matrix
- A reduction
OpenMP is unable to handle intuitively the nesting of
2 parallel loops with a custom reduction operation; see the sketch below.
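Constantine's threadpool expresses the custom reduction directly on the outer loop.
Below is a minimal sketch adapted from the reduction benchmarks in this PR;
`maxReduce` and the flat row-major indexing are illustrative only:
```
import ../../threadpool

proc maxReduce[T: SomeFloat](tp: Threadpool, m: ptr UncheckedArray[T], rows, cols: int): T =
  # Outer rows are distributed across tasks, the inner loop stays sequential,
  # and the per-task maxima are combined pairwise in `merge`.
  mixin globalMax
  tp.parallelFor i in 0 ..< rows:
    captures: {m, cols}
    reduceInto(globalMax: T):
      prologue:
        var localMax = T(-Inf)
      forLoop:
        for j in 0 ..< cols:
          localMax = max(localMax, m[i*cols + j])
      merge(remoteMax: Flowvar[T]):
        localMax = max(localMax, sync(remoteMax))
      epilogue:
        return localMax
  result = sync(globalMax)
```
The full benchmark in this PR additionally keeps a private histogram per task
and folds it into the shared one with relaxed atomic adds.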

View File

@ -0,0 +1,162 @@
// Source: https://stackoverflow.com/questions/16751445/parallelize-nested-for-loop-with-openmp
#include<stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define float_t float
#include <time.h>
#include <omp.h>
// This was the original function to optimize
// where the reader had issues with OpenMP nested parallelism
float_t generate_histogram(float_t **matrix, int *histogram, int mat_size, int hist_size)
{
int i,j,k,count;
float_t max = 0.;
float_t sum;
//set histogram to zero everywhere
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
//#pragma omp parallel for schedule(runtime)
for (i = 1; i < (mat_size-1); i++)
{
//pragma omp prallel for schedule(dynamic)
for(j = 1; j < (mat_size-1); j++)
{
//assign current matrix[i][j] to element in order to reduce memory access
sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
k = (int)(sum * (float)mat_size);
histogram[k] += 1;
//keep track of largest element
if(sum > max)
max = sum;
}//end inner for
}//end outer for
return max;
}
// This is the proposed alternative
float_t generate_histogram_omp(float_t **matrix, int *histogram, int mat_size, int hist_size) {
float_t max = 0.;
//set histogram to zero everywhere
int i;
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
#pragma omp parallel
{
int *histogram_private = (int*)malloc(hist_size * sizeof(int));
int i;
for(i = 0; i < hist_size; i++)
histogram_private[i] = 0;
float_t max_private = 0.;
int n;
int j;
#pragma omp for
for (i = 1; i < (mat_size-1); i++) {
for(j = 1; j < (mat_size-1); j++) {
// for (n=0; n < (mat_size-2)*(mat_size-2); n++) {
// int i = n/(mat_size-2)+1;
// int j = n%(mat_size-2)+1;
float_t sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
int k = (int)(sum * (float)mat_size);
histogram_private[k] += 1;
//keep track of largest element
if(sum > max_private)
max_private = sum;
}
}
#pragma omp critical
{
for(i = 0; i < hist_size; i++)
histogram[i] += histogram_private[i];
if(max_private>max)
max = max_private;
}
free(histogram_private);
}
return max;
}
int compare_hists(int *hist1, int *hist2, int N) {
int i;
int diff = 0;
for(i =0; i < N; i++) {
int tmp = hist1[i] - hist2[i];
diff += tmp;
if(tmp!=0) {
printf("i %d, hist1 %d, hist2 %d\n", i, hist1[i], hist2[i]);
}
}
return diff;
}
int main() {
int i,j,N,boxes;
// Original values
// N = 10000;
// boxes = N / 2;
N = 25000;
boxes = 1000;
float_t **matrix;
int* histogram1;
int* histogram2;
//allocate a matrix with some numbers
matrix = (float_t**)calloc(N, sizeof(float_t **));
for(i = 0; i < N; i++)
matrix[i] = (float_t*)calloc(N, sizeof(float_t *));
for(i = 0; i < N; i++)
for(j = 0; j < N; j++)
matrix[i][j] = 1./(float_t) N * (float_t) i * 100.0;
histogram1 = (int*)malloc(boxes * sizeof(int));
histogram2 = (int*)malloc(boxes * sizeof(int));
for(i = 0; i<boxes; i++) {
histogram1[i] = 0;
histogram2[i] = 0;
}
double dtime;
dtime = omp_get_wtime();
float_t max1 = generate_histogram(matrix, histogram1, N, boxes);
dtime = omp_get_wtime() - dtime;
printf("time serial: %f\n", dtime);
printf("max serial: %f\n", max1);
dtime = omp_get_wtime();
float_t max2 = generate_histogram_omp(matrix, histogram2, N, boxes);
dtime = omp_get_wtime() - dtime;
printf("time openmp: %f\n", dtime);
printf("max openmp: %f\n", max2);
int diff = compare_hists(histogram1, histogram2, boxes);
printf("diff %d\n", diff);
return 0;
}

View File

@ -0,0 +1,303 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stdlib
system/ansi_c, std/[os, strutils, cpuinfo, math, strformat, locks],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# TODO: there is an overflow of MaxRSS on 32-bit Linux and macOS, but not on 64-bit Linux or Windows
# This test is quite important to ensure parallel reductions work within a generic proc.
{.push checks: off.}
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var" to other threads
type
Matrix[T: SomeFloat] = object
buffer: ptr UncheckedArray[T]
ld: int
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < mat.ld
assert col < mat.ld
mat.buffer[row * mat.ld + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
assert row < mat.ld
assert col < mat.ld
mat.buffer[row * mat.ld + col] = value
type
Histogram = object
buffer: ptr UncheckedArray[int64]
len: int
template `[]`(hist: Histogram, idx: Natural): int64 =
# 1D storage, bounds-checked in debug builds
assert idx in 0 ..< hist.len
hist.buffer[idx]
template `[]=`(hist: Histogram, idx: Natural, value: int64) =
assert idx in 0 ..< hist.len
hist.buffer[idx] = value
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
c_free(p)
# -------------------------------------------------------
proc prepareMatrix[T](matrix: var Matrix[T], N: int) =
matrix.buffer = wv_alloc(T, N*N)
matrix.ld = N
for i in 0 ..< N:
for j in 0 ..< N:
matrix[i, j] = 1.0 / T(N) * T(i) * 100
proc newHistogram(bins: int): Histogram =
result.buffer = wv_alloc(int64, bins)
result.len = bins
# Reports
# -------------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
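## Declares the three given idents and snapshots getrusage around `body`:
## `maxRSS` gets the peak RSS, `runtimeRSS` the RSS growth during `body`,
## and `pageFaults` the minor page faults incurred (units as reported by getrusage).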
var maxRSS, runtimeRSS, pageFaults: int32
block:
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc reportConfig(
scheduler: string,
nthreads, N, bins: int) =
echo "--------------------------------------------------------------------------"
echo "Scheduler: ", scheduler
echo "Benchmark: Histogram 2D "
echo "Threads: ", nthreads
echo "Matrix: ", N, " x ", N
echo "Histogram bins: ", bins
proc reportBench(
time: float64, maxRSS, runtimeRss, pageFaults: int, max: SomeFloat
) =
echo "--------------------------------------------------------------------------"
echo "Time(ms): ", round(time, 3)
echo "Max RSS (KB): ", maxRss
echo "Runtime RSS (KB): ", runtimeRSS
echo "# of page faults: ", pageFaults
echo "Max (from histogram): ", max
template runBench(tp: Threadpool, procName: untyped, matrix: Matrix, bins: int, parallel: static bool = true) =
var hist = newHistogram(bins)
when not defined(windows):
block:
var max: matrix.T
let start = wtime_msec()
memUsage(maxRSS, runtimeRSS, pageFaults):
when parallel:
tp = Threadpool.new()
max = procName(tp, matrix, hist)
tp.shutdown()
else:
max = procName(matrix, hist)
let stop = wtime_msec()
reportBench(stop-start, maxRSS, runtimeRSS, pageFaults, max)
else:
block:
var max: matrix.T
when parallel:
tp = Threadpool.new()
max = procName(tp, matrix, hist)
tp.shutdown()
else:
max = procName(matrix, hist)
# Algo
# -------------------------------------------------------
proc generateHistogramSerial[T](matrix: Matrix[T], hist: Histogram): T =
# zero-ing the histogram
for i in 0 ..< hist.len:
hist[i] = 0
# Note don't run on borders, they have no neighbour
for i in 1 ..< matrix.ld-1:
for j in 1 ..< matrix.ld-1:
# Sum of absolute differences with the 4 cell neighbors
let sum = abs(matrix[i, j] - matrix[i-1, j]) + abs(matrix[i,j] - matrix[i+1, j]) +
abs(matrix[i, j] - matrix[i, j-1]) + abs(matrix[i, j] - matrix[i, j+1])
# Compute index of histogram bin
let k = int(sum * T(matrix.ld))
hist[k] += 1
# Keep track of the largest element
if sum > result:
result = sum
proc generateHistogramThreadpoolReduce[T](tp: Threadpool, matrix: Matrix[T], hist: Histogram): T =
# We only await the reduced max: sending the histograms across threads
# is too costly, so the temporary histograms are freed in their allocating threads.
# In a generic proc, Nim tries to resolve symbols earlier than the reduce macro creates them,
# so we need to tell Nim that the symbol will exist in time.
mixin distributedMax
let boxes = hist.len
for i in 0 ..< boxes:
hist[i] = 0
# Parallel reduction
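# The reduceInto body is split in 4 stages:
#   prologue: per-task state (a private histogram and a local max)
#   forLoop:  the loop body, run for each row i assigned to this task
#   merge:    await the other half's max (a FlowVar) and fold the private
#             histogram into the shared one with relaxed atomic adds
#   epilogue: free the private histogram and return this task's max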
tp.parallelFor i in 1 ..< matrix.ld-1:
captures: {hist, matrix, boxes}
reduceInto(distributedMax: T):
prologue:
let threadHist = newHistogram(boxes)
var max = T(-Inf)
forLoop:
# with inner for loop
for j in 1 ..< matrix.ld-1:
let sum = abs(matrix[i, j] - matrix[i-1, j]) + abs(matrix[i,j] - matrix[i+1, j]) +
abs(matrix[i, j] - matrix[i, j-1]) + abs(matrix[i, j] - matrix[i, j+1])
let k = int(sum * T(matrix.ld))
threadHist[k] += 1
if sum > max:
max = sum
merge(remoteMax: FlowVar[T]):
block:
let remoteMax = sync(remoteMax) # Await max from other thread
if remoteMax > max:
max = remoteMax
for k in 0 ..< boxes:
discard hist[k].addr.atomicFetchAdd(threadHist[k], ATOMIC_RELAXED)
epilogue:
wv_free(threadHist.buffer)
return max
return sync(distributedMax)
# proc generateHistogramThreadpoolStaged[T](matrix: Matrix[T], hist: Histogram): T =
# var max = T(-Inf)
# let maxAddr = max.addr
# var lock: Lock
# lock.initLock()
# let lockAddr = lock.addr
# let boxes = hist.len
# for i in 0 ..< boxes:
# hist[i] = 0
# # Parallel reduction
# parallelForStaged i in 1 ..< matrix.ld-1:
# captures: {maxAddr, lockAddr, hist, matrix, boxes}
# awaitable: histoLoop
# prologue:
# let threadHist = newHistogram(boxes)
# var threadMax = T(-Inf)
# forLoop:
# # with inner for loop
# for j in 1 ..< matrix.ld-1:
# let sum = abs(matrix[i, j] - matrix[i-1, j]) + abs(matrix[i,j] - matrix[i+1, j]) +
# abs(matrix[i, j] - matrix[i, j-1]) + abs(matrix[i, j] - matrix[i, j+1])
# let k = int(sum * T(matrix.ld))
# threadHist[k] += 1
# if sum > threadMax:
# threadMax = sum
# epilogue:
# lockAddr[].acquire()
# maxAddr[] = max(maxAddr[], threadMax)
# if threadMax > maxAddr[]:
# maxAddr[] = threadMax
# for k in 0 ..< boxes:
# hist[k] += threadHist[k]
# lockAddr[].release()
# wv_free(threadHist.buffer)
# let waslastThread = sync(histoLoop)
# lock.deinitLock()
# return max
proc main() =
if sizeof(int) == 4:
echo "Running on 32-bit. This benchmark is requires 64-bit."
return
var
matrixSize = 25000
boxes = 1000
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <matrixSize: int> <boxes: int>"
echo &"Running with default matrixSize={matrixSize}, boxes={boxes}"
elif paramCount() == 2:
matrixSize = paramStr(1).parseInt()
boxes = paramStr(2).parseInt()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <matrixSize: int> <boxes: int>"
echo &"Default \"{exeName} {matrixSize} {boxes}\""
quit 1
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
var tp: Threadpool
var matrix: Matrix[float32]
# The reference code zeroes out the histogram inside the bench as well
prepareMatrix(matrix, matrixSize)
reportConfig("Sequential", 1, matrixSize, boxes)
runBench(tp, generateHistogramSerial, matrix, boxes, parallel = false)
reportConfig("Constantine's threadpool - Parallel Reduce", nthreads, matrixSize, boxes)
runBench(tp, generateHistogramThreadpoolReduce, matrix, boxes)
# reportConfig("Constantine's threadpool - Parallel For Staged", nthreads, matrixSize, boxes)
# runBench(generateHistogramThreadpoolStaged, matrix, boxes)
wv_free(matrix.buffer)
main()

View File

@ -0,0 +1,28 @@
# Log-Sum-Exp
Log-Sum-Exp computes `ln ∑i exp(xi)` by using the trick:
```
log ∑i exp(xi) = α + log ∑i exp(xi - α)
with α = max(xi) for xi in x
```
Log-Sum-Exp is a key algorithm behind the Softmax Cross-Entropy loss function,
which is used in almost all deep learning classification problems.
Furthermore, this is a huge bottleneck in NLP, and research in fast softmax
approximation is active.
We are interested in testing parallel reductions,
so we first do a parallel max and then a parallel exponential sum.
However, note that there exists a streaming version
at http://www.nowozin.net/sebastian/blog/streaming-log-sum-exp-computation.html
which is similar to Welford's algorithm for streaming mean and variance in statistics.
Given that the exponential is a very heavy operation, all frameworks should see a linear speedup.
The main challenge is the framework's syntax for complex reduction operations.
Note that `<math.h>` exponential can be significantly improved (10x)
by using vectorization techniques from Laser
https://github.com/numforge/laser/blob/d1e6ae6106564bfb350d4e566261df97dbb578b3/benchmarks/vector_math/bench_exp_avx2.nim#L373-L379
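For reference, a minimal serial Nim version of the trick, using only the stdlib
(independent of the benchmark code):
```
import std/math

proc logsumexp(x: openArray[float64]): float64 =
  # Stable ln ∑ exp(xi): shift by α = max(x) before exponentiating
  var alpha = x[0]
  for v in x:
    alpha = max(alpha, v)
  var s = 0.0
  for v in x:
    s += exp(v - alpha)
  alpha + ln(s)

echo logsumexp([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
# ≈ 9.4585514, the same sanity value the benchmark checks against numpy
```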

View File

@ -0,0 +1,402 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, cpuinfo, math, random, locks],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# TODO: there is an overflow of MaxRSS on 32-bit Linux and macOS, but not on 64-bit Linux or Windows
# This test is quite important to ensure parallel reductions work within a generic proc.
{.push checks: off.}
# Helpers
# -------------------------------------------------------
# Note that matrices for log-sum-exp are usually in the following shapes:
# - Classification of a batch of 256 images in 3 categories: 256x3
# - Classification of a batch of words from a 50000 words dictionary: 256x50000
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
c_free(p)
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var" to other threads
type
Matrix[T: SomeFloat] = object
buffer: ptr UncheckedArray[T]
nrows, ncols: int # int64 on x86-64
func newMatrix[T](rows, cols: Natural): Matrix[T] {.inline.} =
# Create a rows x cols Matrix
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t rows*cols*sizeof(T)))
result.nrows = rows
result.ncols = cols
template `[]`[T](M: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < M.nrows
assert col < M.ncols
M.buffer[row * M.ncols + col]
template `[]=`[T](M: Matrix[T], row, col: Natural, value: T) =
assert row < M.nrows
assert col < M.ncols
M.buffer[row * M.ncols + col] = value
proc initialize[T](M: Matrix[T]) =
randomize(1234) # Seed
for i in 0 ..< M.nrows:
for j in 0 ..< M.ncols:
M[i, j] = T(rand(1.0))
func rowView*[T](M: Matrix[T], rowPos, size: Natural): Matrix[T]{.inline.}=
## Returns a new view offset by the row and column stride
result.buffer = cast[ptr UncheckedArray[T]](
addr M.buffer[rowPos * M.ncols]
)
result.nrows = size
result.ncols = M.ncols
# Reports
# -------------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
var maxRSS, runtimeRSS, pageFaults: int32
block:
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc reportConfig(
scheduler: string,
nthreads: int, datasetSize, batchSize, imageLabels, textVocabulary: int64
) =
echo "--------------------------------------------------------------------------"
echo "Scheduler: ", scheduler
echo "Benchmark: Log-Sum-Exp (Machine Learning) "
echo "Threads: ", nthreads
echo "datasetSize: ", datasetSize
echo "batchSize: ", batchSize
echo "# of full batches: ", datasetSize div batchSize
echo "# of image labels: ", imageLabels
echo "Text vocabulary size: ", textVocabulary
proc reportBench(
batchSize, numLabels: int64,
time: float64, maxRSS, runtimeRss, pageFaults: int32,
logSumExp: float32
) =
echo "--------------------------------------------------------------------------"
echo "Dataset: ", batchSize,'x',numLabels
echo "Time(ms): ", round(time, 3)
echo "Max RSS (KB): ", maxRss
echo "Runtime RSS (KB): ", runtimeRSS
echo "# of page faults: ", pageFaults
echo "Logsumexp: ", logsumexp
template runBench(procName: untyped, datasetSize, batchSize, numLabels: int64) =
let data = newMatrix[float32](datasetSize, numLabels)
data.initialize()
when not defined(windows):
let start = wtime_msec()
var lse = 0'f32
memUsage(maxRSS, runtimeRSS, pageFaults):
# For simplicity we ignore the last few data points
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(X)
let stop = wtime_msec()
reportBench(batchSize, numlabels, stop-start, maxRSS, runtimeRSS, pageFaults, lse)
else:
# For simplicity we ignore the last few data points
var lse = 0'f32
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(X)
template runBench(tp: Threadpool, procName: untyped, datasetSize, batchSize, numLabels: int64) =
let data = newMatrix[float32](datasetSize, numLabels)
data.initialize()
when not defined(windows):
let start = wtime_msec()
var lse = 0'f32
memUsage(maxRSS, runtimeRSS, pageFaults):
# For simplicity we ignore the last few data points
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(tp, X)
let stop = wtime_msec()
reportBench(batchSize, numlabels, stop-start, maxRSS, runtimeRSS, pageFaults, lse)
else:
var lse = 0'f32
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(tp, X)
# Algo - Serial
# -------------------------------------------------------
proc maxSerial[T: SomeFloat](M: Matrix[T]) : T =
result = T(-Inf)
for i in 0 ..< M.nrows:
for j in 0 ..< M.ncols:
result = max(result, M[i, j])
proc logsumexpSerial[T: SomeFloat](M: Matrix[T]): T =
let alpha = M.maxSerial()
result = 0
for i in 0 ..< M.nrows:
for j in 0 ..< M.ncols:
result += exp(M[i, j] - alpha)
result = alpha + ln(result)
# Algo - parallel reduction
# -------------------------------------------------------
proc maxThreadpoolReduce[T: SomeFloat](tp: Threadpool, M: Matrix[T]) : T =
mixin globalMax
tp.parallelFor i in 0 ..< M.nrows:
captures:{M}
reduceInto(globalMax: T):
prologue:
var localMax = T(-Inf)
forLoop:
for j in 0 ..< M.ncols:
localMax = max(localMax, M[i, j])
merge(remoteMax: Flowvar[T]):
localMax = max(localMax, sync(remoteMax))
epilogue:
return localMax
result = sync(globalMax)
proc logsumexpThreadpoolReduce[T: SomeFloat](tp: Threadpool, M: Matrix[T]): T =
mixin lse
let alpha = tp.maxThreadpoolReduce(M)
tp.parallelFor i in 0 ..< M.nrows:
captures:{alpha, M}
reduceInto(lse: T):
prologue:
var localLSE = 0.T
forLoop:
for j in 0 ..< M.ncols:
localLSE += exp(M[i, j] - alpha)
merge(remoteLSE: Flowvar[T]):
localLSE += sync(remoteLSE)
epilogue:
return localLSE
result = alpha + ln(sync(lse))
# Algo - parallel reduction collapsed
# -------------------------------------------------------
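# The "collapsed" variants flatten the 2D iteration space into a single
# 0 ..< nrows*ncols range, trading the sequential inner loop for more,
# finer-grained parallel work items.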
proc maxThreadpoolCollapsed[T: SomeFloat](tp: Threadpool, M: Matrix[T]) : T =
mixin globalMax
tp.parallelFor ij in 0 ..< M.nrows * M.ncols:
captures:{M}
reduceInto(globalMax: T):
prologue:
var localMax = T(-Inf)
forLoop:
localMax = max(localMax, M.buffer[ij])
merge(remoteMax: FlowVar[T]):
localMax = max(localMax, sync(remoteMax))
epilogue:
return localMax
result = sync(globalMax)
proc logsumexpThreadpoolCollapsed[T: SomeFloat](tp: Threadpool, M: Matrix[T]): T =
mixin lse
let alpha = tp.maxThreadpoolCollapsed(M)
tp.parallelFor ij in 0 ..< M.nrows * M.ncols:
captures:{alpha, M}
reduceInto(lse: T):
prologue:
var localLSE = 0.T
forLoop:
localLSE += exp(M.buffer[ij] - alpha)
merge(remoteLSE: Flowvar[T]):
localLSE += sync(remoteLSE)
epilogue:
return localLSE
result = alpha + ln(sync(lse))
# proc maxThreadpoolStaged[T: SomeFloat](tp: Threadpool, M: Matrix[T]) : T =
# mixin maxLoop
#
# var max = T(-Inf)
# let maxAddr = max.addr
#
# var lock: Lock
# lock.initLock()
# let lockAddr = lock.addr
#
# tp.parallelForStaged i in 0 ..< M.nrows:
# captures:{maxAddr, lockAddr, M}
# awaitable: maxLoop
# prologue:
# var localMax = T(-Inf)
# forLoop:
# for j in 0 ..< M.ncols:
# localMax = max(localMax, M[i, j])
# epilogue:
# lockAddr[].acquire()
# maxAddr[] = max(maxAddr[], localMax)
# lockAddr[].release()
#
# let waslastThread = sync(maxLoop)
# lock.deinitLock()
#
# proc logsumexpThreadpoolStaged[T: SomeFloat](tp: Threadpool, M: Matrix[T]): T =
# mixin logSumExpLoop
# let alpha = M.maxThreadpoolStaged()
#
# var lse = T(0)
# let lseAddr = lse.addr
#
# # Atomic increment for float is done with a Compare-And-Swap loop usually.
# # Due to lazy splitting, load distribution is unbalanced between threads so they shouldn't
# # finish at the same time in general and lock contention would be low
# var lock: Lock
# lock.initLock()
# let lockAddr = lock.addr
#
# tp.parallelForStaged i in 0 ..< M.nrows:
# captures:{lseAddr, lockAddr, alpha, M}
# awaitable: logSumExpLoop
# prologue:
# var localLSE = 0.T
# loop:
# for j in 0 ..< M.ncols:
# localLSE += exp(M[i, j] - alpha)
# epilogue:
# lockAddr[].acquire()
# lseAddr[] += localLSE
# lockAddr[].release()
#
# let wasLastThread = sync(logSumExpLoop)
# result = alpha + ln(lse)
# lock.deinitLock()
# Main
# -------------------------------------------------------
proc main() =
echo "Note that a text vocabulary is often in the 50000-15000 words\n"
var
datasetSize = 20000'i64
batchSize = 256'i64
imagelabels = 1000'i64
textVocabulary = 10000'i64
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <datasetSize: int64> <batchSize: int64> <imagelabels: int64> <textVocabulary: int64>"
echo &"Running with default datasetSize={datasetSize}, batchSize={batchSize}, imagelabels={imagelabels}, textVocabulary={textVocabulary}"
elif paramCount() == 4:
datasetSize = paramStr(1).parseBiggestInt().int64
batchSize = paramStr(2).parseBiggestInt().int64
imagelabels = paramStr(3).parseBiggestInt().int64
textVocabulary = paramStr(4).parseBiggestInt().int64
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <datasetSize: int64> <batchSize: int64> <imagelabels: int64> <textVocabulary: int64>"
echo &"Default \"{datasetSize} {batchSize} {imagelabels} {textVocabulary}\""
quit 1
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
let sanityM = newMatrix[float32](1, 9)
for i in 0'i32 ..< 9:
sanityM[0, i] = i.float32 + 1
echo "Sanity check, logSumExp(1..<10) should be 9.4585514 (numpy logsumexp): ", logsumexpSerial(sanityM)
echo '\n'
wv_free(sanityM.buffer)
reportConfig("Sequential", 1, datasetSize, batchSize, imageLabels, textVocabulary)
block:
runBench(logsumexpSerial, datasetSize, batchSize, imageLabels)
block:
runBench(logsumexpSerial, datasetSize, batchSize, textVocabulary)
# TODO: Placing the threadpool before the sequential bench makes it take ~85 ms instead of ~48 ms
var tp = Threadpool.new(nthreads)
# TODO: The parallel algorithm is slower than Weave AND slower than serial
reportConfig("Constantine's Threadpool Reduce", nthreads, datasetSize, batchSize, imageLabels, textVocabulary)
block:
tp.runBench(logsumexpThreadpoolReduce, datasetSize, batchSize, imageLabels)
block:
tp.runBench(logsumexpThreadpoolReduce, datasetSize, batchSize, textVocabulary)
reportConfig("Constantine's Threadpool (Collapsed)", nthreads, datasetSize, batchSize, imageLabels, textVocabulary)
block:
tp.runBench(logsumexpThreadpoolCollapsed, datasetSize, batchSize, imageLabels)
block:
tp.runBench(logsumexpThreadpoolCollapsed, datasetSize, batchSize, textVocabulary)
# reportConfig("Constantine's Threadpool (Staged)", nthreads, datasetSize, batchSize, imageLabels, textVocabulary)
# block:
# tp.runBench(logsumexpThreadpoolStaged, datasetSize, batchSize, imageLabels)
# block:
# tp.runBench(logsumexpThreadpoolStaged, datasetSize, batchSize, textVocabulary)
tp.shutdown()
main()

View File

@ -0,0 +1,14 @@
# Matrix transposition
This benchmark, extracted from [Laser](https://github.com/numforge/laser),
stresses the ability of a runtime to support nested loops.
A matrix is copied to another buffer with transposition.
For one buffer or the other the accesses will not be linear,
so it is important to do tiling. As dimensions might be skewed,
the ideal tiling is 2D, with nested parallel-for loops that properly find
work for all cores even if a matrix is tall and skinny or short and fat
(see the sketch below).
Note that as OpenMP's nested loop support is very problematic,
we use the `collapse` clause for OpenMP, which is only usable
in a restricted number of scenarios.
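With Constantine's threadpool, the ideal 2D tiling maps directly onto nested
strided parallel-for loops. The excerpt below is lifted from the threadpool
benchmark in this PR (`tp`, `M`, `N`, `bufIn` and `bufOut` are set up as in the full file):
```
const blck = 64                      # tile size; consts need no capture
tp.parallelFor j in 0 ..< N:         # parallel over column tiles
  stride: blck
  captures: {tp, M, N, bufIn, bufOut}
  tp.parallelFor i in 0 ..< M:       # nested: parallel over row tiles
    stride: blck
    captures: {j, M, N, bufIn, bufOut}
    for jj in j ..< min(j+blck, N):  # sequential copy of one tile
      for ii in i ..< min(i+blck, M):
        bufOut[jj*M+ii] = bufIn[ii*N+jj]
```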

View File

@ -0,0 +1,294 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Original transposition codes from Laser project
# (c) Mamy André Ratsimbazafy, Apache License version 2
import
# Stdlib
strformat, os, strutils, math, system/ansi_c,
cpuinfo, streams, strscans,
# Third-party
cligen,
# bench
../wtime, ../resources
# OpenMP
# ---------------------------------------------------
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}
{.pragma: omp, header:"omp.h".}
proc omp_get_num_threads*(): cint {.omp.}
# Memory
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
# Transpose implementations
# ---------------------------------------------------
type TransposeStrategy = enum
Sequential
Naive
Collapsed
TiledCollapsed
# Question: do we need __restrict to prevent the compiler from generating
# defensive, aliasing-robust code?
proc sequentialTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
for j in 0 ..< N:
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc ompNaiveTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# Writes are more expensive than reads, so we keep the i accesses linear for the writes
{.push stacktrace:off.}
for j in 0||(N-1):
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
{.pop.}
proc ompCollapsedTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# We need to go down to C level for the collapse clause.
# This relies on the M, N, bufIn, bufOut symbols being the same in C and Nim,
# as the proper interpolation syntax is a bit busy otherwise.
{.emit: """
#pragma omp parallel for collapse(2)
for (int i = 0; i < `M`; ++i)
for (int j = 0; j < `N`; ++j)
`bufOut`[j*M+i] = `bufIn`[i*N+j];
""".}
proc omp2DTiledCollapsedTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose with 2D tiling and collapsed
const blck = 64
{.emit: """
#define min(a,b) (((a)<(b))?(a):(b))
#pragma omp parallel for collapse(2)
for (int j = 0; j < `N`; j+=`blck`)
for (int i = 0; i < `M`; i+=`blck`)
for (int jj = j; jj<j+`blck` && jj<`N`; jj++)
for (int ii = i; ii<min(i+`blck`,`M`); ii++)
`bufOut`[ii+jj*`M`] = `bufIn`[jj+ii*`N`];
""".}
# Meta
# ---------------------------------------------------
func computeMeta(height, width: int): tuple[reqOps, reqBytes, bufSize: int] =
result.reqOps = height * width
result.reqBytes = sizeof(float32) * height * width
result.bufSize = height * width
func initialize(buffer: ptr UncheckedArray[float32], len: int) =
for i in 0 ..< len:
buffer[i] = i.float32
# Bench
# ---------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
var maxRSS, runtimeRSS, pageFaults: int32
block:
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc report(
M, N: int, nthreads: int32, nrounds: int, reordered: bool,
transposeStrategy: TransposeStrategy, reqOps, reqBytes: int,
mxnTime: float64, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults: int32,
nxmTime: float64, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults: int32,
) =
let arithIntensity = reqOps.float / reqBytes.float
let mxnPerf = reqOps.float/(mxnTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
let nxmPerf = reqOps.float/(nxmTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
echo "--------------------------------------------------------------------------"
echo "Scheduler: OpenMP"
echo "Benchmark: Transpose - ", $transposeStrategy
echo "Threads: ", nthreads
echo "# of rounds: ", nrounds
echo "# of operations: ", reqOps
echo "# of bytes: ", reqBytes
echo "Arithmetic Intensity: ", round(arithIntensity, 3)
echo "--------------------------------------------------------------------------"
if not reordered:
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
else:
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
template runBench(transposeName: typed, reorderCompute: bool): untyped =
if not reorderCompute:
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(M, N, bufIn, bufOut)
let stop = wtime_msec()
mxnTime = stop - start
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(N, M, bufIn, bufOut)
let stop = wtime_msec()
nxmTime = stop - start
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
else:
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(N, M, bufIn, bufOut)
let stop = wtime_msec()
nxmTime = stop - start
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(M, N, bufIn, bufOut)
let stop = wtime_msec()
mxnTime = stop - start
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
# Interface
# ---------------------------------------------------
proc main() =
var
M = 400
N = 4000
nrounds = 1000
transposeStrat = TiledCollapsed
reorderCompute = false
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Running with default M={M}, N={N}, rounds={nrounds}, transposeStrategy={transposeStrat}, reorderCompute={reorderCompute}"
elif paramCount() == 5:
M = paramStr(1).parseInt()
N = paramStr(2).parseInt()
nrounds = paramStr(3).parseInt()
transposeStrat = paramStr(4).parseEnum[:TransposeStrategy]()
reorderCompute = paramStr(5).parseBool()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Default \"{exeName} {M} {N} {nrounds} {transposeStrat} {reorderCompute}\""
quit 1
echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"
let nthreads = if transposeStrat == Sequential: 1'i32
else: omp_get_num_threads()
let (reqOps, reqBytes, bufSize) = computeMeta(M, N)
let bufOut = wv_alloc(float32, bufSize)
let bufIn = wv_alloc(float32, bufSize)
bufIn.initialize(bufSize)
var mxnTime, nxmTime: float64
case transposeStrat
of Sequential: runBench(sequentialTranspose, reorderCompute)
of Naive: runBench(ompNaiveTranspose, reorderCompute)
of Collapsed: runBench(ompCollapsedTranspose, reorderCompute)
of TiledCollapsed: runBench(omp2DTiledCollapsedTranspose, reorderCompute)
wv_free(bufOut)
wv_free(bufIn)
dispatch(main)

View File

@ -0,0 +1,319 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Original transposition codes from Laser project
# (c) Mamy André Ratsimbazafy, Apache License version 2
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Memory
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
# Transpose implementations
# ---------------------------------------------------
type TransposeStrategy = enum
Sequential
Naive
Nested
TiledNested
# Question: do we need __restrict to prevent the compiler from generating
# defensive, aliasing-robust code?
proc sequentialTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
for j in 0 ..< N:
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc cttNaiveTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# Writes are more expensive than reads, so we keep the i accesses linear for the writes
tp.parallelFor j in 0 ..< N:
captures: {M, N, bufIn, bufOut}
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc cttNestedTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix with nested for loops
tp.parallelFor j in 0 ..< N:
captures: {tp, M, N, bufIn, bufOut}
tp.parallelFor i in 0 ..< M:
captures: {j, M, N, bufIn, bufOut}
bufOut[j*M+i] = bufIn[i*N+j]
proc ctt2DTiledNestedTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose with 2D tiling and nested
const blck = 64 # const do not need to be captured
tp.parallelFor j in 0 ..< N:
stride: blck
captures: {tp, M, N, bufIn, bufOut}
tp.parallelFor i in 0 ..< M:
stride: blck
captures: {j, M, N, bufIn, bufOut}
for jj in j ..< min(j+blck, N):
for ii in i ..< min(i+blck, M):
bufOut[jj*M+ii] = bufIn[ii*N+jj]
# Meta
# ---------------------------------------------------
func computeMeta(height, width: int): tuple[reqOps, reqBytes, bufSize: int] =
result.reqOps = height * width
result.reqBytes = sizeof(float32) * height * width
result.bufSize = height * width
func initialize(buffer: ptr UncheckedArray[float32], len: int) =
for i in 0 ..< len:
buffer[i] = i.float32
# Bench
# ---------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
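# Injects the three declared counters around `body`:
#   maxRSS      - peak RSS after `body` (KB)
#   runtimeRSS  - increase in peak RSS caused by `body` (KB)
#   pageFaults  - minor page faults incurred by `body`
# All three stay at zero on Windows, where getrusage is unavailable.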
var maxRSS, runtimeRSS, pageFaults: int32
block:
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
when not defined(windows):
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc report(
M, N: int, nthreads: int32, nrounds: int, reordered: bool,
transposeStrategy: TransposeStrategy, reqOps, reqBytes: int,
mxnTime: float64, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults: int32,
nxmTime: float64, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults: int32,
) =
let arithIntensity = reqOps.float / reqBytes.float
let mxnPerf = reqOps.float/(mxnTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
let nxmPerf = reqOps.float/(nxmTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's threadpool"
echo "Benchmark: Transpose - ", $transposeStrategy
echo "Threads: ", nthreads
echo "# of rounds: ", nrounds
echo "# of operations: ", reqOps
echo "# of bytes: ", reqBytes
echo "Arithmetic Intensity: ", round(arithIntensity, 3)
echo "--------------------------------------------------------------------------"
if not reordered:
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
when not defined(windows):
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
when not defined(windows):
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
else:
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
when not defined(windows):
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
when not defined(windows):
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
template runBench(tp: Threadpool, transposeName: typed, reorderCompute, isSequential: bool): untyped =
if not reorderCompute:
if not isSequential:
tp = Threadpool.new()
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(M, N, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
mxnTime = stop - start
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(N, M, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
nxmTime = stop - start
if not isSequential:
tp.shutdown()
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
else:
if not isSequential:
tp = Threadpool.new()
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(N, M, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
nxmTime = stop - start
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(M, N, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
mxnTime = stop - start
if not isSequential:
tp.shutdown()
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
# Interface
# ---------------------------------------------------
proc main() =
var
M = 400
N = 4000
nrounds = 1000
transposeStrat = TiledNested
reorderCompute = false
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Running with default M={M}, N={N}, rounds={nrounds}, transposeStrategy={transposeStrat}, reorderCompute={reorderCompute}"
elif paramCount() == 5:
M = paramStr(1).parseInt()
N = paramStr(2).parseInt()
nrounds = paramStr(3).parseInt()
transposeStrat = paramStr(4).parseEnum[:TransposeStrategy]()
reorderCompute = paramStr(5).parseBool()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Default \"{exeName} {M} {N} {nrounds} {transposeStrat} {reorderCompute}\""
quit 1
echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"
let isSequential = transposeStrat == Sequential
var nthreads: int32
if transposeStrat == Sequential:
nthreads = 1
elif existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt().int32
else:
nthreads = countProcessors().int32
let (reqOps, reqBytes, bufSize) = computeMeta(M, N)
let bufOut = wv_alloc(float32, bufSize)
let bufIn = wv_alloc(float32, bufSize)
bufIn.initialize(bufSize)
var mxnTime, nxmTime: float64
var tp: Threadpool
case transposeStrat
of Sequential: tp.runBench(sequentialTranspose, reorderCompute, isSequential)
of Naive: tp.runBench(cttNaiveTranspose, reorderCompute, isSequential)
of Nested: tp.runBench(cttNestedTranspose, reorderCompute, isSequential)
of TiledNested: tp.runBench(ctt2DTiledNestedTranspose, reorderCompute, isSequential)
wv_free(bufOut)
wv_free(bufIn)
main()

View File

@ -218,7 +218,7 @@ proc main() =
echo "Benchmark: N-queens"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(us) ", stop - start
echo "Time(ms) ", stop - start
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt

View File

@ -45,7 +45,7 @@ func initialize*(en: var EventNotifier) {.inline.} =
func `=destroy`*(en: var EventNotifier) {.inline.} =
en.futex.teardown()
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
func `=copy`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
func prepareToPark*(en: var EventNotifier) {.inline.} =
@ -176,7 +176,7 @@ func initialize*(ec: var EventCount) {.inline.} =
func `=destroy`*(ec: var EventCount) {.inline.} =
ec.futex.teardown()
proc sleepy*(ec: var Eventcount): ParkingTicket {.inline.} =
proc sleepy*(ec: var Eventcount): ParkingTicket {.noInit, inline.} =
## To be called before checking if the condition to not sleep is met.
## Returns a ticket to be used when committing to sleep
let prevState = ec.state.fetchAdd(kPreWait, moAcquireRelease)
@ -209,11 +209,11 @@ proc wakeAll*(ec: var EventCount) {.inline.} =
if (prev and kAnyWaiterMask) != 0:
ec.futex.wakeAll()
proc getNumWaiters*(ec: var EventCount): tuple[preSleep, committedSleep: uint32] {.inline.} =
proc getNumWaiters*(ec: var EventCount): tuple[preSleep, committedSleep: int32] {.noInit, inline.} =
## Get the number of idle threads:
## (planningToSleep, committedToSleep)
let waiters = ec.state.load(moAcquire)
result.preSleep = uint32((waiters and kPreWaitMask) shr kPreWaitShift)
result.committedSleep = uint32(waiters and kWaitMask)
result.preSleep = cast[int32]((waiters and kPreWaitMask) shr kPreWaitShift)
result.committedSleep = cast[int32](waiters and kWaitMask)
{.pop.} # {.push raises:[], checks:off.}

View File

@ -76,7 +76,7 @@ proc peek*(tq: var Taskqueue): int =
##
## This is a non-locking operation.
let # Handle race conditions
b = tq.back.load(moAcquire)
b = tq.back.load(moRelaxed) # Only the producer peeks in the threadpool so moRelaxed is enough
f = tq.front.load(moAcquire)
if b >= f:
@ -199,7 +199,7 @@ proc pop*(tq: var Taskqueue): ptr Task =
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
proc steal*(thiefID: uint32, tq: var Taskqueue): ptr Task =
proc steal*(thiefID: int32, tq: var Taskqueue): ptr Task =
## Dequeue an item at the front. Takes ownership of the item
## This is intended for consumers.
var f = tq.front.load(moAcquire)
@ -244,7 +244,7 @@ proc stealHalfImpl(dst: var Buf, dstBack: int, src: var Taskqueue): int =
if compareExchange(src.front, f, f+n, moSequentiallyConsistent, moRelaxed):
return n
proc stealHalf*(thiefID: uint32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
proc stealHalf*(thiefID: int32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
## Dequeue up to half of the items in the `src` tq, from the front.
## Return the last of those, or nil if none found

View File

@ -9,7 +9,7 @@
import
std/atomics,
../instrumentation,
../../allocs,
../../allocs, ../../primitives,
./backoff
# Tasks have an efficient design so that a single heap allocation
@ -29,9 +29,9 @@ type
Task* = object
# Intrusive metadata
# ------------------
parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task
parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task
thiefID*: Atomic[uint32] # ID of the worker that stole and run the task. For leapfrogging.
thiefID*: Atomic[int32] # ID of the worker that stole and run the task. For leapfrogging.
# Result sync
# ------------------
@ -39,21 +39,50 @@ type
completed*: Atomic[bool]
waiter*: Atomic[ptr EventNotifier]
# Data parallelism
# ------------------
isFirstIter*: bool # Awaitable for-loops return true for first iter. Loops are split before first iter.
loopStart*: int
loopStop*: int
loopStride*: int
loopStepsLeft*: int
reductionDAG*: ptr ReductionDagNode # For parallel loop reduction, merge with other range result
# Dataflow parallelism
# --------------------
dependsOnEvent: bool # We cannot leapfrog a task triggered by an event
# Execution
# ------------------
fn*: proc (param: pointer) {.nimcall, gcsafe, raises: [].}
# destroy*: proc (param: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
data*{.align:sizeof(int).}: UncheckedArray[byte]
fn*: proc (env: pointer) {.nimcall, gcsafe, raises: [].}
# destroy*: proc (env: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
envSize*: int32
env*{.align:sizeof(int).}: UncheckedArray[byte]
Flowvar*[T] = object
## A Flowvar is a placeholder for a future result that may be computed in parallel
task: ptr Task
const SentinelThief* = 0xFACADE'u32
ReductionDagNode* = object
## In a parallel reduction, when a loop is split, the worker
## keeps track of the tasks to gather results from in a private, task-local linked list.
## Together, those form a global computation directed acyclic graph
## with the initial parallel reduction task as root.
# Note: while this requires an extra allocation per split,
# the alternative, an intrusive linked list of reduction tasks,
# would require synchronization between threads.
task*: ptr Task
next*: ptr ReductionDagNode
proc new*(
# Tasks
# -------------------------------------------------------------------------
const SentinelThief* = 0xFACADE'i32
proc newSpawn*(
T: typedesc[Task],
parent: ptr Task,
fn: proc (param: pointer) {.nimcall, gcsafe.}): ptr Task {.inline.} =
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].}): ptr Task =
const size = sizeof(T)
@ -64,15 +93,25 @@ proc new*(
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
result.envSize = 0
proc new*(
result.isFirstIter = false
result.loopStart = 0
result.loopStop = 0
result.loopStride = 0
result.loopStepsLeft = 0
result.reductionDAG = nil
result.dependsOnEvent = false
proc newSpawn*(
T: typedesc[Task],
parent: ptr Task,
fn: proc (param: pointer) {.nimcall, gcsafe, raises: [].},
params: auto): ptr Task {.inline.} =
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].},
env: auto): ptr Task =
const size = sizeof(T) + # size without Unchecked
sizeof(params)
sizeof(env)
result = allocHeapUnchecked(T, size)
result.parent = parent
@ -81,7 +120,78 @@ proc new*(
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
cast[ptr[type params]](result.data)[] = params
result.envSize = int32 sizeof(env)
cast[ptr[type env]](result.env)[] = env
result.isFirstIter = false
result.loopStart = 0
result.loopStop = 0
result.loopStride = 0
result.loopStepsLeft = 0
result.reductionDAG = nil
result.dependsOnEvent = false
proc newLoop*(
T: typedesc[Task],
parent: ptr Task,
start, stop, stride: int,
isFirstIter: bool,
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].}): ptr Task =
const size = sizeof(T)
preCondition: start < stop
result = allocHeapUnchecked(T, size)
result.parent = parent
result.thiefID.store(SentinelThief, moRelaxed)
result.hasFuture = false
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
result.envSize = 0
result.isFirstIter = isFirstIter
result.loopStart = start
result.loopStop = stop
result.loopStride = stride
result.loopStepsLeft = ceilDiv_vartime(stop-start, stride)
result.reductionDAG = nil
result.dependsOnEvent = false
proc newLoop*(
T: typedesc[Task],
parent: ptr Task,
start, stop, stride: int,
isFirstIter: bool,
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].},
env: auto): ptr Task =
const size = sizeof(T) + # size without Unchecked
sizeof(env)
preCondition: start < stop
result = allocHeapUnchecked(T, size)
result.parent = parent
result.thiefID.store(SentinelThief, moRelaxed)
result.hasFuture = false
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
result.envSize = int32(sizeof(env))
cast[ptr[type env]](result.env)[] = env
result.isFirstIter = isFirstIter
result.loopStart = start
result.loopStop = stop
result.loopStride = stride
result.loopStepsLeft = ceilDiv_vartime(stop-start, stride)
result.reductionDAG = nil
result.dependsOnEvent = false
# Flowvars
# -------------------------------------------------------------------------
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
@ -91,9 +201,9 @@ proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} =
# Task with future references themselves so that readyWith can be called
# within the constructed
# proc async_fn(param: pointer) {.nimcall.}
# that can only access data
cast[ptr ptr Task](task.data.addr)[] = task
# proc threadpoolSpawn_fn(env: pointer) {.nimcall.}
# that can only access env
cast[ptr ptr Task](task.env.addr)[] = task
proc cleanup*(fv: var Flowvar) {.inline.} =
fv.task.freeHeap()
@ -117,13 +227,24 @@ func readyWith*[T](task: ptr Task, childResult: T) {.inline.} =
## Send the Flowvar result from the child thread processing the task
## to its parent thread.
precondition: not task.completed.load(moAcquire)
cast[ptr (ptr Task, T)](task.data.addr)[1] = childResult
cast[ptr (ptr Task, T)](task.env.addr)[1] = childResult
task.completed.store(true, moRelease)
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
proc sync*[T](fv: sink Flowvar[T]): T {.noInit, inline, gcsafe.} =
## Blocks the current thread until the flowvar is available
## and returned.
## The thread is not idle and will complete pending tasks.
mixin completeFuture
if fv.task.isNil:
zeroMem(result.addr, sizeof(T))
return
completeFuture(fv, result)
cleanup(fv)
# ReductionDagNodes
# -------------------------------------------------------------------------
proc newReductionDagNode*(task: ptr Task, next: ptr ReductionDagNode): ptr ReductionDagNode {.inline.} =
result = allocHeap(ReductionDagNode)
result.next = next
result.task = task

View File

@ -0,0 +1,142 @@
# Raytracing
This is a port to Nim and Weave of
SmallPT by Kevin Beason https://www.kevinbeason.com/smallpt/
The original C++ version is also provided (with 4 extra lines to highlight original code quirks when porting).
## Showcase
The Nim version can use a single parallel-for loop
or nested parallel-for loops for better load balancing (important when the inner loop is actually larger than the outer loop).
At the moment, there is no random number generator that can deal with
the dynamic thread migration of Weave so we get interesting artifacts
compared to the single-threaded or the single parallel-for versions.
**Single-thread or single parallel-for**
![ray_trace_300samples_nim_threaded](ray_trace_300samples_nim_threaded.png)
**Nested parallel-for**
![ray_trace_300samples_nim_nested](ray_trace_300samples_nim_nested.png)
## Benchmark
Note: except for the nested parallelism which has RNG issue,
the Nim and C++ versions are pixel equivalent.
### Setup
CPU: i9-9980XE, 18 cores, overclocked at 4.1GHz all-core turbo (from 3.0 nominal)
The code was compiled with default flags, hence baseline x86-64, hence SSE2.
- Nim devel (1.3.5 2020-05-16) + GCC v10.1.0
- `nim c --threads:off -d:danger`
- `nim c --threads:on -d:danger`
- GCC v10.1.0
- `-O3`
- `-O3 -fopenmp`
- GCC v8.4.0
- `-O3`
- `-O3 -fopenmp`
- Clang v10.0.0
- `-O3`
- `-O3 -fopenmp`
### Commands
```bash
git clone https://github.com/mratsim/weave
cd weave
nimble install -y # install Weave dependencies, here synthesis, overwriting if asked.
nim -v # Ensure you have nim 1.2.0 or more recent
# Threads on (by default in this repo)
nim c -d:danger -o:build/ray_threaded demos/raytracing/smallpt.nim
# Threads off
nim c -d:danger --threads:off -o:build/ray_single demos/raytracing/smallpt.nim
g++ -O3 -o build/ray_gcc_single demos/raytracing/smallpt.cpp
g++ -O3 -fopenmp -o build/ray_gcc_omp demos/raytracing/smallpt.cpp
clang++ -O3 -o build/ray_clang_single demos/raytracing/smallpt.cpp
clang++ -O3 -fopenmp -o build/ray_clang_omp demos/raytracing/smallpt.cpp
```
Then run for 300 samples with
```
build/ray_threaded 300
# ...
build/ray_clang_omp 300
```
### Results & Analysis
GCC 10 has a significant OpenMP regression
| Bench | Nim | Clang C++ OpenMP | GCC 10 C++ OpenMP | GCC 8 C++ OpenMP |
| ---------------- | ----------: | ---------------: | ----------------: | ---------------: |
| Single-threaded  | 4m43.369s   | 4m51.052s        | 4m50.934s         | 4m50.648s        |
| Multithreaded    | 12.977s     | 14.428s          | 2m14.616s         | 12.244s          |
| Nested-parallel | 12.981s | | | |
| Parallel speedup | 21.83x | 20.17x | 2.16x | 23.74x |
Single-threaded Nim is 2.7% faster than Clang C++.
Multithreaded Nim via Weave is 11.1% faster than Clang C++.
GCC 8, despite a simpler OpenMP design (a global task queue instead of work-stealing),
achieves a better speedup than both Weave and Clang.
In that case, I expect it's because the tasks are so big that there is minimal contention
on the task queue; furthermore, the OpenMP schedule is "dynamic", so we avoid the worst-case scenario
of static scheduling, where a bunch of threads are assigned easy rays that never collide with a surface
and a couple of threads are drowned in complex rays.
I have absolutely no idea of what happened to OpenMP in GCC 10.
Note: I only have 18 cores but we observe speedups in the 20x range
with Weave and LLVM. This is probably due to 2 factors:
- Raytracing is pure compute, contrary in particular to high-performance computing
  and machine learning workloads, which are also very memory-intensive (matrices and tensors with thousands to millions of elements).
  The scene has only 10 objects and a camera to keep track of.
  Memory is extremely slow: you can do 100 additions while waiting for data
  in the L2 cache.
- A CPU has a certain number of execution ports and can use instruction-level parallelism (we call that superscalar). Hyperthreading is a way to
  use those extra execution ports. However, in many computing workloads
  the sibling cores are also competing for memory bandwidth.
  From the previous point, this is not true for raytracing,
  and so we enjoy superlinear speedup.
## License
Kevin Beason's code is licensed as follows (e-mail redacted to avoid spam):
```
LICENSE
Copyright (c) 2006-2008 Kevin Beason (<surname>.<name>@gmail.com)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```

Binary file not shown.

After

Width: | Height: | Size: 1.7 MiB

Binary file not shown.

After

Width: | Height: | Size: 1.8 MiB

View File

@ -0,0 +1,103 @@
#include <math.h> // smallpt, a Path Tracer by Kevin Beason, 2008
#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <stdio.h> // Remove "-fopenmp" for g++ version < 4.2
struct Vec { // Usage: time ./smallpt 5000 && xv image.ppm
double x, y, z; // position, also color (r,g,b)
Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross:
Vec operator%(Vec&b){return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);}
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance()
struct Sphere {
double rad; // radius
Vec p, e, c; // position, emission, color
Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
double intersect(const Ray &r) const { // returns distance, 0 if nohit
Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
if (det<0) return 0; else det=sqrt(det);
return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
}
};
Sphere spheres[] = {//Scene: radius, position, emission, color, material
Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),//Back
Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),//Frnt
Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),//Botm
Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.999, SPEC),//Mirr
Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(1,1,1)*.999, REFR),//Glas
Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12), Vec(), DIFF) //Lite
};
inline double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); }
inline bool intersect(const Ray &r, double &t, int &id){
double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20;
for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
return t<inf;
}
Vec radiance(const Ray &r, int depth, unsigned short *Xi){
double t; // distance to intersection
int id=0; // id of intersected object
if (!intersect(r, t, id)) return Vec(); // if miss, return black
const Sphere &obj = spheres[id]; // the hit object
Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return obj.e; //R.R.
if (obj.refl == DIFF){ // Ideal DIFFUSE reflection
double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();
return obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
} else if (obj.refl == SPEC) // Ideal SPECULAR reflection
return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
Ray reflRay(x, r.d-n*2*n.dot(r.d)); // Ideal dielectric REFRACTION
bool into = n.dot(nl)>0; // Ray from outside going in?
double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) // Total internal reflection
return obj.e + f.mult(radiance(reflRay,depth,Xi));
Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
return obj.e + f.mult(depth>2 ? (erand48(Xi)<P ? // Russian roulette
radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
// For determinism due to the RNG: C++ executes the right side first.
radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
}
int main(int argc, char *argv[]){
int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples
Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
for (int y=0; y<h; y++){ // Loop over image rows
fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++) // Loop cols
for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++) // 2x2 subpixel rows
for (int sx=0; sx<2; sx++, r=Vec()){ // 2x2 subpixel cols
for (int s=0; s<samps; s++){
double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
// Warning: The d in "cam.o+d*140" is actually d.norm()
// due to right to left execution and norm() working
// both in-place and out-place
r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
} // Camera rays are pushed ^^^^^ forward to start in interior
c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
}
}
FILE *f = fopen("image.ppm", "w"); // Write image to PPM file.
fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
for (int i=0; i<w*h; i++)
fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

View File

@ -0,0 +1,339 @@
# Weave
# Copyright (c) 2020 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Warning: the original code-golfing is pretty extreme.
# The worst part is declaring and resetting non-loop variables
# in the for-loop header, including the random seed ...
import std/[math, strformat, os, strutils]
type Vec = object
x, y, z: float64
func vec(x, y, z: float64 = 0): Vec =
Vec(x: x, y: y, z: z)
func `+`(a, b: Vec): Vec =
vec(a.x+b.x, a.y+b.y, a.z+b.z)
func `-`(a, b: Vec): Vec =
vec(a.x-b.x, a.y-b.y, a.z-b.z)
func `*`(a: Vec, t: float64): Vec =
vec(a.x*t, a.y*t, a.z*t)
func `*.`(a, b: Vec): Vec =
# hadamard product
vec(a.x*b.x, a.y*b.y, a.z*b.z)
func norm(a: Vec): Vec =
# Warning! Warning! The original code
# mutate the vec in-place AND returns a reference to the mutated input
a * (1/sqrt(a.x*a.x + a.y*a.y + a.z*a.z))
func dot(a, b: Vec): float64 =
a.x*b.x + a.y*b.y + a.z*b.z
func cross(a, b: Vec): Vec =
vec(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x)
type
Ray = object
o, d: Vec # origin, direction
Reflection = enum
Diffuse, Specular, Refractive
Sphere = object
radius: float64
pos, emit, color: Vec
refl: Reflection
func ray(origin: Vec, direction: Vec): Ray =
result.o = origin
result.d = direction
func intersect(self: Sphere, ray: Ray): float64 =
## Returns distance, 0 if no hit
## Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
let op = self.pos - ray.o
const eps = 1e-4
let b = op.dot(ray.d)
var det = b*b - op.dot(op) + self.radius*self.radius
if det < 0.0:
return 0.0
det = sqrt(det)
block:
let t = b-det
if t > eps:
return t
block:
let t = b+det
if t > eps:
return t
return 0.0
const spheres = [ # Scene: radius, position, emission, color, material # Walls approximated by very large spheres
Sphere(radius:1e5, pos:vec( 1e5+1, 40.8, 81.6), color:vec(0.75,0.25,0.25),refl:Diffuse), # Left
Sphere(radius:1e5, pos:vec(-1e5+99, 40.8, 81.6), color:vec(0.25,0.25,0.75),refl:Diffuse), # Right
Sphere(radius:1e5, pos:vec(50, 40.8, 1e5), color:vec(0.75,0.75,0.75),refl:Diffuse), # Back
Sphere(radius:1e5, pos:vec(50, 40.8,-1e5+170), refl:Diffuse), # Front
Sphere(radius:1e5, pos:vec(50, 1e5, 81.6), color:vec(0.75,0.75,0.75),refl:Diffuse), # Bottom
Sphere(radius:1e5, pos:vec(50, -1e5+81.6, 81.6), color:vec(0.75,0.75,0.75),refl:Diffuse), # Top
Sphere(radius:16.5,pos:vec(27, 16.5, 47), color:vec(1,1,1)*0.999, refl:Specular), # Mirror
Sphere(radius:16.5,pos:vec(73, 16.5, 78), color:vec(1,1,1)*0.999, refl:Refractive),# Glass
Sphere(radius:600, pos:vec(50, 681.6-0.27,81.6), emit:vec(12,12,12), refl:Diffuse), # Light
]
func clamp(x: float64): float64 {.inline.} =
if x < 0: 0.0 else: (if x > 1: 1.0 else: x)
func toInt(x: float64): int32 =
# This seems to do gamma correction by 2.2
int32(
pow(clamp(x),1/2.2) * 255 + 0.5
)
func intersect(r: Ray, t: var float64, id: var int32): bool =
# out parameters ...
const inf = 1e20
t = inf
for i in countdown(spheres.len-1, 0):
let d = spheres[i].intersect(r)
if d != 0 and d < t:
t = d
id = i.int32
return t < inf
when defined(cpp):
# Seems like Nim codegen for mutable arrays is slightly different from the C++ API
# and needs a compatibility shim
proc erand48(xi: ptr cushort): cdouble {.importc, header:"<stdlib.h>", sideeffect.}
proc erand48(xi: var array[3, cushort]): float64 {.inline.} =
erand48(xi[0].addr)
else:
# Need the same RNG for comparison
proc erand48(xi: var array[3, cushort]): cdouble {.importc, header:"<stdlib.h>", sideeffect.}
proc radiance(r: Ray, depth: int32, xi: var array[3, cushort]): Vec =
var t: float64 # distance to intersection
var id = 0'i32 # id of intersected object
if not r.intersect(t, id): # if miss return black
return vec()
template obj: untyped = spheres[id] # alias the hit object
let x = r.o + r.d * t
let n = norm(x-obj.pos);
let nl = if n.dot(r.d) < 0: n else: n * -1
var f = obj.color
let p = max(f.x, max(f.y, f.z)) # max reflect
let depth=depth+1
if depth>5:
if erand48(xi) < p:
f = f*(1/p)
else:
return obj.emit # Russian Roulette
if obj.refl == Diffuse: # ideal diffuse reflection
let
r1 = 2*PI*erand48(xi)
r2 = erand48(xi)
r2s = sqrt(r2)
w = nl
u = (if w.x.abs() > 0.1: vec(0,1) else: vec(1)).cross(w).norm()
v = w.cross(u)
d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm()
return obj.emit + f *. radiance(ray(x, d), depth, xi)
elif obj.refl == Specular: # ideal specular reflection
return obj.emit + f *. radiance(ray(x, r.d - n*2*n.dot(r.d)), depth, xi)
# Dielectric refraction
let
reflRay = ray(x, r.d - n*2*n.dot(r.d))
into = n.dot(nl) > 0 # Ray from outside going in
nc = 1.0
nt = 1.5
nnt = if into: nc/nt else: nt/nc
ddn = r.d.dot(nl)
cos2t = 1-nnt*nnt*(1-ddn*ddn)
if cos2t < 0:
return obj.emit + f *. radiance(reflRay, depth, xi)
let
tdir = (r.d*nnt - n*(if into: 1 else: -1)*(ddn*nnt+sqrt(cos2t))).norm()
a = nt - nc
b = nt + nc
R0 = a*a/(b*b)
c = 1 - (if into: -ddn else: tdir.dot(n))
Re = R0 + (1-R0)*c*c*c*c*c
Tr = 1-Re
P = 0.25+0.5*Re
RP = Re/P
TP = Tr/(1-P)
return obj.emit + f *. (block:
if depth>2: # Russian roulette
if erand48(xi)<P:
radiance(reflRay, depth, xi)*RP
else:
radiance(ray(x, tdir), depth, xi)*TP
else:
# Note: to exactly reproduce the C++ result,
# since we have a random function and C++ seems to resolve
# from right to left, we exchange our processing order
radiance(ray(x, tdir), depth, xi)*Tr +
radiance(reflRay, depth, xi)*Re
)
proc tracer_single(C: var seq[Vec], w, h: static int, samples: int) =
const
cam = ray(vec(50,52,295.6), vec(0,-0.042612,-1).norm())
cx = vec(w*0.5135/h)
cy = cx.cross(cam.d).norm()*0.5135
for y in 0 ..< h: # Loop over image rows
stderr.write &"\rRendering ({samples*4} samples per pixel) {100.0*y.float64/float(h-1):5.2f}%"
var xi = [cushort 0, 0, cushort y*y*y]
for x in 0 ..< w: # Loop over columns
let i = (h-y-1)*w+x
for sy in 0 ..< 2: # 2x2 subpixel rows
for sx in 0 ..< 2: # 2x2 subpixel cols
var r = vec()
for s in 0 ..< samples:
let
r1 = 2*erand48(xi)
dx = if r1<1: sqrt(r1)-1 else: 1-sqrt(2-r1)
r2 = 2*erand48(xi)
dy = if r2<1: sqrt(r2)-1 else: 1-sqrt(2-r2)
d = cx*(((sx.float64 + 0.5 + dx)/2 + x.float64)/w - 0.5) +
cy*(((sy.float64 + 0.5 + dy)/2 + y.float64)/h - 0.5) + cam.d
let dnorm = d.norm() # Warning, the original code is deceptive since d is modified by d.norm()
let ray = ray(cam.o + dnorm*140'f64, dnorm)
let rad = radiance(ray, depth = 0, xi)
r = r + rad * (1.0/samples.float64)
C[i] = C[i] + vec(r.x.clamp(), r.y.clamp(), r.z.clamp()) * 0.25 # / num subpixels
when compileOption("threads"):
import ../../threadpool
proc tracer_threaded(C: var seq[Vec], w, h: static int, samples: int) =
# This gives the exact same result as single-threaded and GCC OpenMP,
# assuming that C++ resolves function calls from right to left
const
cam = ray(vec(50,52,295.6), vec(0,-0.042612,-1).norm())
cx = vec(w*0.5135/h)
cy = cx.cross(cam.d).norm()*0.5135
# We need the buffer raw address
let buf = cast[ptr UncheckedArray[Vec]](C[0].addr)
var tp = Threadpool.new()
tp.parallelFor y in 0 ..< h: # Loop over image rows
captures: {tp, buf, samples}
try:
stderr.write &"\rRendering ({samples*4} samples per pixel) {100.0*y.float64/float(h-1):5.2f}%"
except:
echo getCurrentExceptionMsg()
quit 1
var xi = [cushort 0, 0, cushort y*y*y]
for x in 0 ..< w: # Loop over columns
let i = (h-y-1)*w+x
for sy in 0 ..< 2: # 2x2 subpixel rows
for sx in 0 ..< 2: # 2x2 subpixel cols
var r = vec()
for s in 0 ..< samples:
let
r1 = 2*erand48(xi)
dx = if r1<1: sqrt(r1)-1 else: 1-sqrt(2-r1)
r2 = 2*erand48(xi)
dy = if r2<1: sqrt(r2)-1 else: 1-sqrt(2-r2)
d = cx*(((sx.float64 + 0.5 + dx)/2 + x.float64)/w - 0.5) +
cy*(((sy.float64 + 0.5 + dy)/2 + y.float64)/h - 0.5) + cam.d
let dnorm = d.norm() # Warning, the original code is deceptive since d is modified by d.norm()
let ray = ray(cam.o + dnorm*140'f64, dnorm)
let rad = radiance(ray, depth = 0, xi)
r = r + rad * (1.0/samples.float64)
buf[i] = buf[i] + vec(r.x.clamp(), r.y.clamp(), r.z.clamp()) * 0.25 # / num subpixels
tp.shutdown() # This implicitly blocks until tasks are all done
proc tracer_nested_parallelism(C: var seq[Vec], w, h: static int, samples: int) =
# The results are different since the RNG will not be seeded the same.
# The RNG needs to be thread-local, but task stealing + nested parallelism makes the
# actual executing thread pretty random.
# So the RNG has been moved to an inner scope;
# the downside is that the resulting images have horizontal noise instead of pointwise noise.
const
cam = ray(vec(50,52,295.6), vec(0,-0.042612,-1).norm())
cx = vec(w*0.5135/h)
cy = cx.cross(cam.d).norm()*0.5135
# We need the buffer raw address
let buf = cast[ptr UncheckedArray[Vec]](C[0].addr)
var tp = Threadpool.new()
tp.parallelFor y in 0 ..< h: # Loop over image rows
captures: {buf, samples}
try:
stderr.write &"\rRendering ({samples*4} samples per pixel) {100.0*y.float64/float(h-1):5.2f}%"
except:
echo getCurrentExceptionMsg()
quit 1
tp.parallelFor x in 0 ..< w: # Loop over columns
captures: {y, buf, samples}
var xi = [cushort 0, 0, cushort y*y*y]
let i = (h-y-1)*w+x
for sy in 0 ..< 2: # 2x2 subpixel rows
for sx in 0 ..< 2: # 2x2 subpixel cols
var r = vec()
for s in 0 ..< samples:
let
r1 = 2*erand48(xi)
dx = if r1<1: sqrt(r1)-1 else: 1-sqrt(2-r1)
r2 = 2*erand48(xi)
dy = if r2<1: sqrt(r2)-1 else: 1-sqrt(2-r2)
d = cx*(((sx.float64 + 0.5 + dx)/2 + x.float64)/w - 0.5) +
cy*(((sy.float64 + 0.5 + dy)/2 + y.float64)/h - 0.5) + cam.d
let dnorm = d.norm() # Warning, the original code is deceptive since d is modified by d.norm()
let ray = ray(cam.o + dnorm*140'f64, dnorm)
let rad = radiance(ray, depth = 0, xi)
r = r + rad * (1.0/samples.float64)
buf[i] = buf[i] + vec(r.x.clamp(), r.y.clamp(), r.z.clamp()) * 0.25 # / num subpixels
tp.shutdown() # This implicitly blocks until tasks are all done
proc main() =
const w = 1024
const h = 768
var samples = 1
var outFile = "image.ppm"
let exeName = getAppFilename().extractFilename()
if paramCount() == 0:
echo &"Usage: {exeName} <samples per pixel:{samples*4}> <output image:{outFile}>"
echo &"Running with default samples = {samples*4}, output = {outFile}"
elif paramCount() == 1:
samples = paramStr(1).parseInt() div 4 # (2*2 blocks)
samples = max(samples, 1)
elif paramCount() == 2:
samples = paramStr(1).parseInt() div 4
samples = max(samples, 1)
outFile = paramStr(2)
else:
echo &"Usage: {exeName} <samples per pixel:{samples}> <output image:{outFile}>"
var C = newSeq[Vec](w*h)
when compileOption("threads"):
echo "Running multithreaded. (⚠️ Stack overflow at depth 4480 (400 samples)?)"
# tracer_threaded(C, w, h, samples)
tracer_nested_parallelism(C, w, h, samples)
else:
echo "Running single-threaded."
tracer_single(C, w, h, samples)
let f = open(outFile, mode = fmWrite)
defer: f.close()
f.write &"P3\n{w} {h}\n255\n"
for i in 0 ..< w*h:
f.write &"{C[i].x.toInt()} {C[i].y.toInt()} {C[i].z.toInt()} "
stderr.write "\nDone.\n"
main()

View File

@ -0,0 +1,109 @@
# Threadpool design
The threadpool design is heavily inspired by [Weave](https://github.com/mratsim/weave), the wealth of preparatory [research](https://github.com/mratsim/weave/tree/master/research) and the simplified Weave, [nim-taskpools](https://github.com/status-im/nim-taskpools)
The goal is to produce an extremely high-performance, low-overhead, energy-efficient multithreading runtime.
However, as the backend to a cryptographic library it needs to be high-assurance, in particular auditable and maintainable.
Unfortunately, Weave's design, based on work-requesting, requires more machinery compared to work-stealing, which means more state. Furthermore, it includes a custom memory pool.
On the other hand, nim-taskpools does not support data parallelism (parallel for loop).
Also neither supports putting awaiting threads to sleep when the future they want to complete is not done AND there is no work left.
## Features
| Features | OpenMP | Weave | nim-taskpools | Constantine's Threadpool |
|--------------------------------------------------------------------------------------------------|------------------------------------------------------------|----------------------------------------------------|---------------|----------------------------------------------------|
| Task parallelism (Futures with spawn/sync) | no | yes | yes | yes |
| Data parallelism (parallel for-loop) | yes | yes | no | yes |
| Nested parallel-for regions support | no (lead to oversubscription) | yes | N/A | yes |
| Dataflow parallelism (Tasks triggered by events / precise task dependencies) | yes | yes | no | yes |
| Communication mechanisms | Shared-memory | Message-passing / Channels | Shared-memory | Shared-memory |
| Load balancing strategy | static (GCC), work-stealing (Intel/LLVM) | work-sharing / work-requesting | work-stealing | work-stealing |
| Blocked tasks don't block runtime | N/A | no | yes | yes |
| Load-balancing strategy for task parallelism (important for fine-grained parallelism) | global queue (GCC), steal-one (Intel/LLVM) | Adaptative steal-one/steal-half | steal-one | steal-one (steal-half WIP) |
| Load-balancing strategy for data parallelism | eager splitting depending on iteration count and cpu count | lazy splitting depending on idle CPUs and workload | N/A | lazy splitting depending on idle CPUs and workload |
| Backoff worker when idle | yes (?) | yes | yes | yes |
| Backoff worker when awaiting task but no work | N/A | no | no | yes |
| Scheduler overhead/contention (measured on Fibonacci 40), important for fine-grained parallelism | Extreme: frozen runtime (GCC), high (Intel/LLVM) | low to very low | medium | low |
## Key features design
### Load-balancing for data parallelism
A critical issue in most (all?) runtimes used in HPC (OpenMP and Intel TBB in particular) is that they split their parallel for loops ahead of time.
They do not know how many idle threads there are, or how costly the workload will be. This leads to significant inefficiencies and performance unportability.
For example, this repo https://github.com/zy97140/omp-benchmark-for-pytorch gives, for common float operations, the element-count thresholds under which parallelization is not profitable or even hurts performance:
> |CPU Model|Sockets|Cores/Socket|Frequency|
> |---|---|---|---|
> |Intel(R) Xeon(R) CPU E5-2699 v4 |2|22|2.20GHz|
> |Intel(R) Xeon(R) Platinum 8180 CPU|2|28|2.50GHz|
> |Intel(R) Core(TM) i7-5960X CPU |1|8|3.00GHz|
>
> | |Xeon(R) Platinum 8180 CPU|Xeon(R) CPU E5-2699 v4| i7-5960X CPU|
> |---|------------------------:|---------------------:|------------:|
> |copy|80k|20k|8k|
> |add |80k|20k|8k|
> |div |50k|10k|2k|
> |exp |1k |1k |1k|
> |sin |1k |1k |1k|
> |sum |1k |1k |1k|
> |prod|1k |1k |1k|
>
> Details on the Xeon Platinum
>
> |Tensor Size|In series|In parallel|SpeedUp|
> |---|---:|---:|---:|
> |1k |1.04 |5.15| 0.20X |
> |2k |1.23 |5.47| 0.22X |
> |3k |1.33 |5.34| 0.24X |
> |4k |1.47 |5.41| 0.27X |
> |5k |1.48 |5.40| 0.27X |
> |8k |1.81 |5.55| 0.32X |
> |10k|1.98 |5.66| 0.35X |
> |20k|2.74 |6.74| 0.40X |
> |50k|5.12 |6.59| 0.77X |
> |__80k__|__14.79__|__6.59__| __2.24X__ |
> |__100k__|__21.97__|__6.70__| __3.27X__ |
Instead we can have each thread start working and use backpressure to lazily evaluate when it's profitable to split:
- Lazy Binary-Splitting: A Run-Time Adaptive Work-Stealing Scheduler
Tzannes, Caragea, Barua, Vishkin
https://terpconnect.umd.edu/~barua/ppopp164.pdf
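
As a minimal sketch of the idea, using the loop-task fields introduced in this PR (`WorkerContext`, `numIdleWorkersApprox` and `schedule` are illustrative placeholders, not this PR's exact internals): a worker keeps its whole range while everyone is busy, and gives half of its remaining iterations away only when it detects idle peers.

```nim
proc splitLoopIfProfitable(ctx: var WorkerContext, task: ptr Task) =
  ## Backpressure-driven splitting: no eager, ahead-of-time division.
  if task.loopStepsLeft > 1 and ctx.numIdleWorkersApprox() > 0:
    let stepsToGive = task.loopStepsLeft shr 1
    let mid = task.loopStart + (task.loopStepsLeft - stepsToGive) * task.loopStride
    let upperHalf = Task.newLoop(
      parent = task,
      start = mid, stop = task.loopStop, stride = task.loopStride,
      isFirstIter = false,
      fn = task.fn)
    task.loopStop = mid                 # keep the lower half ourselves
    task.loopStepsLeft -= stepsToGive
    ctx.schedule(upperHalf)
```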
### Backoff workers when awaiting a future
This problem is quite tricky:
- For latency, and to potentially create more work ASAP, we want such a worker to follow through on the blocked future ASAP.
- For throughput, and because a scheduler is optimal only when greedy, we want an idle thread to take any available work.
- But what if the work then blocks that worker for 1s? This hurts latency and might lead to work starvation if the continuation would have created more work.
- There is no robust, cross-platform API, to wake a specific thread awaiting on a futex or condition variable.
- The simplest design would then be an array of futexes, with backing-off workers sleeping on those.
  The issue is that when offering work you have to scan that array to find a worker to wake.
  Contrary to an idle worker, the waker is working, so this scan hurts throughput and latency and, due to the many
  atomic operations required, will completely thrash that worker's cache.
- Another potential data structure would be a concurrent sparse-set, but designing concurrent data structures is difficult,
  and locking would be expensive for an active worker.
The solution in Constantine's threadpool, to minimize latency, maximize throughput, and avoid implementing yet another concurrent data structure, is
to have "reserve threads". Before sleeping when blocked on a future, a thread wakes a reserve thread. This maintains throughput, and the blocked thread is immediately available
when the future is completed. A well-behaved program will always have at least 1 thread among N making progress, so a reserve of size N is sufficient.
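
A hedged sketch of that protocol, as seen from a thread blocking on a future (`WorkerContext`, `tryStealOne`, `run`, `wakeReserve` and `park` are illustrative names, not this PR's exact internals; `completed` and `sync` come from the tasks/flowvars code in this diff):

```nim
proc syncBlocking[T](ctx: var WorkerContext, fv: Flowvar[T]): T =
  ## Stay greedy while the future is pending; only once no work
  ## is left, hand our CPU slot to a reserve thread and park.
  while not fv.task.completed.load(moAcquire):
    let task = ctx.tryStealOne()
    if not task.isNil:
      ctx.run(task)       # keep making progress on other work
    else:
      ctx.wakeReserve()   # keep N threads running while we block
      ctx.park()          # woken when the future completes
  result = sync(fv)       # now completes without blocking
```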
### Scheduler overhead/contention
To enable fine-grained parallelism, i.e. parallelizing tasks in the microseconds range, it's critical to reduce contention.
A global task queue will be hammered by N threads, leading to threads thrashing each other's caches.
In contrast, distributed task queues with random victim selection significantly reduce contention.
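
For reference, the steal side of such a scheme looks like the sketch below; `steal` matches this PR's taskqueue signature, while `WorkerContext` and the RNG helper are illustrative:

```nim
proc tryStealOne(ctx: var WorkerContext): ptr Task =
  ## Random victim selection: each worker probes the distributed
  ## queues in a random order, so no single queue becomes a hotspot.
  for _ in 0 ..< ctx.numWorkers:
    let victim = ctx.rng.randomVictim(ctx.numWorkers) # hypothetical RNG helper
    if victim == ctx.workerID:
      continue
    result = steal(ctx.workerID, ctx.taskqueues[victim])
    if not result.isNil:
      return result
  return nil
```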
Another source of overhead is the allocator. The worst case for allocators is allocating in one thread and deallocating in another, especially if the
allocating thread is always the same. Unfortunately, this is common in producer-consumer workloads.
Besides, multithreaded allocations/deallocations trigger costly atomic swaps and possibly fragmentation.
Minimizing allocations to the utmost will significantly help with fine-grained tasks.
- Weave solved that problem by having 2 levels of cache: a memory-pool for tasks and futures and a lookaside list that caches tasks to reduce further pressure on the memory pool.
- Nim-taskpools does not address this problem; it has an allocation overhead per task of 1 for std/tasks, 1 for the linked list that holds them, and 1 for the result channel/flowvar.
  Unlike GCC OpenMP, which freezes on a fibonacci 40 benchmark, it can still finish, but it's 20x slower than Weave.
- Constantine's threadpool solves the problem by making everything intrusive to a task: the task env, the future, the linked list.
In fact this solution is even faster than Weave's, probably due to significantly less page faults and cache misses.
Note that Weave has an even faster mode when futures don't escape their function by allocating them on the stack but without compiler support (heap allocation elision) that restricts the programs you can write.
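
For illustration, here is what that intrusiveness means concretely, pieced together from the `newSpawn`/`readyWith` code earlier in this diff (a sketch, not a verbatim excerpt):

```nim
# One allocation per task: the metadata plus the inline `env` buffer.
let task = allocHeapUnchecked(Task, sizeof(Task) + sizeof(env))
# For a task with a future, `env` is laid out as
#   | ptr Task (self-reference) | result | arg₀ | ... | argₙ |
# so completing the future is a store into the task's own memory:
cast[ptr (ptr Task, T)](task.env.addr)[1] = res   # cf. readyWith
task.completed.store(true, moRelease)
```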

View File

@ -0,0 +1,60 @@
import ../threadpool, ../instrumentation
block:
proc main() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e03_parallel_for.nim'"
echo "=============================================================================================="
var tp = Threadpool.new(numThreads = 4)
tp.parallelFor i in 0 ..< 100:
log("%d\n", i)
tp.shutdown()
echo "Simple parallel for"
echo "-------------------------"
main()
echo "-------------------------"
block: # Capturing outside scope
proc main2() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e03_parallel_for.nim'"
echo "=============================================================================================="
var tp = Threadpool.new(numThreads = 4)
var a = 100
var b = 10
tp.parallelFor i in 0 ..< 10:
captures: {a, b}
log("a+b+i = %d \n", a+b+i)
tp.shutdown()
echo "\n\nCapturing outside variables"
echo "-------------------------"
main2()
echo "-------------------------"
block: # Nested loops
proc main3() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e03_parallel_for.nim'"
echo "=============================================================================================="
var tp = Threadpool.new(numThreads = 4)
tp.parallelFor i in 0 ..< 4:
tp.parallelFor j in 0 ..< 8:
captures: {i}
log("Matrix[%d, %d]\n", i, j)
tp.shutdown()
echo "\n\nNested loops"
echo "-------------------------"
main3()
echo "-------------------------"

View File

@ -0,0 +1,34 @@
import ../threadpool
block:
proc main() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e04_parallel_reduce.nim'"
echo "=============================================================================================="
proc sumReduce(tp: Threadpool, n: int): int64 =
tp.parallelFor i in 0 .. n:
reduceInto(globalSum: int64):
prologue:
var localSum = 0'i64
forLoop:
localSum += int64(i)
merge(remoteSum: Flowvar[int64]):
localSum += sync(remoteSum)
epilogue:
return localSum
result = sync(globalSum)
var tp = Threadpool.new(numThreads = 4)
let sum1M = tp.sumReduce(1000000)
echo "Sum reduce(0..1000000): ", sum1M
doAssert sum1M == 500_000_500_000'i64
tp.shutdown()
echo "Simple parallel reduce"
echo "-------------------------"
main()
echo "-------------------------"

View File

@ -14,6 +14,10 @@ template log*(args: varargs[untyped]): untyped =
c_printf(args)
flushFile(stdout)
template debugSplit*(body: untyped): untyped =
when defined(TP_DebugSplit) or defined(TP_Debug):
{.noSideEffect, gcsafe.}: body
template debugTermination*(body: untyped): untyped =
when defined(TP_DebugTermination) or defined(TP_Debug):
{.noSideEffect, gcsafe.}: body

View File

@ -1,6 +1,5 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
@ -8,111 +7,128 @@
import
std/macros,
./crossthread/tasks_flowvars
./crossthread/tasks_flowvars,
../ast_rebuilder
# Task parallelism - spawn
# ---------------------------------------------
# Parallel offloading API
# -----------------------
# This file implements all the macros necessary
# to provide a comprehensive and hopefully intuitive API
# for all the parallelism paradigms supported:
#
# - Task parallelism
# - Data parallelism / parallel for
# - parallel-for with thread-local prologue and epilogue
# - parallel-reduction without atomics or locks
# - Dataflow parallelism
# - also known as:
# - Graph parallelism
# - Stream parallelism
# - Pipeline parallelism
# - Data-driven (task) parallelism
# with precise input/output dependencies
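#
# A quick usage sketch of the user-facing API built in this file
# (see the examples directory in this PR for runnable versions):
#
#   let fut = tp.spawn f(a, b)     # task parallelism, returns a Flowvar
#   tp.parallelFor i in 0 ..< n:   # data parallelism
#     captures: {buf}
#     buf[i] += 1
#   echo fut.sync()                # await the future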
# ############################################################
# #
# Task parallelism #
# #
# ############################################################
proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
# Create the async function
let fn = funcCall[0]
let fnName = $fn
let withArgs = args.len > 0
let async_fn = ident("async_" & fnName)
var fnCall = newCall(fn)
let data = ident("data") # typed pointer to data
let tpSpawn_closure = ident("ctt_tpSpawnVoidClosure_" & fnName)
var loopFnCall = newCall(fn)
let env = ident("ctt_tpSpawnVoidEnv_") # typed pointer to env
# Schedule
let task = ident"task"
let task = ident"ctt_tpSpawnVoidTask_"
let scheduleBlock = newCall(schedule, workerContext, task)
result = newStmtList()
if funcCall.len == 2:
# With only 1 arg, the tuple syntax doesn't construct a tuple
# let data = (123) # is an int
fnCall.add nnkDerefExpr.newTree(data)
# let env = (123) # is an int
loopFnCall.add nnkDerefExpr.newTree(env)
else: # This handles the 0 arg case as well
for i in 1 ..< funcCall.len:
fnCall.add nnkBracketExpr.newTree(
data,
newLit i-1
)
loopFnCall.add nnkBracketExpr.newTree(
env,
newLit i-1)
# Create the async call
result.add quote do:
proc `async_fn`(param: pointer) {.nimcall.} =
proc `tpSpawn_closure`(env: pointer) {.nimcall, gcsafe, raises: [].} =
when bool(`withArgs`):
let `data` = cast[ptr `argsTy`](param)
`fnCall`
let `env` = cast[ptr `argsTy`](env)
`loopFnCall`
# Create the task
result.add quote do:
block enq_deq_task:
when bool(`withArgs`):
let `task` = Task.new(
let `task` = Task.newSpawn(
parent = `workerContext`.currentTask,
fn = `async_fn`,
params = `args`)
fn = `tpSpawn_closure`,
env = `args`)
else:
let `task` = Task.new(
let `task` = Task.newSpawn(
parent = `workerContext`.currentTask,
fn = `async_fn`)
fn = `tpSpawn_closure`)
`scheduleBlock`
proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
# Create the async function
let fn = funcCall[0]
let fnName = $fn
let async_fn = ident("async_" & fnName)
var fnCall = newCall(fn)
let data = ident("data") # typed pointer to data
# Schedule
let task = ident"task"
let scheduleBlock = newCall(schedule, workerContext, task)
result = newStmtList()
let fn = funcCall[0]
let fnName = $fn
let tpSpawn_closure = ident("ctt_tpSpawnRetClosure_" & fnName)
var loopFnCall = newCall(fn)
let env = ident("ctt_tpSpawnRetEnv_") # typed pointer to env
# tasks have no return value.
# 1. The start of the task `data` buffer will store the return value for the flowvar and awaiter/sync
# 2. We create a wrapper async_fn without return value that send the return value in the channel
# 1. The start of the task `env` buffer will store the return value for the flowvar and awaiter/sync
# 2. We create a wrapper tpSpawn_closure without return value that send the return value in the channel
# 3. We package that wrapper function in a task
# We store the following in task.data:
# We store the following in task.env:
#
# | ptr Task | result | arg₀ | arg₁ | ... | argₙ
let fut = ident"fut"
let taskSelfReference = ident"taskSelfReference"
let retVal = ident"retVal"
let fut = ident"ctt_tpSpawnRetFut_"
let taskSelfReference = ident"ctt_taskSelfReference"
let retVal = ident"ctt_retVal"
var futArgs = nnkPar.newTree
var futArgsTy = nnkPar.newTree
futArgs.add taskSelfReference
futArgsTy.add nnkPtrTy.newTree(bindSym"Task")
futArgs.add retVal
futArgsTy.add retTy
var envParams = nnkPar.newTree
var envParamsTy = nnkPar.newTree
envParams.add taskSelfReference
envParamsTy.add nnkPtrTy.newTree(bindSym"Task")
envParams.add retVal
envParamsTy.add retTy
for i in 1 ..< funcCall.len:
futArgsTy.add getTypeInst(funcCall[i])
futArgs.add funcCall[i]
envParamsTy.add getTypeInst(funcCall[i])
envParams.add funcCall[i]
# data stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
# so arguments starts at data[2] in the wrapping funcCall functions
# env stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
# so arguments starts at env[2] in the wrapping funcCall functions
for i in 1 ..< funcCall.len:
fnCall.add nnkBracketExpr.newTree(
data,
newLit i+1
)
loopFnCall.add nnkBracketExpr.newTree(env, newLit i+1)
result.add quote do:
proc `async_fn`(param: pointer) {.nimcall.} =
let `data` = cast[ptr `futArgsTy`](param)
let res = `fnCall`
readyWith(`data`[0], res)
proc `tpSpawn_closure`(env: pointer) {.nimcall, gcsafe, raises: [].} =
let `env` = cast[ptr `envParamsTy`](env)
let res = `loopFnCall`
readyWith(`env`[0], res)
# Regenerate fresh ident, retTy has been tagged as a function call param
let retTy = ident($retTy)
let task = ident"ctt_tpSpawnRetTask_"
let scheduleBlock = newCall(schedule, workerContext, task)
# Create the task
result.add quote do:
@ -120,11 +136,11 @@ proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, sc
let `taskSelfReference` = cast[ptr Task](0xDEADBEEF)
let `retVal` = default(`retTy`)
let `task` = Task.new(
let `task` = Task.newSpawn(
parent = `workerContext`.currentTask,
fn = `tpSpawn_closure`,
env = `envParams`)
let `fut` = newFlowVar(`retTy`, `task`)
`scheduleBlock`
# Return the future
`fut`
@ -133,8 +149,8 @@ proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule:
funcCall.expectKind(nnkCall)
# Get the return type if any
let retTy = funcCall[0].getImpl[3][0]
let needFuture = retTy.kind != nnkEmpty
# Get a serialized type and data for all function arguments
# We use an ad-hoc tuple
@ -148,7 +164,628 @@ proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule:
if not needFuture:
result = spawnVoid(funcCall, args, argsTy, workerContext, schedule)
else:
result = spawnRet(funcCall, retTy, args, argsTy, workerContext, schedule)
# Wrap in a block for namespacing
result = nnkBlockStmt.newTree(newEmptyNode(), result)
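# Illustrative sketch of the generated code (assumed shape, not the literal macro
# output; `double` is a hypothetical user proc returning `int`):
#
#   let fv = tp.spawn double(21)
#
# expands roughly to:
#
#   proc ctt_tpSpawnRetClosure_double(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     let env = cast[ptr (ptr Task, int, int)](env)  # | ptr Task | result | arg₀ |
#     let res = double(env[2])
#     readyWith(env[0], res)
#
#   let taskSelfReference = cast[ptr Task](0xDEADBEEF) # sentinel, fixed up when the flowvar is created
#   let retVal = default(int)
#   let task = Task.newSpawn(parent = workerContext.currentTask,
#                            fn = ctt_tpSpawnRetClosure_double,
#                            env = (taskSelfReference, retVal, 21))
#   let fv = newFlowVar(int, task)
#   schedule(workerContext, task)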
# ############################################################
# #
# Data parallelism #
# #
# ############################################################
# Error message generation
# --------------------------------------------------------------------------------------------------
# This outputs nice syntax examples for the parallel reduction
# and parallel staged domain specific languages.
type Example = enum
Reduce
Staged
template parReduceExample() {.dirty.}=
# Used for a nice error message
proc parallelReduceExample(n: int): int64 =
tp.parallelFor i in 0 ..< n:
## Declare a parallelFor or parallelForStrided loop as usual
reduceInto(globalSum: int64):
## Indicate that the loop is a reduction and declare the global reduction variable to sync with
prologue:
## Declare your local reduction variable(s) here
## It should be initialized with the neutral element
## corresponding to your fold operation.
## (0 for addition, 1 for multiplication, -Inf for max, +Inf for min, ...)
##
## This is task-local (and thread-local), each task sets this section independently.
## Splitting into multiple tasks is done dynamically at the runtime's discretion
## depending on available parallelism and load.
var localSum = 0'i64
forLoop:
## This is the reduction loop
localSum += i
merge(remoteSum: FlowVar[int64]):
## Define how to merge with partial reductions from remote threads
## Remote threads' results come as Flowvars that need to be synced.
## Latency-hiding techniques can be used to overlap epilogue computations
## with other threads' syncs.
localSum += sync(remoteSum)
epilogue:
## Local task cleanup like memory allocated in prologue
## and returning the local accumulator
return localSum
## Await the parallel reduction
return sync(globalSum)
template parStagedExample() {.dirty.} =
# Used for a nice error message
proc parallelStagedSumExample(n: int): int =
## We will do a sum reduction to illustrate
## staged parallel for
## First take the address of the result
let res = result.addr
## Declare a parallelForStaged loop
tp.parallelForStaged i in 0 ..< n:
captures: {res}
prologue:
## Declare anything needed before the for-loop
## This will be thread-local, so each thread will run this section independently.
## The loop increment is not available here
var localSum = 0
forLoop:
## This is within the parallel loop
localSum += i
epilogue:
## Once the loop is finished, you have a final opportunity for processing.
## Thread-local cleanup should happen here as well
## Here we print the localSum and atomically increment the global sum
## before ending the task.
echo "localsum = ", localSum
res[].atomicInc(localSum)
## Await all tasks
tp.syncAll()
proc printReduceExample() =
let example = getAst(parReduceExample())
echo example.toStrLit()
proc printStagedExample() =
let example = getAst(parStagedExample())
echo example.toStrLit()
proc testKind(nn: NimNode, nnk: NimNodeKind, kind: Example) =
if nn.kind != nnk:
case kind
of Reduce: printReduceExample()
of Staged: printStagedExample()
nn.expectKind(nnk) # Gives nice line numbers
# Parallel Loop Domain Specific Language Descriptor
# --------------------------------------------------------------------------------------------------
type
LoopKind = enum
kForLoop
kReduction
kStaged
LoopDescriptor = object
## A loop descriptor fully describes a parallel loop
## before final code generation
##
## Fields are ordered by depth of the call stack:
## - The user defines the loop boundaries and captures
## - a closure with signature `proc MyFunctionName(env: pointer)`
## is generated
## - it gets packaged in a task
## - on task execution, the inner proc is reconstructed
## - That inner proc may have various sections depending on the loop kind
kind: LoopKind
# Loop bounds
# -----------
indexVariable: NimNode
start: NimNode
stopEx: NimNode
stride: NimNode
# Closure generation
# ------------------
envName: NimNode
closureName: NimNode
closureDef: NimNode
capturedVars: NimNode
capturedTypes: NimNode
# Task packaging and scheduling
# -----------------------------
taskName: NimNode
taskCreation: NimNode
workerContext: NimNode
scheduleFn: NimNode
# Parallel loop stages
# --------------------
# There are 3 call levels for loops:
# - closure(env: pointer) {.nimcall, gcsafe, raises: [].}
# - loopFn(args: ptr (argsTy₀, argsTy₁, ..., argsTyₙ)): returnType {.inline, nimcall, gcsafe, raises: [].}
# let (args₀, args₁, ..., argsₙ) = args[]
# loopTemplate(indexVar, prologue, loopBody, ...)
# - loopTemplate(indexVar, prologue, loopBody, ...: untyped)
#
# The last 2 levels are inline in the closure.
# - The closure deals with removing type erasure from an untyped environment and updating the future once the task is finished
# - The loopFn reinstalls the captured values
# - The loopTemplate reimplements the sections as well as runtime interaction
# for loop splitting checks and merging reduction accumulators with split tasks.
#
# A side-benefit of the loopFn is that it allows borrow-checking:
# - error if we capture a `var parameter`
# - error if we forget to capture a runtime variable (compile-time constants do not have to be captured)
loopFnName: NimNode # inner function called by the closure once environment is retyped
loopTemplate: NimNode # inner function implementation, defined in threadpool.nim
prologue: NimNode
forLoop: NimNode
epilogue: NimNode
# Futures - awaitable loops and reductions
# ----------------------------------------
globalAwaitable: NimNode
remoteTaskAwaitable: NimNode
awaitableType: NimNode
mergeLocalWithRemote: NimNode
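# Illustrative sketch of the three levels (assumed shapes, not the exact
# generated AST), for a loop like:
#   tp.parallelFor i in 0 ..< n:
#     captures: {a, b}
#     body
#
#   proc ctt_tpParForImpl_(env: ptr (typeof(a), typeof(b))) {.nimcall, gcsafe, inline, raises: [].} =
#     let (a, b) = env[]             # reinstall the captured values
#     parallelForWrapper(i):         # loop template defined in threadpool.nim
#       body
#
#   proc ctt_tpParForClosure_(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     ctt_tpParForImpl_(cast[ptr (typeof(a), typeof(b))](env))  # un-erase the environment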
# Parsing parallel loop DSL
# --------------------------------------------------------------------------------------------------
proc checkLoopBounds(loopBounds: NimNode) =
## Checks loop parameters
## --------------------------------------------------------
## loopBounds should have the form "i in 0..<10"
loopBounds.expectKind(nnkInfix)
assert loopBounds[0].eqIdent"in"
loopBounds[1].expectKind(nnkIdent)
loopBounds[2].expectKind(nnkInfix) # 0 ..< 10 / 0 .. 10, for now we don't support slice objects
assert loopBounds[2][0].eqIdent".." or loopBounds[2][0].eqIdent"..<"
proc parseLoopBounds(ld: var LoopDescriptor, loopBounds: NimNode) =
## Extract the index, start and stop of the loop
## Strides must be dealt with separately
let loopBounds = rebuildUntypedAst(loopBounds, dropRootStmtList = true)
checkLoopBounds(loopBounds)
ld.indexVariable = loopBounds[1]
ld.start = loopBounds[2][1]
ld.stopEx = loopBounds[2][2]
# We use exclusive bounds
if loopBounds[2][0].eqIdent"..":
ld.stopEx = newCall(ident"succ", ld.stopEx)
proc parseCaptures(ld: var LoopDescriptor, body: NimNode) =
## Extract captured variables from the for-loop body.
## Once extracted the section that declared those captures will be discarded.
##
## Returns the captured variable and the captured variable types
## in a tuple of nnkPar for easy use in tuple construction and destructuring.
# parallelFor i in 0 ..< 10:
# captures: a
# ...
#
# StmtList
# Call
# Ident "captures"
# StmtList
# Curly
# Ident "a"
# Rest of the body
for i in 0 ..< body.len:
if body[i].kind == nnkCall and body[i][0].eqIdent"captures":
ld.capturedVars = nnkPar.newTree()
ld.capturedTypes = nnkPar.newTree()
body[i][1].expectKind(nnkStmtList)
body[i][1][0].expectKind(nnkCurly)
for j in 0 ..< body[i][1][0].len:
ld.capturedVars.add body[i][1][0][j]
ld.capturedTypes.add newCall(ident"typeof", body[i][1][0][j])
# Remove the captures section
body[i] = nnkDiscardStmt.newTree(body[i].toStrLit)
return
proc extractSection(ldField: var NimNode, body: NimNode, sectionName: string) =
body.expectKind(nnkStmtList)
for i in 0 ..< body.len:
if body[i].kind == nnkCall and body[i][0].eqIdent(sectionName):
body[i][1].expectKind(nnkStmtList)
ldField = body[i][1]
# Remove the section
body[i] = nnkDiscardStmt.newTree(body[i].toStrLit)
return
# Code generation
# --------------------------------------------------------------------------------------------------
proc generateClosure(ld: LoopDescriptor): NimNode =
let env = ld.envName
let capturedTypes = ld.capturedTypes
let withCaptures = ld.capturedTypes.len > 0
let closureName = ld.closureName
var loopFnCall = newCall(ld.loopFnName)
if withCaptures:
loopFnCall.add(env)
case ld.kind
of kForLoop:
result = quote do:
proc `closureName`(env: pointer) {.nimcall, gcsafe, raises: [].} =
when bool(`withCaptures`):
let `env` = cast[ptr `capturedTypes`](env)
`loopFnCall`
of kReduction:
let retTy = ld.awaitableType
result = quote do:
proc `closureName`(env: pointer) {.nimcall, gcsafe, raises: [].} =
let taskSelfReference = cast[ptr ptr Task](env)
when bool(`withCaptures`):
let offset = cast[ByteAddress](env) +% sizeof((ptr Task, `retTy`))
let `env` = cast[ptr `capturedTypes`](offset)
let res = `loopFnCall`
readyWith(taskSelfReference[], res)
else:
error "Not Implemented"
proc generateAndScheduleLoopTask(ld: LoopDescriptor): NimNode =
result = newStmtList()
var withCaptures = false
if not ld.capturedVars.isNil:
withCaptures = true
# TODO: awaitable for loop
# Dependencies
# ---------------------------------------------------
var scheduleBlock: NimNode
let task = ident"ctt_tpLoopTask_"
# TODO: Dataflow parallelism / precise task dependencies
scheduleBlock = newCall(ld.scheduleFn, ld.workerContext, task)
# ---------------------------------------------------
let
(start, stopEx, stride) = (ld.start, ld.stopEx, ld.stride)
workerContext = ld.workerContext
(closureName, capturedVars) = (ld.closureName, ld.capturedVars)
(globalAwaitable, awaitableType) = (ld.globalAwaitable, ld.awaitableType)
if ld.awaitableType.isNil():
result = quote do:
block enq_deq_task: # block for namespacing
let start = `start` # Ensure single evaluation / side-effect
let stopEx = `stopEx`
if stopEx-start != 0:
when bool(`withCaptures`):
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`,
env = `capturedVars`)
else:
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`)
`scheduleBlock`
else:
result = quote do:
var `globalAwaitable`: FlowVar[`awaitableType`]
block enq_deq_task: # Block for namespacing
let start = `start` # Ensure single evaluation / side-effect
let stopEx = `stopEx`
if stopEx-start != 0:
let taskSelfReference = cast[ptr Task](0xDEADBEEF)
var retValBuffer = default(`awaitableType`)
when bool(`withCaptures`):
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`,
env = (taskSelfReference, retValBuffer, `capturedVars`))
else:
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`,
env = (taskSelfReference, retValBuffer))
`globalAwaitable` = newFlowVar(`awaitableType`, `task`)
`scheduleBlock`
proc generateParallelLoop(ld: LoopDescriptor): NimNode =
# Package a parallel for loop into a proc
# Returns the statements that implements it.
let pragmas = nnkPragma.newTree(
ident"nimcall", ident"gcsafe", ident"inline",
nnkExprColonExpr.newTree(ident"raises", nnkBracket.newTree())) # raises: []
var params: seq[NimNode]
if ld.awaitableType.isNil:
params.add newEmptyNode()
else:
params.add ld.awaitableType
var procBody = newStmtList()
if ld.capturedVars.len > 0:
params.add newIdentDefs(ld.envName, nnkPtrTy.newTree(ld.capturedTypes))
let derefEnv = nnkBracketExpr.newTree(ld.envName)
if ld.capturedVars.len > 1:
# Unpack the variables captured from the environment
# let (a, b, c) = env[]
var unpacker = nnkVarTuple.newTree()
ld.capturedVars.copyChildrenTo(unpacker)
unpacker.add newEmptyNode()
unpacker.add derefEnv
procBody.add nnkLetSection.newTree(unpacker)
else:
procBody.add newLetStmt(ld.capturedVars[0], derefEnv)
case ld.kind
of kForLoop:
procBody.add newCall(ld.loopTemplate, ld.indexVariable, ld.forLoop)
of kReduction:
procBody.add newCall(
ld.loopTemplate, ld.indexVariable,
ld.prologue, ld.forLoop, ld.mergeLocalWithRemote, ld.epilogue,
ld.remoteTaskAwaitable, ld.awaitableType)
else:
error "Unimplemented"
result = newProc(
name = ld.loopFnName,
params = params,
body = procBody,
pragmas = pragmas)
# Parallel for
# --------------------------------------------------------------------------------------------------
proc parallelForImpl*(workerContext, scheduleFn, loopTemplate, loopBounds, body: NimNode): NimNode =
## Parallel for loop
## Syntax:
##
## parallelFor i in 0 ..< 10:
## echo(i)
##
## Variables from the external scope need to be explicitly captured
##
## var a = 100
## var b = 10
## parallelFor i in 0 ..< 10:
## captures: {a, b}
## echo a + b + i
result = newStmtList()
var ld = LoopDescriptor(kind: kForLoop, workerContext: workerContext, scheduleFn: scheduleFn)
# Parse the loop Domain-Specific Language
# --------------------------------------------------------
body.expectKind(nnkStmtList)
ld.parseLoopBounds(loopBounds)
ld.stride.extractSection(body, "stride")
if ld.stride.isNil:
ld.stride = newLit(1)
ld.parseCaptures(body)
ld.forLoop = body
# Code generation
# --------------------------------------------------------
ld.loopTemplate = loopTemplate
ld.loopFnName = ident("ctt_tpParForImpl_")
ld.envName = ident("ctt_tpParForEnv_")
result.add ld.generateParallelLoop()
ld.closureName = ident("ctt_tpParForClosure_")
result.add ld.generateClosure()
ld.taskName = ident("ctt_tpParForTask_")
result.add ld.generateAndScheduleLoopTask()
# Parallel reductions
# --------------------------------------------------------------------------------------------------
proc parseReductionSection(body: NimNode):
tuple[globalAwaitable, awaitableType, reductionBody: NimNode] =
for i in 0 ..< body.len:
# parallelFor i in 0 .. n:
# reduceInto(globalSum: int64):
# prologue:
# var localSum = 0'i64
#
# StmtList
# Call
# ObjConstr
# Ident "reduceInto"
# ExprColonExpr
# Ident "globalSum"
# Ident "int64"
# StmtList
# Call
# Ident "prologue"
# StmtList
# VarSection
# IdentDefs
# Ident "localSum"
# Empty
# Int64Lit 0
if body[i].kind == nnkCall and
body[i][0].kind == nnkObjConstr and
body[i][0][0].eqident"reduceInto":
body[i][0][1].testKind(nnkExprColonExpr, Reduce)
body[i][1].testKind(nnkStmtList, Reduce)
if body[i][1].len != 4:
printReduceExample()
error "A reduction should have 4 sections named:\n" &
" prologue, forLoop, merge and epilogue statements\n"
# (globalAwaitable, awaitableType, reductionBody)
return (body[i][0][1][0], body[i][0][1][1], body[i][1])
printReduceExample()
error "Missing section \"reduceInto(globalAwaitable: awaitableType):\""
proc extractRemoteTaskMerge(ld: var LoopDescriptor, body: NimNode) =
for i in 0 ..< body.len:
if body[i].kind == nnkCall and
body[i][0].kind == nnkObjConstr and
body[i][0][0].eqident"merge":
body[i][0][1].testKind(nnkExprColonExpr, Reduce)
body[i][1].testKind(nnkStmtList, Reduce)
ld.remoteTaskAwaitable = body[i][0][1][0]
ld.mergeLocalWithRemote = body[i][1]
return
printReduceExample()
error "Missing section \"merge(remoteThreadAccumulator: Flowvar[accumulatorType]):\""
proc parallelReduceImpl*(workerContext, scheduleFn, loopTemplate, loopBounds, body: NimNode): NimNode =
## Parallel reduce loop
## Syntax:
##
## parallelFor i in 0 ..< 100:
## reduceInto(globalSum: int64):
## prologue:
## ## Initialize before the loop
## var localSum = 0
## forLoop:
## ## Compute the partial reductions
## localSum += i
## merge(remoteSum: Flowvar[int64]):
## ## Merge our local reduction with reduction from remote threads
## localSum += sync(remoteSum)
## return localSum
##
## # Await our result
## let sum = sync(globalSum)
##
## The first element from the iterator (i) in the example is not available in the prologue.
## Depending on multithreaded scheduling it may start at 0 or halfway or close to completion.
## The accumulator set in the prologue should be set at the neutral element for your fold operation:
## - 0 for addition, 1 for multiplication, +Inf for min, -Inf for max, ...
##
## In the forLoop section the iterator i is available, the number of iterations is undefined.
## The runtime chooses dynamically how many iterations are done to maximize throughput.
## - This requires your operation to be associative, i.e. (a+b)+c = a+(b+c).
## - It does not require your operation to be commutative (a+b = b+a is not needed).
## - In particular floating-point addition is NOT associative due to rounding errors,
## and results may differ between runs.
## For inputs usually in [-1,1]
## the floating point addition error is within 1e-8 (float32) or 1e-15 (float64).
## For inputs beyond 1e9, please evaluate the acceptable precision.
## Note: the main benefit of "-ffast-math" is that it considers floating-point addition
## associative
##
## In the merge section, a tuple (identifier: Flowvar[MyType]) for a partial reduction from a remote core must be passed.
## The merge section may be executed multiple times if a loop was split between many threads.
## The local partial reduction must be returned.
##
## Variables from the external scope need to be explicitly captured.
## For example, to compute the variance of a seq in parallel
##
## var s = newSeqWith(1000, rand(100.0))
## let mean = mean(s)
##
## let ps = cast[ptr UncheckedArray[float64]](s[0].addr)
##
## parallelFor i in 0 ..< s.len:
## captures: {ps, mean}
## reduceInto(globalVariance: float64):
## prologue:
## var localVariance = 0.0
## forLoop:
## localVariance += (ps[i] - mean)^2
## merge(remoteVariance: Flowvar[float64]):
## localVariance += sync(remoteVariance)
## return localVariance
##
## # Await our result
## let variance = sync(globalVariance)
##
## Performance note:
## For trivial floating points operations like addition/sum reduction:
## before parallelizing reductions on multiple cores
## you might try to parallelize it on a single core by
## creating multiple accumulators (between 2 and 4)
## and unrolling the accumulation loop by that amount.
##
## The compiler is unable to do that (without -ffast-math)
## as floating point addition is NOT associative and changing
## order will change the result due to floating point rounding errors.
##
## The performance improvement is dramatic (2x-3x) as at a low-level
## there is no data dependency between the accumulators and
## the CPU can now use instruction-level parallelism instead
## of suffering from data-dependency latency (3 or 4 cycles)
## https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE&expand=158
## The reduction becomes memory-bound instead of CPU-latency-bound.
result = newStmtList()
var ld = LoopDescriptor(kind: kReduction, workerContext: workerContext, scheduleFn: scheduleFn)
# Parse the loop Domain-Specific Language
# --------------------------------------------------------
body.testKind(nnkStmtList, Reduce)
ld.parseLoopBounds(loopBounds)
ld.stride.extractSection(body, "stride")
if ld.stride.isNil:
ld.stride = newLit(1)
ld.parseCaptures(body)
var reductionBody: NimNode
(ld.globalAwaitable, ld.awaitableType, reductionBody) = parseReductionSection(body)
ld.extractRemoteTaskMerge(reductionBody)
ld.prologue.extractSection(reductionBody, "prologue")
ld.forLoop.extractSection(reductionBody, "forLoop")
ld.epilogue.extractSection(reductionBody, "epilogue")
# Code generation
# --------------------------------------------------------
ld.loopTemplate = loopTemplate
ld.loopFnName = ident("ctt_tpParReduceImpl_")
ld.envName = ident("ctt_tpParReduceEnv_")
result.add ld.generateParallelLoop()
ld.closureName = ident("ctt_tpParReduceClosure_")
result.add ld.generateClosure()
ld.taskName = ident("ctt_tpParReduceTask_")
result.add ld.generateAndScheduleLoopTask()
# ############################################################
# #
# Parallel For Dispatchers #
# #
# ############################################################
proc hasReduceSection*(body: NimNode): bool =
for i in 0 ..< body.len:
if body[i].kind == nnkCall:
for j in 0 ..< body[i].len:
if body[i][j].kind == nnkObjConstr and body[i][j][0].eqIdent"reduceInto":
return true
return false


@ -9,7 +9,7 @@
when not compileOption("threads"):
{.error: "This requires --threads:on compilation flag".}
{.push raises: [], checks: off.}
import
std/[cpuinfo, atomics, macros],
@ -27,8 +27,14 @@ export
# flowvars
Flowvar, isSpawned, isReady, sync
# ############################################################
# #
# Types #
# #
# ############################################################
type
WorkerID = int32
Signal = object
terminate {.align: 64.}: Atomic[bool]
@ -52,26 +58,33 @@ type
# Adaptative theft policy
stealHalf: bool
recentTasks: int32
recentThefts: int32
recentTheftsAdaptative: int32
recentLeaps: int32
Threadpool* = ptr object
barrier: SyncBarrier # Barrier for initialization and teardown
# -- align: 64
globalBackoff: EventCount # Multi-Producer Multi-Consumer backoff
reserveBackoff: EventCount
# -- align: 64
numThreads*{.align: 64.}: int32 # N regular workers + N reserve workers
workerQueues: ptr UncheckedArray[Taskqueue] # size 2N
workers: ptr UncheckedArray[Thread[(Threadpool, WorkerID)]] # size 2N
workerSignals: ptr UncheckedArray[Signal] # size 2N
# -- align: 64
numIdleThreadsAwaitingFutures*{.align: 64.}: Atomic[int32]
# ############################################################
# #
# Workers #
# #
# ############################################################
var workerContext {.threadvar.}: WorkerContext
## Thread-local Worker context
## We assume that a threadpool has exclusive ownership
proc setupWorker() =
## Initialize the thread-local context of a worker
@ -79,7 +92,7 @@ proc setupWorker() =
template ctx: untyped = workerContext
preCondition: not ctx.threadpool.isNil()
preCondition: 0 <= ctx.id and ctx.id < 2*ctx.threadpool.numThreads
preCondition: not ctx.threadpool.workerQueues.isNil()
preCondition: not ctx.threadpool.workerSignals.isNil()
@ -109,7 +122,8 @@ proc teardownWorker() =
workerContext.localBackoff.`=destroy`()
workerContext.taskqueue[].teardown()
proc eventLoopRegular(ctx: var WorkerContext) {.raises:[], gcsafe.}
proc eventLoopReserve(ctx: var WorkerContext) {.raises:[], gcsafe.}
proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises: [].} =
## On the start of the threadpool workers will execute this
@ -128,19 +142,24 @@ proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises
# 1 matching barrier in Threadpool.new() for root thread
discard params.threadpool.barrier.wait()
{.cast(gcsafe).}: # Compiler does not consider that GC-safe by default when multi-threaded due to thread-local variables
if ctx.id < ctx.threadpool.numThreads:
ctx.eventLoopRegular()
else:
ctx.eventLoopReserve()
debugTermination:
log(">>> Worker %3d shutting down <<<\n", ctx.id)
# 1 matching barrier in threadpool.shutdown() for root thread
discard params.threadpool.barrier.wait()
teardownWorker()
# ############################################################
# #
# Tasks #
# #
# ############################################################
# Sentinel values
const ReadyFuture = cast[ptr EventNotifier](0xCA11AB1E)
@ -150,26 +169,26 @@ proc run*(ctx: var WorkerContext, task: ptr Task) {.raises:[].} =
## Run a task, frees it if it is not owned by a Flowvar
let suspendedTask = workerContext.currentTask
ctx.currentTask = task
debug: log("Worker %3d: running task 0x%.08x (previous: 0x%.08x, %d pending, thiefID %d)\n", ctx.id, task, suspendedTask, ctx.taskqueue[].peek(), task.thiefID)
task.fn(task.env.addr)
debug: log("Worker %3d: completed task 0x%.08x (%d pending)\n", ctx.id, task, ctx.taskqueue[].peek())
ctx.recentTasks += 1
ctx.currentTask = suspendedTask
if not task.hasFuture:
freeHeap(task)
return
# Sync with an awaiting thread in completeFuture that didn't find work
var expected = (ptr EventNotifier)(nil)
if not compareExchange(task.waiter, expected, desired = ReadyFuture, moAcquireRelease):
debug: log("Worker %3d: completed task 0x%.08x, notifying waiter 0x%.08x\n", ctx.id, task, expected)
expected[].notify()
proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.} =
## Schedule a task in the threadpool
## This wakes a sibling thread if our local queue is empty
## or forceWake is true.
debug: log("Worker %3d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
# Instead of notifying every time a task is scheduled, we notify
# only when the worker queue is empty. This is a good approximation
@ -183,10 +202,252 @@ proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.
if forceWake or wasEmpty:
ctx.threadpool.globalBackoff.wake()
# ############################################################
# #
# Parallel-loops load-balancing #
# #
# ############################################################
# Inspired by
# - Lazy binary-splitting: a run-time adaptive work-stealing scheduler.
# Tzannes, A., G. C. Caragea, R. Barua, and U. Vishkin.
# In PPoPP 10, Bangalore, India, January 2010. ACM, pp. 179190.
# https://user.eng.umd.edu/~barua/ppopp164.pdf
# - Embracing Explicit Communication in Work-Stealing Runtime Systems.
# Andreas Prell, 2016
# https://epub.uni-bayreuth.de/id/eprint/2990/
#
# Instead of splitting loops ahead of time depending on the number of cores,
# we split just-in-time depending on idle threads.
# This allows the code to lazily evaluate when it's profitable to split,
# making parallel-for performance portable to any CPU and any inner algorithm
# unlike OpenMP or TBB for example, see design.md for performance unportability benchmark numbers.
# This frees the developer from grain size / work splitting thresholds.
iterator splitUpperRanges(
ctx: WorkerContext, task: ptr Task,
curLoopIndex: int, numIdle: int32
): tuple[start, size: int] =
## Split the iteration range based on the number of idle threads
## returns chunks as (start, size) pairs
##
## - Chunks are balanced, their size differs by at most 1.
## Balanced workloads are scheduled with overhead similar to static scheduling.
## - Split is adaptative, unlike static scheduling or guided scheduling in OpenMP
## it is based on idle workers and not the number of cores.
## If enough parallelism is exposed, for example due to nested parallelism,
## there is no splitting overhead.
##
## - Updates the current task loopStop with the lower range
#
# Unbalanced example:
# Splitting 40 iterations on 12 threads
# A simple chunking algorithm with division + remainder
# will lead to a base chunk size of 40/12 = 3.
# 3*11 = 33, so last thread will do 7 iterations, more than twice the work.
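# The splitting scheme below avoids this: baseChunkSize = 40 div 12 = 3 and
# cutoff = 40 mod 12 = 4, so the first 4 chunks get 4 iterations and the
# remaining 8 get 3. Chunk sizes differ by at most 1.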
#
# Note: Each division costs 55 cycles, 55x more than addition/subtraction/bit operations
# and they can't use instruction-level parallelism.
# When dealing with loop ranges and strides we need to carefully craft our code to
# only use division where unavoidable: dividing by the number of idle threads.
# Loop metadata should allow us to avoid loop-bounds-related divisions completely.
preCondition: task.loopStepsLeft > 1
preCondition: curLoopIndex mod task.loopStride == 0
debugSplit:
log("Worker %3d: task 0x%.08x - %8d step(s) left (current: %3d, start: %3d, stop: %3d, stride: %3d, %3d idle worker(s))\n",
ctx.id, task, task.loopStepsLeft, curLoopIndex, task.loopStart, task.loopStop, task.loopStride, numIdle)
# Send a chunk of work to all idle workers + ourselves
let availableWorkers = numIdle + 1
let baseChunkSize = task.loopStepsLeft div availableWorkers
let cutoff = task.loopStepsLeft mod availableWorkers
block: # chunkID 0 is ours! My precious!!!
task.loopStepsLeft = baseChunkSize + int(0 < cutoff)
task.loopStop = min(task.loopStop, curLoopIndex + task.loopStepsLeft*task.loopStride)
debugSplit:
log("Worker %3d: task 0x%.08x - %8d step(s) kept locally (current: %3d, start: %3d, stop: %3d, stride: %3d)\n",
ctx.id, task, task.loopStepsLeft, curLoopIndex, task.loopStart, task.loopStop, task.loopStride)
for chunkID in 1 ..< availableWorkers:
# As the iterator callsite is copy-pasted, we want a single yield point.
var chunkSize = baseChunkSize
var offset = curLoopIndex
if chunkID < cutoff:
chunkSize += 1
offset += task.loopStride*chunkSize*chunkID
else:
offset += task.loopStride*(baseChunkSize*chunkID + cutoff)
yield (offset, chunkSize)
type BalancerBackoff = object
## We want to dynamically split parallel loops depending on the number of idle threads.
## However checking an atomic variable require synchronization which at the very least means
## reloading its value in all caches, a guaranteed cache miss. In a tight loop,
## this might be a significant cost, especially given that memory is often the bottleneck.
##
## There is no synchronization possible with thieves, unlike in Prell's PhD thesis.
## We want to avoid the worst-case scenario in Tzannes' paper, a tight loop with too many available cores,
## so the producer deque is always empty, leading to it spending all its CPU time splitting loops.
## For this we split depending on the number of idle CPUs. This also prevents splitting unnecessarily.
##
## Tzannes et al mention that checking the thread's own deque emptiness is a good approximation of system load
## with low overhead except in very fine-grained parallelism.
## With a better approximation, by checking the number of idle threads we can instead
## directly do the correct number of splits or avoid splitting. But this check is costly.
##
## To minimize checking cost while keeping latency low, even in bursty cases,
## we use log-log iterated backoff.
## - Adversarial Contention Resolution for Simple Channels
## Bender, Farach-Colton, He, Kuszmaul, Leiserson, 2005
## https://people.csail.mit.edu/bradley/papers/BenderFaHe05.pdf
nextCheck: int
windowLogSize: uint32 # while loopIndex < lastCheck + 2^windowLogSize, don't recheck.
round: uint32 # windowLogSize += 1 after log2(windowLogSize) rounds
func increase(backoff: var BalancerBackoff) {.inline.} =
# On failure, we use log-log iterated backoff, an optimal backoff strategy
# suitable for bursts and adversarial conditions.
backoff.round += 1
if backoff.round >= log2_vartime(backoff.windowLogSize):
backoff.round = 0
backoff.windowLogSize += 1
func decrease(backoff: var BalancerBackoff) {.inline.} =
# On success, we exponentially reduce the check window.
# Note: the thieves will start contributing as well.
# windowLogSize is unsigned: guard before decrementing instead of checking for < 0 afterwards.
backoff.round = 0
if backoff.windowLogSize > 0:
backoff.windowLogSize -= 1
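# Worked example (illustrative numbers): with stride 1 and windowLogSize = 3,
# the next check happens 2^3 = 8 iterations later (nextCheck += stride shl windowLogSize).
# A successful split shrinks the window to 4 iterations, while repeated failed
# checks eventually grow it to 16, bounding both polling cost and split latency.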
proc splitAndDispatchLoop(ctx: var WorkerContext, task: ptr Task, curLoopIndex: int, numIdle: int32) =
# The iterator mutates the task with the first chunk metadata
let stop = task.loopStop
for (offset, numSteps) in ctx.splitUpperRanges(task, curLoopIndex, numIdle):
if numSteps == 0:
break
let upperSplit = allocHeapUnchecked(Task, sizeof(Task) + task.envSize)
copyMem(upperSplit, task, sizeof(Task) + task.envSize)
upperSplit.parent = task
upperSplit.thiefID.store(SentinelThief, moRelaxed)
upperSplit.waiter.store(nil, moRelaxed)
upperSplit.isFirstIter = false
upperSplit.loopStart = offset
upperSplit.loopStop = min(stop, offset + numSteps*upperSplit.loopStride)
upperSplit.loopStepsLeft = numSteps
if upperSplit.hasFuture:
# Update self-reference
cast[ptr ptr Task](upperSplit.env.addr)[] = upperSplit
# Create a private task-local linked-list of awaited tasks
task.reductionDAG = newReductionDagNode(task = upperSplit, next = task.reductionDAG)
upperSplit.reductionDAG = nil
debugSplit:
log("Worker %3d: task 0x%.08x - %8d step(s) sent in task 0x%.08x (start: %3d, stop: %3d, stride: %3d)\n",
ctx.id, task, upperSplit.loopStepsLeft, upperSplit, upperSplit.loopStart, upperSplit.loopStop, upperSplit.loopStride)
ctx.taskqueue[].push(upperSplit)
ctx.threadpool.globalBackoff.wakeAll()
proc loadBalanceLoop(ctx: var WorkerContext, task: ptr Task, curLoopIndex: int, backoff: var BalancerBackoff) =
## Split a parallel loop when necessary
# We might want to make this inline to cheapen the first check
# but it is 10% faster not inline on the transpose benchmark (memory-bandwidth bound)
if task.loopStepsLeft > 1 and curLoopIndex == backoff.nextCheck:
if ctx.taskqueue[].peek() == 0:
let waiters = ctx.threadpool.globalBackoff.getNumWaiters()
# We assume that the worker that scheduled the task will work on it. I.e. idleness is underestimated.
let numIdle = waiters.preSleep + waiters.committedSleep + int32(task.isFirstIter)
if numIdle > 0:
ctx.splitAndDispatchLoop(task, curLoopIndex, numIdle)
backoff.decrease()
else:
backoff.increase()
else:
backoff.increase()
backoff.nextCheck += task.loopStride shl backoff.windowLogSize
template parallelForWrapper(idx: untyped{ident}, loopBody: untyped): untyped =
## To be called within a loop task
## Gets the loop bounds and iterates over them
## Also polls runtime status for dynamic loop splitting
##
## Loop prologue, epilogue,
## remoteAccum, resultTy and returnStmt
## are unused
block:
let this = workerContext.currentTask
var backoff = BalancerBackoff(
nextCheck: this.loopStart,
windowLogSize: 0,
round: 0)
if not this.isFirstIter:
# Task was just stolen, no need to check runtime status, do one loop iteration first
backoff.nextCheck += this.loopStride
var idx = this.loopStart
while idx < this.loopStop:
loadBalanceLoop(workerContext, this, idx, backoff)
loopBody
idx += this.loopStride
this.loopStepsLeft -= 1
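# Illustrative trace (assumed numbers): a first-iteration task covering 0 ..< 1000
# with stride 1 starts with nextCheck = 0. At idx 0 the worker polls for idle
# threads, keeps the lowest chunk for itself and pushes the upper ranges, then
# streams through its remaining iterations, re-polling only when idx reaches the
# backed-off nextCheck.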
template parallelReduceWrapper(
idx: untyped{ident},
prologue, loopBody, mergeLocalWithRemote, epilogue,
remoteTaskAwaitable, awaitableType: untyped): untyped =
## To be called within a loop task
## Gets the loop bounds and iterates over them
## Also polls runtime status for dynamic loop splitting
block:
let this = workerContext.currentTask
var backoff = BalancerBackoff(
nextCheck: this.loopStart,
windowLogSize: 0,
round: 0
)
if not this.isFirstIter:
# Task was just stolen, no need to check runtime status, do one loop iteration first
backoff.nextCheck += this.loopStride
prologue
block: # loop body
var idx = this.loopStart
while idx < this.loopStop:
loadBalanceLoop(workerContext, this, idx, backoff)
loopBody
idx += this.loopStride
this.loopStepsLeft -= 1
block: # Merging with flowvars from remote threads
while not this.reductionDAG.isNil:
let reductionDagNode = this.reductionDAG
let remoteTaskAwaitable = cast[Flowvar[awaitableType]](reductionDagNode.task)
this.reductionDAG = reductionDagNode.next
mergeLocalWithRemote
# In `merge` there should be a sync which frees `reductionDagNode.task`
freeHeap(reductionDagNode)
epilogue
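# Illustrative merge order (assuming the loop was split twice): the task-local
# reductionDAG then holds two nodes, one per upper split, prepended in order of
# creation. The merge block syncs each split's Flowvar in LIFO order, frees the
# DAG node, and only then runs the epilogue on the fully merged local accumulator.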
# ############################################################
# #
# Scheduler #
# #
# ############################################################
iterator pseudoRandomPermutation(randomSeed: uint32, maxExclusive: int32): int32 =
## Create a (low-quality) pseudo-random permutation from [0, max)
# Design considerations and randomness constraint for work-stealing, see docs/random_permutations.md
#
@ -216,6 +477,7 @@ iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
# - a != 1, so we now have a multiplicative factor, which makes output more "random looking".
# n and (m-1) <=> n mod m, if m is a power of 2
let maxExclusive = cast[uint32](maxExclusive)
let M = maxExclusive.nextPowerOfTwo_vartime()
let c = (randomSeed and ((M shr 1) - 1)) * 2 + 1 # c odd and c ∈ [0, M)
let a = (randomSeed and ((M shr 2) - 1)) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4
@ -226,7 +488,7 @@ iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
var x = start
while true:
if x < maxExclusive:
yield cast[int32](x)
x = (a*x + c) and mask # ax + c (mod M), with M power of 2
if x == start:
break
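# Worked example (illustrative): maxExclusive = 6 gives M = 8; an odd c and
# a ≡ 1 (mod 4) satisfy the Hull-Dobell theorem, so x -> a*x + c (mod 8) visits
# all 8 residues exactly once per cycle. Values >= 6 are skipped, hence every
# ID in [0, 6) is yielded exactly once, in a seed-dependent order.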
@ -234,7 +496,7 @@ iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
proc tryStealOne(ctx: var WorkerContext): ptr Task =
## Try to steal a task.
let seed = ctx.rng.next().uint32
for targetId in seed.pseudoRandomPermutation(2*ctx.threadpool.numThreads):
if targetId == ctx.id:
continue
@ -279,7 +541,7 @@ proc tryStealAdaptative(ctx: var WorkerContext): ptr Task =
# ctx.updateStealStrategy()
let seed = ctx.rng.next().uint32
for targetId in seed.pseudoRandomPermutation(2*ctx.threadpool.numThreads):
if targetId == ctx.id:
continue
@ -308,14 +570,14 @@ proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
var thiefID = SentinelThief
while true:
debug: log("Worker %3d: leapfrogging - waiting for thief of task 0x%.08x to publish their ID\n", ctx.id, awaitedTask)
thiefID = awaitedTask.thiefID.load(moAcquire)
if thiefID != SentinelThief:
break
cpuRelax()
ascertain: 0 <= thiefID and thiefID < 2*ctx.threadpool.numThreads
# Leapfrogging is used when completing a future, so steal only one task
# and don't leave tasks stranded in our queue.
let leapTask = ctx.id.steal(ctx.threadpool.workerQueues[thiefID])
if not leapTask.isNil():
@ -325,37 +587,100 @@ proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
return leapTask
return nil
proc eventLoopRegular(ctx: var WorkerContext) {.raises:[], gcsafe.} =
## Each worker thread executes this loop over and over.
while true:
# 1. Pick from local queue
debug: log("Worker %3d: eventLoopRegular 1 - searching task from local queue\n", ctx.id)
while (var task = ctx.taskqueue[].pop(); not task.isNil):
debug: log("Worker %3d: eventLoopRegular 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
# 2. Run out of tasks, become a thief
debug: log("Worker %3d: eventLoopRegular 2 - becoming a thief\n", ctx.id)
let ticket = ctx.threadpool.globalBackoff.sleepy()
if (var stolenTask = ctx.tryStealAdaptative(); not stolenTask.isNil):
# We manage to steal a task, cancel sleep
ctx.threadpool.globalBackoff.cancelSleep()
# 2.a Run task
debug: log("Worker %3d: eventLoopRegular 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
ctx.run(stolenTask)
elif ctx.signal.terminate.load(moAcquire):
# 2.b Threadpool has no more tasks and we were signaled to terminate
ctx.threadpool.globalBackoff.cancelSleep()
debugTermination: log("Worker %3d: eventLoopRegular 2.b - terminated\n", ctx.id)
break
else:
# 2.c Park the thread until a new task enters the threadpool
debug: log("Worker %3d: eventLoopRegular 2.c - sleeping\n", ctx.id)
ctx.threadpool.globalBackoff.sleep(ticket)
debug: log("Worker %3d: eventLoopRegular 2.c - waking\n", ctx.id)
# Sync
# ---------------------------------------------
proc eventLoopReserve(ctx: var WorkerContext) {.raises:[], gcsafe.} =
## A reserve worker is a relay when a thread is stuck awaiting a future completion.
## This ensures those threads are available as soon as the future completes, minimizing latency
## while ensuring the runtime uses all available hardware resources, maximizing throughput.
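##
## Illustrative scenario (assuming a 4-core machine, i.e. 4 regular + 4 reserve workers):
## when a regular worker blocks in completeFuture it increments
## numIdleThreadsAwaitingFutures and wakes a reservist, keeping about 4 threads
## runnable. Once the future completes, the surplus reservist observes
## numActiveReservists > numIdleThreadsAwaitingFutures and goes back to sleep.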
template reserveSleepCheck: untyped =
let ticket = ctx.threadpool.reserveBackoff.sleepy()
let (reservePlanningSleep, reserveCommittedSleep) = ctx.threadpool.reserveBackoff.getNumWaiters()
let numActiveReservists = ctx.threadpool.numThreads - (reservePlanningSleep-1 + reserveCommittedSleep) # -1 we don't want to count ourselves
if ctx.signal.terminate.load(moAcquire): # If terminated, we leave everything as-is, the regular workers will finish
ctx.threadpool.reserveBackoff.cancelSleep()
debugTermination: log("Worker %3d: reserveSleepCheck - terminated\n", ctx.id)
return
elif numActiveReservists > ctx.threadpool.numIdleThreadsAwaitingFutures.load(moAcquire):
ctx.threadpool.globalBackoff.wake() # In case we were just woken up for a task or we have tasks in our queue, pass the torch
debug: log("Worker %3d: reserveSleepCheck - going to sleep on reserve backoff\n", ctx.id)
ctx.threadpool.reserveBackoff.sleep(ticket)
debug: log("Worker %3d: reserveSleepCheck - waking on reserve backoff\n", ctx.id)
else:
ctx.threadpool.reserveBackoff.cancelSleep()
while true:
# 1. Pick from local queue
debug: log("Worker %3d: eventLoopReserve 1 - searching task from local queue\n", ctx.id)
while true:
reserveSleepCheck()
var task = ctx.taskqueue[].pop()
if task.isNil():
break
debug: log("Worker %3d: eventLoopReserve 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
# 2. Run out of tasks, become a thief
debug: log("Worker %3d: eventLoopReserve 2 - becoming a thief\n", ctx.id)
let ticket = ctx.threadpool.globalBackoff.sleepy() # If using a reserve worker was necessary, sleep on the backoff for active threads
if (var stolenTask = ctx.tryStealAdaptative(); not stolenTask.isNil):
# We manage to steal a task, cancel sleep
ctx.threadpool.globalBackoff.cancelSleep()
# 2.a Run task
debug: log("Worker %3d: eventLoopReserve 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
ctx.run(stolenTask)
elif ctx.signal.terminate.load(moAcquire):
# 2.b Threadpool has no more tasks and we were signaled to terminate
ctx.threadpool.globalBackoff.cancelSleep()
debugTermination: log("Worker %3d: eventLoopReserve 2.b - terminated\n", ctx.id)
break
else:
# 2.c Park the thread until a new task enters the threadpool.
# It is intentionally parked with all active threads as long as a reservist is needed
let (reservePlanningSleep, reserveCommittedSleep) = ctx.threadpool.reserveBackoff.getNumWaiters()
let numActiveReservists = ctx.threadpool.numThreads - (reservePlanningSleep-1 + reserveCommittedSleep) # -1 we don't want to count ourselves
if numActiveReservists > ctx.threadpool.numIdleThreadsAwaitingFutures.load(moAcquire):
ctx.threadpool.globalBackoff.cancelSleep()
continue
debug: log("Worker %3d: eventLoopReserve 2.c - sleeping on active threads backoff\n", ctx.id)
ctx.threadpool.globalBackoff.sleep(ticket)
debug: log("Worker %3d: eventLoopReserve 2.c - waking on active threads backoff\n", ctx.id)
# ############################################################
# #
# Futures & Synchronization #
# #
# ############################################################
template isRootTask(task: ptr Task): bool =
task == RootTask
@ -367,7 +692,7 @@ proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[].} =
template isFutReady(): untyped =
let isReady = fv.task.completed.load(moAcquire)
if isReady:
parentResult = cast[ptr (ptr Task, T)](fv.task.env.addr)[1]
isReady
if isFutReady():
@ -375,55 +700,55 @@ proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[].} =
## 1. Process all the children of the current tasks.
## This ensures that we can give control back ASAP.
debug: log("Worker %3d: sync 1 - searching task from local queue\n", ctx.id)
while (let task = ctx.taskqueue[].pop(); not task.isNil):
if task.parent != ctx.currentTask:
debug: log("Worker %3d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.schedule(task, forceWake = true) # reschedule task and wake a sibling to take it over.
break
debug: log("Worker %3d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
if isFutReady():
debug: log("Worker %3d: sync 1 - future ready, exiting\n", ctx.id)
return
## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
## So the task is bottlenecked by dependencies in other threads,
## hence we abandon our enqueued work and steal.
##
## See also
## - Proactive work-stealing for futures
## Kyle Singer, Yifan Xu, I-Ting Angelina Lee, 2019
## https://dl.acm.org/doi/10.1145/3293883.3295735
debug: log("Worker %3d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
while not isFutReady():
if (let leapTask = ctx.tryLeapfrog(fv.task); not leapTask.isNil):
# Leapfrogging, the thief had an empty queue, hence if there are tasks in its queue, it's generated by our blocked task.
# Help the thief clear those, as if it did not finish, it's likely blocked on those child tasks.
debug: log("Worker %3d: sync 2.1 - leapfrog task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, leapTask, leapTask.parent, ctx.currentTask)
ctx.run(leapTask)
else:
# At this point, we have significant design decisions:
# - Do we steal from other workers in hope we advance our awaited task?
# - Do we advance our own queue for tasks that are not child of our awaited tasks?
# - Do we park instead of working on unrelated tasks? With hyperthreading, the core would actually still be busy enough.
#
# - If we work, we maximize throughput, but we increase latency to handle the future's continuation.
# If that future creates more parallel work, we would actually have restricted parallelism.
# - If we park, we minimize latency, but we don't use the full hardware resources, and there are CPUs without hyperthreading (on ARM for example)
# Furthermore, a work-stealing scheduler is within 2x an optimal scheduler if it is greedy, i.e., as long as there is enough work, all cores are used.
#
# The solution chosen is to wake a reserve thread, keeping offered hardware/throughput constant, and to put the awaiting thread to sleep.
ctx.localBackoff.prepareToPark()
discard ctx.threadpool.numIdleThreadsAwaitingFutures.fetchAdd(1, moRelease)
ctx.threadpool.reserveBackoff.wake()
var expected = (ptr EventNotifier)(nil)
if compareExchange(fv.task.waiter, expected, desired = ctx.localBackoff.addr, moAcquireRelease):
ctx.localBackoff.park()
discard ctx.threadpool.numIdleThreadsAwaitingFutures.fetchSub(1, moRelease)
proc syncAll*(tp: Threadpool) {.raises: [].} =
## Blocks until all pending tasks are completed
## This MUST only be called from
@ -431,64 +756,76 @@ proc syncAll*(tp: Threadpool) {.raises: [].} =
template ctx: untyped = workerContext
debugTermination:
log(">>> Worker %3d enters barrier <<<\n", ctx.id)
preCondition: ctx.id == 0
preCondition: ctx.currentTask.isRootTask()
# Empty all tasks
tp.globalBackoff.wakeAll()
while true:
# 1. Empty local tasks
debug: log("Worker %3d: syncAll 1 - searching task from local queue\n", ctx.id)
while (let task = ctx.taskqueue[].pop(); not task.isNil):
debug: log("Worker %3d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
if tp.numThreads == 1:
break
# 2. Help other threads
debugTermination:
let regular = tp.globalBackoff.getNumWaiters()
let reserve = tp.reserveBackoff.getNumWaiters()
log("Worker %3d: syncAll 2 - becoming a thief - (preSleep: %d, sleeping %d) regular and (preSleep: %d, sleeping %d) reserve workers\n",
ctx.id, regular.preSleep, regular.committedSleep, reserve.preSleep, reserve.committedSleep)
if (var stolenTask = ctx.tryStealAdaptative(); not stolenTask.isNil):
# 2.a We stole some task
debug: log("Worker %3d: syncAll 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
ctx.run(stolenTask)
elif tp.reserveBackoff.getNumWaiters() == (0'i32, tp.numThreads) and
tp.globalBackoff.getNumWaiters() == (0'i32, tp.numThreads-1): # Don't count ourselves
# 2.b all threads besides the current are parked
debugTermination: log("Worker %3d: syncAll 2.b - termination, all other threads sleeping\n", ctx.id)
break
else:
# 2.c We don't park as there is no notif for task completion
cpuRelax()
debugTermination:
log(">>> Worker %3d leaves barrier <<<\n", ctx.id)
# ############################################################
# #
# Runtime API #
# #
# ############################################################
proc new*(T: type Threadpool, numThreads = countProcessors()): T {.raises: [ResourceExhaustedError].} =
## Initialize a threadpool that manages `numThreads` threads.
## Default to the number of logical processors available.
##
## A Constantine threadpool cannot be instantiated
## on a thread managed by another Constantine threadpool,
## including the root thread.
##
## Mixing with other libraries' threadpools and runtime
## will not impact correctness but may impact performance.
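##
## Illustrative usage sketch (`fib` is a hypothetical user proc):
##
##   var tp = Threadpool.new()    # 2*countProcessors() threads: N regular + N reserve
##   let fut = tp.spawn fib(30)
##   echo sync(fut)
##   tp.shutdown()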
type TpObj = typeof(default(Threadpool)[]) # due to C import, we need a dynamic sizeof
var tp = allocHeapUncheckedAlignedPtr(Threadpool, sizeof(TpObj), alignment = 64)
tp.barrier.init(2*numThreads.uint32)
tp.globalBackoff.initialize()
tp.reserveBackoff.initialize()
tp.numThreads = numThreads.int32
tp.numIdleThreadsAwaitingFutures.store(0, moRelaxed)
# Allocate for `numThreads` regular workers and `numThreads` reserve workers
tp.workerQueues = allocHeapArrayAligned(Taskqueue, 2*numThreads, alignment = 64)
tp.workers = allocHeapArrayAligned(Thread[(Threadpool, WorkerID)], 2*numThreads, alignment = 64)
tp.workerSignals = allocHeapArrayAligned(Signal, 2*numThreads, alignment = 64)
# Setup master thread
workerContext.id = 0
workerContext.threadpool = tp
# Start worker threads
for i in 1 ..< 2*numThreads:
createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
# Root worker
@ -505,7 +842,7 @@ proc cleanup(tp: var Threadpool) {.raises: [].} =
## Cleanup all resources allocated by the threadpool
preCondition: workerContext.currentTask.isRootTask()
for i in 1 ..< 2*tp.numThreads:
joinThread(tp.workers[i])
tp.workerSignals.freeHeapAligned()
@ -522,10 +859,11 @@ proc shutdown*(tp: var Threadpool) {.raises:[].} =
tp.syncAll()
# Signal termination to all threads
for i in 0 ..< 2*tp.numThreads:
tp.workerSignals[i].terminate.store(true, moRelease)
tp.globalBackoff.wakeAll()
tp.reserveBackoff.wakeAll()
# 1 matching barrier in workerEntryFn
discard tp.barrier.wait()
@ -538,6 +876,18 @@ proc shutdown*(tp: var Threadpool) {.raises:[].} =
{.pop.} # raises:[]
# ############################################################
# #
# Parallel API #
# #
# ############################################################
proc getThreadID(tp: Threadpool): int {.inline, used.} =
## Returns the worker local ID.
## This is a debug proc for logging purposes
## The threadpool needs to be imported with {.all.} pragma
workerContext.id
# Task parallel API
# ---------------------------------------------
@ -551,3 +901,44 @@ macro spawn*(tp: Threadpool, fnCall: typed): untyped =
##
## Tasks are processed approximately in Last-In-First-Out (LIFO) order
result = spawnImpl(tp, fnCall, bindSym"workerContext", bindSym"schedule")
# Data parallel API
# ---------------------------------------------
# TODO: we can fuse parallelFor and parallelForStrided
# in a single macro once {.experimental: "flexibleOptionalParams".}
# is no longer experimental
macro parallelFor*(tp: Threadpool, loopParams: untyped, body: untyped): untyped =
## Parallel for loop.
## Syntax:
##
## tp.parallelFor i in 0 ..< 10:
## echo(i)
##
## Variables from the external scope need to be explicitly captured:
##
## var a = 100
## var b = 10
## tp.parallelFor i in 0 ..< 10:
## captures: {a, b}
## echo a + b + i
##
result = newStmtList()
result.add quote do:
# Avoid integer overflow checks in the tight loop
# and ensure no exceptions are raised in this code.
{.push checks:off.}
if body.hasReduceSection():
result.add parallelReduceImpl(
bindSym"workerContext", bindSym"schedule",
bindSym"parallelReduceWrapper",
loopParams, body)
else:
result.add parallelForImpl(
bindSym"workerContext", bindSym"schedule",
bindSym"parallelForWrapper",
loopParams, body)
result.add quote do:
{.pop.}
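The hasReduceSection branch routes loops with a reduction into parallelReduceImpl. A sketch of such a loop; the section names below follow the Weave-style reduction DSL this threadpool descends from and should be treated as assumptions:

var waitableSum: Flowvar[int64]

tp.parallelFor i in 0 ..< 1000:
  reduceInto(waitableSum: int64):
    prologue:
      var localSum = 0'i64
    forLoop:
      localSum += int64(i)
    merge(remoteSum: Flowvar[int64]):
      localSum += sync(remoteSum)
    epilogue:
      return localSum

echo sync(waitableSum) # 0 + 1 + ... + 999 == 499500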

View File

@ -247,7 +247,7 @@ func random_long01Seq(rng: var RngState, a: var BigInt) =
## Initialize a bigint
## It is skewed towards producing strings of 1111... and 0000
## to trigger edge cases
var buf: array[(a.bits + 7) div 8, byte]
var buf: array[a.bits.ceilDiv_vartime(8), byte]
rng.random_long01Seq(buf)
let order = rng.sample_unsafe([bigEndian, littleEndian])
if order == bigEndian:
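This and the following hunks all replace the open-coded (bits + 7) div 8 with ceilDiv_vartime, a ceiling division whose timing may depend on its operands (hence _vartime). The underlying identity, ceil(a/b) == (a + b - 1) div b for positive integers, in two worked instances:

doAssert (381 + 7) div 8 == 48 # a 381-bit BLS12-381 element fits in 48 bytes
doAssert (256 + 7) div 8 == 32 # a 256-bit integer fits in 32 bytes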

View File

@ -12,7 +12,8 @@ import
std/[unittest, times],
# Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir],
../../constantine/math/config/[curves, type_bigint],
../../constantine/platforms/static_for,
../../constantine/math/config/curves,
../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia,
@ -34,8 +35,7 @@ proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Curve, wordS
fpBits = uint32 curve.getCurveBitwidth(),
fpMod = curve.Mod().toHex(),
frBits = uint32 curve.getCurveOrderBitwidth(),
frMod = curve.getCurveOrder().toHex()
)
frMod = curve.getCurveOrder().toHex())
proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
let fpAdd = asy.field_add_gen(cm, fp)

View File

@ -107,8 +107,8 @@ proc main() =
#########################################################
# Conversion to GMP
const aLen = (aBits + 7) div 8
const mLen = (mBits + 7) div 8
const aLen = aBits.ceilDiv_vartime(8)
const mLen = mBits.ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var mBuf: array[mLen, byte]

View File

@ -78,8 +78,8 @@ proc main() =
#########################################################
# Conversion to GMP
const aLen = (aBits + 7) div 8
const bLen = (bBits + 7) div 8
const aLen = aBits.ceilDiv_vartime(8)
const bLen = bBits.ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var bBuf: array[bLen, byte]

View File

@ -71,8 +71,8 @@ proc main() =
#########################################################
# Conversion to GMP
const aLen = (aBits + 7) div 8
const bLen = (bBits + 7) div 8
const aLen = aBits.ceilDiv_vartime(8)
const bLen = bBits.ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var bBuf: array[bLen, byte]

View File

@ -62,8 +62,8 @@ proc binary_prologue[C: static Curve, N: static int](
#########################################################
# Conversion to GMP
const aLen = (C.getCurveBitwidth() + 7) div 8
const bLen = (C.getCurveBitwidth() + 7) div 8
const aLen = C.getCurveBitwidth().ceilDiv_vartime(8)
const bLen = C.getCurveBitwidth().ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var bBuf: array[bLen, byte]
@ -118,7 +118,7 @@ proc addTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]
@ -141,7 +141,7 @@ proc subTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]
@ -169,7 +169,7 @@ proc mulTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]
@ -193,7 +193,7 @@ proc invTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]

View File

@ -42,7 +42,7 @@ proc testRoundtrip(curve: static Curve, gen: static RandomGen) =
const bits = curve.getCurveBitwidth()
const Excess = 2
const UnsatBitwidth = WordBitWidth - Excess
const N = (bits + UnsatBitwidth-1) div UnsatBitwidth
const N = bits.ceilDiv_vartime(UnsatBitwidth)
let a = rng.random_bigint(curve, gen)
var u: LimbsUnsaturated[N, Excess]
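To make the constants concrete, assuming WordBitWidth = 64: with Excess = 2, each unsaturated limb carries UnsatBitwidth = 62 data bits, so a 381-bit value needs N = ceilDiv(381, 62) limbs:

doAssert (381 + 61) div 62 == 7 # 7 limbs x 62 bits = 434 bits >= 381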