initial commit
This commit is contained in:
commit
216aabe629
|
@ -0,0 +1,4 @@
|
||||||
|
nimcache/
|
||||||
|
|
||||||
|
# Executables shall be put in an ignored build/ directory
|
||||||
|
build/
|
|
@ -0,0 +1,35 @@
|
||||||
|
# Taskpools
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
The API spec follows https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This implements a lightweight, energy-efficient, easily auditable multithreaded taskpools.
|
||||||
|
|
||||||
|
This taskpools will be used in a highly security-sensitive blockchain application
|
||||||
|
targeted at resource-restricted devices hence desirable properties are:
|
||||||
|
|
||||||
|
- Ease of auditing and maintenance.
|
||||||
|
- Formally verified synchronization primitives are highly-sought after.
|
||||||
|
- Otherwise primitives are implemented from papers or ported from proven codebases
|
||||||
|
that can serve as reference for auditors.
|
||||||
|
- Resource-efficient. Threads spindown to save power, low memory use.
|
||||||
|
- Decent performance and scalability. The workload to parallelize are cryptography-related
|
||||||
|
and require at least 1ms runtime per thread.
|
||||||
|
This means that only a simple scheduler is required.
|
||||||
|
|
||||||
|
Non-goals:
|
||||||
|
- Supporting task priorities
|
||||||
|
- Being distributed
|
||||||
|
- Supporting GC-ed memory on Nim default GC (sequences and strings)
|
||||||
|
- Have async-awaitable tasks
|
||||||
|
|
||||||
|
In particular compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs:
|
||||||
|
- Taskpools only provide spawn/sync (task parallelism).\
|
||||||
|
There is no parallel for (data parallelism)\
|
||||||
|
or precise in/out dependencies (dataflow parallelism).
|
||||||
|
- Weave can handle trillions of small tasks that require only 10µs per task. (Load Balancing overhead)
|
||||||
|
- Weave maintains an adaptive memory pool to reduce memory allocation overhead,
|
||||||
|
Taskpools allocations are as-needed. (Scheduler overhead)
|
|
@ -0,0 +1,11 @@
|
||||||
|
# BPC (Bouncing Producer-Consumer)
|
||||||
|
|
||||||
|
From [tasking-2.0](https://github.com/aprell/tasking-2.0) description
|
||||||
|
|
||||||
|
> **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far
|
||||||
|
> as I know, first described by [Dinan et al][1]. There are two types of
|
||||||
|
> tasks, producer and consumer tasks. Each producer task creates another
|
||||||
|
> producer task followed by *n* consumer tasks, until a certain depth *d* is
|
||||||
|
> reached. Consumer tasks run for *t* microseconds. The smaller the values of
|
||||||
|
> *n* and *t*, the harder it becomes to exploit the available parallelism. A
|
||||||
|
> solid contender for the most antagonistic microbenchmark.
|
|
@ -0,0 +1,156 @@
|
||||||
|
import
|
||||||
|
# STD lib
|
||||||
|
os, strutils, system/ansi_c, cpuinfo, strformat, math,
|
||||||
|
# Library
|
||||||
|
../../taskpools,
|
||||||
|
# bench
|
||||||
|
../wtime, ../resources
|
||||||
|
|
||||||
|
var
|
||||||
|
Depth: int32 # For example 10000
|
||||||
|
NumTasksPerDepth: int32 # For example 9
|
||||||
|
# The total number of tasks in the BPC benchmark is
|
||||||
|
# (NumTasksPerDepth + 1) * Depth
|
||||||
|
NumTasksTotal: int32
|
||||||
|
TaskGranularity: int32 # in microseconds
|
||||||
|
PollInterval: float64 # in microseconds
|
||||||
|
|
||||||
|
tp: Taskpool
|
||||||
|
|
||||||
|
var global_poll_elapsed {.threadvar.}: float64
|
||||||
|
|
||||||
|
template dummy_cpt(): untyped =
|
||||||
|
# Dummy computation
|
||||||
|
# Calculate fib(30) iteratively
|
||||||
|
var
|
||||||
|
fib = 0
|
||||||
|
f2 = 0
|
||||||
|
f1 = 1
|
||||||
|
for i in 2 .. 30:
|
||||||
|
fib = f1 + f2
|
||||||
|
f2 = f1
|
||||||
|
f1 = fib
|
||||||
|
|
||||||
|
proc bpc_consume(usec: int32) =
|
||||||
|
|
||||||
|
var pollElapsed = 0'f64
|
||||||
|
|
||||||
|
let start = wtime_usec()
|
||||||
|
let stop = usec.float64
|
||||||
|
global_poll_elapsed = PollInterval
|
||||||
|
|
||||||
|
while true:
|
||||||
|
var elapsed = wtime_usec() - start
|
||||||
|
elapsed -= pollElapsed
|
||||||
|
if elapsed >= stop:
|
||||||
|
break
|
||||||
|
|
||||||
|
dummy_cpt()
|
||||||
|
|
||||||
|
# if elapsed >= global_poll_elapsed:
|
||||||
|
# let pollStart = wtime_usec()
|
||||||
|
# loadBalance(Weave)
|
||||||
|
# pollElapsed += wtime_usec() - pollStart
|
||||||
|
# global_poll_elapsed += PollInterval
|
||||||
|
|
||||||
|
proc bpc_consume_nopoll(usec: int32) =
|
||||||
|
|
||||||
|
let start = wtime_usec()
|
||||||
|
let stop = usec.float64
|
||||||
|
|
||||||
|
while true:
|
||||||
|
var elapsed = wtime_usec() - start
|
||||||
|
if elapsed >= stop:
|
||||||
|
break
|
||||||
|
|
||||||
|
dummy_cpt()
|
||||||
|
|
||||||
|
proc bpc_produce(n, d: int32) =
|
||||||
|
if d > 0:
|
||||||
|
# Create producer task
|
||||||
|
tp.spawn bpc_produce(n, d-1)
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Followed by n consumer tasks
|
||||||
|
for i in 0 ..< n:
|
||||||
|
tp.spawn bpc_consume(TaskGranularity)
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
Depth = 10000
|
||||||
|
NumTasksPerDepth = 999
|
||||||
|
TaskGranularity = 1
|
||||||
|
|
||||||
|
if paramCount() == 0:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <depth: {Depth}> " &
|
||||||
|
&"<# of tasks per depth: {NumTasksPerDepth}> " &
|
||||||
|
&"[task granularity (us): {TaskGranularity}] " &
|
||||||
|
&"[polling interval (us): task granularity]"
|
||||||
|
echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
|
||||||
|
if paramCount() >= 1:
|
||||||
|
Depth = paramStr(1).parseInt.int32
|
||||||
|
if paramCount() >= 2:
|
||||||
|
NumTasksPerDepth = paramStr(2). parseInt.int32
|
||||||
|
if paramCount() >= 3:
|
||||||
|
TaskGranularity = paramStr(3). parseInt.int32
|
||||||
|
if paramCount() == 4:
|
||||||
|
PollInterval = paramStr(4).parseInt.float64
|
||||||
|
else:
|
||||||
|
PollInterval = TaskGranularity.float64
|
||||||
|
if paramCount() > 4:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <depth: {Depth}> " &
|
||||||
|
&"<# of tasks per depth: {NumTasksPerDepth}> " &
|
||||||
|
&"[task granularity (us): {TaskGranularity}] " &
|
||||||
|
&"[polling interval (us): task granularity]"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
NumTasksTotal = (NumTasksPerDepth + 1) * Depth
|
||||||
|
|
||||||
|
var nthreads: int
|
||||||
|
if existsEnv"TASKPOOL_NUM_THREADS":
|
||||||
|
nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
|
||||||
|
else:
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
tp = Taskpool.new(numThreads = nthreads)
|
||||||
|
|
||||||
|
# measure overhead during tasking
|
||||||
|
var ru: Rusage
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
var
|
||||||
|
rss = ru.ru_maxrss
|
||||||
|
flt = ru.ru_minflt
|
||||||
|
|
||||||
|
let start = wtime_msec()
|
||||||
|
|
||||||
|
bpc_produce(NumTasksPerDepth, Depth)
|
||||||
|
tp.syncAll()
|
||||||
|
|
||||||
|
let stop = wtime_msec()
|
||||||
|
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
rss = ru.ru_maxrss - rss
|
||||||
|
flt = ru.ru_minflt - flt
|
||||||
|
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
echo "--------------------------------------------------------------------------"
|
||||||
|
echo "Scheduler: Taskpool"
|
||||||
|
echo "Benchmark: BPC (Bouncing Producer-Consumer)"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
echo "Time(ms) ", round(stop - start, 3)
|
||||||
|
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||||
|
echo "Runtime RSS (KB): ", rss
|
||||||
|
echo "# of page faults: ", flt
|
||||||
|
echo "--------------------------------------------------------------------------"
|
||||||
|
echo "# of tasks: ", NumTasksTotal
|
||||||
|
echo "# of tasks/depth: ", NumTasksPerDepth
|
||||||
|
echo "Depth: ", Depth
|
||||||
|
echo "Task granularity (us): ", TaskGranularity
|
||||||
|
echo "Polling / manual load balancing interval (us): ", PollInterval
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,85 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Stdlib
|
||||||
|
system/ansi_c, strformat, os, strutils, cpuinfo,
|
||||||
|
# Weave
|
||||||
|
../../weave
|
||||||
|
when not defined(windows):
|
||||||
|
# bench
|
||||||
|
import ../wtime
|
||||||
|
|
||||||
|
proc dfs(depth, breadth: int): uint32 =
|
||||||
|
if depth == 0:
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# We could use alloca to avoid heap allocation here
|
||||||
|
var sums = newSeq[Flowvar[uint32]](breadth)
|
||||||
|
|
||||||
|
for i in 0 ..< breadth:
|
||||||
|
sums[i] = spawn dfs(depth - 1, breadth)
|
||||||
|
|
||||||
|
for i in 0 ..< breadth:
|
||||||
|
result += sync(sums[i])
|
||||||
|
|
||||||
|
proc test(depth, breadth: int): uint32 =
|
||||||
|
result = sync spawn dfs(depth, breadth)
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
|
||||||
|
var
|
||||||
|
depth = 8
|
||||||
|
breadth = 8
|
||||||
|
answer: uint32
|
||||||
|
nthreads: int
|
||||||
|
|
||||||
|
if existsEnv"WEAVE_NUM_THREADS":
|
||||||
|
nthreads = getEnv"WEAVE_NUM_THREADS".parseInt()
|
||||||
|
else:
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
if paramCount() == 0:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
|
||||||
|
echo &"Running with default config depth = {depth} and breadth = {breadth}"
|
||||||
|
|
||||||
|
if paramCount() >= 1:
|
||||||
|
depth = paramStr(1).parseInt()
|
||||||
|
if paramCount() == 2:
|
||||||
|
breadth = paramStr(2).parseInt()
|
||||||
|
if paramCount() > 2:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
|
||||||
|
echo &"Up to 2 parameters are valid. Received {paramCount()}"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
# Staccato benches runtime init and exit as well
|
||||||
|
when not defined(windows):
|
||||||
|
let start = wtime_usec()
|
||||||
|
|
||||||
|
init(Weave)
|
||||||
|
answer = test(depth, breadth)
|
||||||
|
exit(Weave)
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
let stop = wtime_usec()
|
||||||
|
|
||||||
|
const lazy = defined(WV_LazyFlowvar)
|
||||||
|
const config = if lazy: " (lazy flowvars)"
|
||||||
|
else: " (eager flowvars)"
|
||||||
|
|
||||||
|
echo "Scheduler: Weave", config
|
||||||
|
echo "Benchmark: dfs"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
when not defined(windows):
|
||||||
|
echo "Time(us) ", stop - start
|
||||||
|
echo "Output: ", answer
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,300 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# From fibril
|
||||||
|
#
|
||||||
|
# Original license
|
||||||
|
#
|
||||||
|
# /*
|
||||||
|
# * Heat diffusion (Jacobi-type iteration)
|
||||||
|
# *
|
||||||
|
# * Volker Strumpen, Boston August 1996
|
||||||
|
# *
|
||||||
|
# * Copyright (c) 1996 Massachusetts Institute of Technology
|
||||||
|
# *
|
||||||
|
# * This program is free software; you can redistribute it and/or modify
|
||||||
|
# * it under the terms of the GNU General Public License as published by
|
||||||
|
# * the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# * (at your option) any later version.
|
||||||
|
# *
|
||||||
|
# * This program is distributed in the hope that it will be useful,
|
||||||
|
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# * GNU General Public License for more details.
|
||||||
|
# *
|
||||||
|
# * You should have received a copy of the GNU General Public License
|
||||||
|
# * along with this program; if not, write to the Free Software
|
||||||
|
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
# */
|
||||||
|
|
||||||
|
import
|
||||||
|
# Stdlib
|
||||||
|
strformat, os, strutils, math, system/ansi_c,
|
||||||
|
cpuinfo, threadpool,
|
||||||
|
# bench
|
||||||
|
../wtime, ../resources
|
||||||
|
|
||||||
|
# This deadlocks :/
|
||||||
|
|
||||||
|
# Helpers
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
# We need a thin wrapper around raw pointers for matrices,
|
||||||
|
# we can't pass "var seq[seq[float64]]" to other threads
|
||||||
|
# nor "var" for that matter
|
||||||
|
type
|
||||||
|
Matrix[T] = object
|
||||||
|
buffer: ptr UncheckedArray[T]
|
||||||
|
m, n: int
|
||||||
|
|
||||||
|
Row[T] = object
|
||||||
|
buffer: ptr UncheckedArray[T]
|
||||||
|
len: int
|
||||||
|
|
||||||
|
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
|
||||||
|
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
|
||||||
|
result.m = m
|
||||||
|
result.n = n
|
||||||
|
|
||||||
|
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
|
||||||
|
# row-major storage
|
||||||
|
assert row < mat.m
|
||||||
|
assert col < mat.n
|
||||||
|
mat.buffer[row * mat.n + col]
|
||||||
|
|
||||||
|
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
|
||||||
|
assert row < mat.m
|
||||||
|
assert col < mat.n
|
||||||
|
mat.buffer[row * mat.n + col] = value
|
||||||
|
|
||||||
|
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
|
||||||
|
# row-major storage, there are n columns in between each rows
|
||||||
|
assert rowIdx < mat.m
|
||||||
|
result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
|
||||||
|
result.len = mat.m
|
||||||
|
|
||||||
|
template `[]`[T](row: Row[T], idx: Natural): T =
|
||||||
|
assert idx < row.len
|
||||||
|
row.buffer[idx]
|
||||||
|
|
||||||
|
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
|
||||||
|
assert idx < row.len
|
||||||
|
row.buffer[idx] = value
|
||||||
|
|
||||||
|
func delete[T](mat: sink Matrix[T]) =
|
||||||
|
c_free(mat.buffer)
|
||||||
|
|
||||||
|
# And an auto converter for int32 -> float64 so we don't have to convert
|
||||||
|
# all i, j indices manually
|
||||||
|
|
||||||
|
converter i32toF64(x: int32): float64 {.inline.} =
|
||||||
|
float64(x)
|
||||||
|
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
template f(x, y: SomeFloat): SomeFloat =
|
||||||
|
sin(x) * sin(y)
|
||||||
|
|
||||||
|
template randa[T: SomeFloat](x, t: T): T =
|
||||||
|
T(0.0)
|
||||||
|
|
||||||
|
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
|
||||||
|
# proc instead of template to avoid Nim constant folding bug:
|
||||||
|
# https://github.com/nim-lang/Nim/issues/12783
|
||||||
|
exp(-2 * t) * sin(x)
|
||||||
|
|
||||||
|
template randc[T: SomeFloat](y, t: T): T =
|
||||||
|
T(0.0)
|
||||||
|
|
||||||
|
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
|
||||||
|
# proc instead of template to avoid Nim constant folding bug:
|
||||||
|
# https://github.com/nim-lang/Nim/issues/12783
|
||||||
|
exp(-2 * t) * sin(y)
|
||||||
|
|
||||||
|
template solu(x, y, t: SomeFloat): SomeFloat =
|
||||||
|
exp(-2 * t) * sin(x) * sin(y)
|
||||||
|
|
||||||
|
const n = 4096'i32
|
||||||
|
|
||||||
|
var
|
||||||
|
nx, ny, nt: int32
|
||||||
|
xu, xo, yu, yo, tu, to: float64
|
||||||
|
|
||||||
|
dx, dy, dt: float64
|
||||||
|
dtdxsq, dtdysq: float64
|
||||||
|
|
||||||
|
odd: Matrix[float64]
|
||||||
|
even: Matrix[float64]
|
||||||
|
|
||||||
|
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
|
||||||
|
# TODO to allow awaiting `heat` we return a dummy bool
|
||||||
|
# The parallel spawns are updating the same matrix cells otherwise
|
||||||
|
if iu - il > 1:
|
||||||
|
let im = (il + iu) div 2
|
||||||
|
|
||||||
|
let h = spawn heat(m, il, im)
|
||||||
|
heat(m, im, iu)
|
||||||
|
discard ^h
|
||||||
|
return true
|
||||||
|
# ------------------------
|
||||||
|
|
||||||
|
let i = il
|
||||||
|
let row = m.getRow(i)
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randc(yu + j*dy, 0)
|
||||||
|
elif i == nx - 1:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randd(yu + j*dy, 0)
|
||||||
|
else:
|
||||||
|
row[0] = randa(xu + i*dx, 0)
|
||||||
|
for j in 1 ..< ny - 1:
|
||||||
|
row[j] = f(xu + i*dx, yu + j*dy)
|
||||||
|
row[ny - 1] = randb(xu + i*dx, 0)
|
||||||
|
|
||||||
|
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
|
||||||
|
# TODO to allow awaiting `diffuse` we return a dummy bool
|
||||||
|
# The parallel spawns are updating the same matrix cells otherwise
|
||||||
|
if iu - il > 1:
|
||||||
|
let im = (il + iu) div 2
|
||||||
|
|
||||||
|
let d = spawn diffuse(output, input, il, im, t)
|
||||||
|
diffuse(output, input, im, iu, t)
|
||||||
|
discard ^d
|
||||||
|
return true
|
||||||
|
# ------------------------
|
||||||
|
|
||||||
|
let i = il
|
||||||
|
let row = output.getRow(i)
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randc(yu + j*dy, t)
|
||||||
|
elif i == nx - 1:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randd(yu + j*dy, t)
|
||||||
|
else:
|
||||||
|
row[0] = randa(xu + i*dx, t)
|
||||||
|
for j in 1 ..< ny - 1:
|
||||||
|
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
|
||||||
|
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
|
||||||
|
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
|
||||||
|
row[ny - 1] = randb(xu + i*dx, t)
|
||||||
|
|
||||||
|
proc initTest() =
|
||||||
|
nx = n
|
||||||
|
ny = 1024
|
||||||
|
nt = 100
|
||||||
|
xu = 0.0
|
||||||
|
xo = 1.570796326794896558
|
||||||
|
yu = 0.0
|
||||||
|
yo = 1.570796326794896558
|
||||||
|
tu = 0.0
|
||||||
|
to = 0.0000001
|
||||||
|
|
||||||
|
dx = (xo - xu) / float64(nx - 1)
|
||||||
|
dy = (yo - yu) / float64(ny - 1)
|
||||||
|
dt = (to - tu) / float64(nt)
|
||||||
|
|
||||||
|
dtdxsq = dt / (dx * dx)
|
||||||
|
dtdysq = dt / (dy * dy)
|
||||||
|
|
||||||
|
even = newMatrix[float64](nx, ny)
|
||||||
|
odd = newMatrix[float64](nx, ny)
|
||||||
|
|
||||||
|
proc prep() =
|
||||||
|
heat(even, 0, nx)
|
||||||
|
|
||||||
|
proc test() =
|
||||||
|
var t = tu
|
||||||
|
|
||||||
|
for _ in countup(1, nt.int, 2):
|
||||||
|
# nt included
|
||||||
|
t += dt
|
||||||
|
diffuse(odd, even, 0, nx, t)
|
||||||
|
t += dt
|
||||||
|
diffuse(even, odd, 0, nx, t)
|
||||||
|
|
||||||
|
if nt mod 2 != 0:
|
||||||
|
t += dt
|
||||||
|
diffuse(odd, even, 0, nx, t)
|
||||||
|
|
||||||
|
proc verify() =
|
||||||
|
var
|
||||||
|
mat: Matrix[float64]
|
||||||
|
mae: float64
|
||||||
|
mre: float64
|
||||||
|
me: float64
|
||||||
|
|
||||||
|
mat = if nt mod 2 != 0: odd else: even
|
||||||
|
|
||||||
|
for a in 0 ..< nx:
|
||||||
|
for b in 0 ..< ny:
|
||||||
|
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
|
||||||
|
if tmp > 1e-3:
|
||||||
|
echo "nx: ", nx, " - ny: ", ny
|
||||||
|
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
me += tmp
|
||||||
|
if tmp > mae: mae = tmp
|
||||||
|
if mat[a, b] != 0.0: tmp /= mat[a, b]
|
||||||
|
if tmp > mre: mre = tmp
|
||||||
|
|
||||||
|
me /= nx * ny
|
||||||
|
|
||||||
|
if mae > 1e-12:
|
||||||
|
echo &"Local maximal absolute error {mae:1.3e}"
|
||||||
|
quit 1
|
||||||
|
if mre > 1e-12:
|
||||||
|
echo &"Local maximal relative error {mre:1.3e}"
|
||||||
|
quit 1
|
||||||
|
if me > 1e-12:
|
||||||
|
echo &"Global mean absolute error {me:1.3e}"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
echo "Verification successful"
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
var nthreads: int
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
var ru: Rusage
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
var
|
||||||
|
rss = ru.ru_maxrss
|
||||||
|
flt = ru.ru_minflt
|
||||||
|
|
||||||
|
initTest()
|
||||||
|
|
||||||
|
prep()
|
||||||
|
let start = wtime_usec()
|
||||||
|
test()
|
||||||
|
let stop = wtime_usec()
|
||||||
|
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
rss = ru.ru_maxrss - rss
|
||||||
|
flt = ru.ru_minflt - flt
|
||||||
|
|
||||||
|
sync()
|
||||||
|
|
||||||
|
verify()
|
||||||
|
delete(even)
|
||||||
|
delete(odd)
|
||||||
|
|
||||||
|
echo "Scheduler: Nim threadpool (standard lib)"
|
||||||
|
echo "Benchmark: heat"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
echo "Time(us) ", stop - start
|
||||||
|
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||||
|
echo "Runtime RSS (KB): ", rss
|
||||||
|
echo "# of page faults: ", flt
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,313 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# From fibril
|
||||||
|
#
|
||||||
|
# Original license
|
||||||
|
#
|
||||||
|
# /*
|
||||||
|
# * Heat diffusion (Jacobi-type iteration)
|
||||||
|
# *
|
||||||
|
# * Volker Strumpen, Boston August 1996
|
||||||
|
# *
|
||||||
|
# * Copyright (c) 1996 Massachusetts Institute of Technology
|
||||||
|
# *
|
||||||
|
# * This program is free software; you can redistribute it and/or modify
|
||||||
|
# * it under the terms of the GNU General Public License as published by
|
||||||
|
# * the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# * (at your option) any later version.
|
||||||
|
# *
|
||||||
|
# * This program is distributed in the hope that it will be useful,
|
||||||
|
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# * GNU General Public License for more details.
|
||||||
|
# *
|
||||||
|
# * You should have received a copy of the GNU General Public License
|
||||||
|
# * along with this program; if not, write to the Free Software
|
||||||
|
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
# */
|
||||||
|
|
||||||
|
import
|
||||||
|
# Stdlib
|
||||||
|
strformat, os, strutils, math, system/ansi_c,
|
||||||
|
cpuinfo,
|
||||||
|
# Taskpools
|
||||||
|
../../taskpools
|
||||||
|
when not defined(windows):
|
||||||
|
# bench
|
||||||
|
import ../wtime, ../resources
|
||||||
|
|
||||||
|
# Helpers
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
# We need a thin wrapper around raw pointers for matrices,
|
||||||
|
# we can't pass "var seq[seq[float64]]" to other threads
|
||||||
|
# nor "var" for that matter
|
||||||
|
type
|
||||||
|
Matrix[T] = object
|
||||||
|
buffer: ptr UncheckedArray[T]
|
||||||
|
m, n: int
|
||||||
|
|
||||||
|
Row[T] = object
|
||||||
|
buffer: ptr UncheckedArray[T]
|
||||||
|
len: int
|
||||||
|
|
||||||
|
var tp: Taskpool
|
||||||
|
|
||||||
|
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
|
||||||
|
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
|
||||||
|
result.m = m
|
||||||
|
result.n = n
|
||||||
|
|
||||||
|
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
|
||||||
|
# row-major storage
|
||||||
|
assert row < mat.m
|
||||||
|
assert col < mat.n
|
||||||
|
mat.buffer[row * mat.n + col]
|
||||||
|
|
||||||
|
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
|
||||||
|
assert row < mat.m
|
||||||
|
assert col < mat.n
|
||||||
|
mat.buffer[row * mat.n + col] = value
|
||||||
|
|
||||||
|
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
|
||||||
|
# row-major storage, there are n columns in between each rows
|
||||||
|
assert rowIdx < mat.m
|
||||||
|
result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
|
||||||
|
result.len = mat.m
|
||||||
|
|
||||||
|
template `[]`[T](row: Row[T], idx: Natural): T =
|
||||||
|
assert idx < row.len
|
||||||
|
row.buffer[idx]
|
||||||
|
|
||||||
|
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
|
||||||
|
assert idx < row.len
|
||||||
|
row.buffer[idx] = value
|
||||||
|
|
||||||
|
func delete[T](mat: sink Matrix[T]) =
|
||||||
|
c_free(mat.buffer)
|
||||||
|
|
||||||
|
# And an auto converter for int32 -> float64 so we don't have to convert
|
||||||
|
# all i, j indices manually
|
||||||
|
|
||||||
|
converter i32toF64(x: int32): float64 {.inline.} =
|
||||||
|
float64(x)
|
||||||
|
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
template f(x, y: SomeFloat): SomeFloat =
|
||||||
|
sin(x) * sin(y)
|
||||||
|
|
||||||
|
template randa[T: SomeFloat](x, t: T): T =
|
||||||
|
T(0.0)
|
||||||
|
|
||||||
|
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
|
||||||
|
# proc instead of template to avoid Nim constant folding bug:
|
||||||
|
# https://github.com/nim-lang/Nim/issues/12783
|
||||||
|
exp(-2 * t) * sin(x)
|
||||||
|
|
||||||
|
template randc[T: SomeFloat](y, t: T): T =
|
||||||
|
T(0.0)
|
||||||
|
|
||||||
|
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
|
||||||
|
# proc instead of template to avoid Nim constant folding bug:
|
||||||
|
# https://github.com/nim-lang/Nim/issues/12783
|
||||||
|
exp(-2 * t) * sin(y)
|
||||||
|
|
||||||
|
template solu(x, y, t: SomeFloat): SomeFloat =
|
||||||
|
exp(-2 * t) * sin(x) * sin(y)
|
||||||
|
|
||||||
|
const n = 4096'i32
|
||||||
|
|
||||||
|
var
|
||||||
|
nx, ny, nt: int32
|
||||||
|
xu, xo, yu, yo, tu, to: float64
|
||||||
|
|
||||||
|
dx, dy, dt: float64
|
||||||
|
dtdxsq, dtdysq: float64
|
||||||
|
|
||||||
|
odd: Matrix[float64]
|
||||||
|
even: Matrix[float64]
|
||||||
|
|
||||||
|
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
|
||||||
|
# TODO to allow awaiting `heat` we return a dummy bool
|
||||||
|
# The parallel spawns are updating the same matrix cells otherwise
|
||||||
|
if iu - il > 1:
|
||||||
|
let im = (il + iu) div 2
|
||||||
|
|
||||||
|
let h = tp.spawn heat(m, il, im)
|
||||||
|
heat(m, im, iu)
|
||||||
|
discard sync(h)
|
||||||
|
return true
|
||||||
|
# ------------------------
|
||||||
|
|
||||||
|
let i = il
|
||||||
|
let row = m.getRow(i)
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randc(yu + j*dy, 0)
|
||||||
|
elif i == nx - 1:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randd(yu + j*dy, 0)
|
||||||
|
else:
|
||||||
|
row[0] = randa(xu + i*dx, 0)
|
||||||
|
for j in 1 ..< ny - 1:
|
||||||
|
row[j] = f(xu + i*dx, yu + j*dy)
|
||||||
|
row[ny - 1] = randb(xu + i*dx, 0)
|
||||||
|
|
||||||
|
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
|
||||||
|
# TODO to allow awaiting `diffuse` we return a dummy bool
|
||||||
|
# The parallel spawns are updating the same matrix cells otherwise
|
||||||
|
if iu - il > 1:
|
||||||
|
let im = (il + iu) div 2
|
||||||
|
|
||||||
|
let d = tp.spawn diffuse(output, input, il, im, t)
|
||||||
|
diffuse(output, input, im, iu, t)
|
||||||
|
discard sync(d)
|
||||||
|
return true
|
||||||
|
# ------------------------
|
||||||
|
|
||||||
|
let i = il
|
||||||
|
let row = output.getRow(i)
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randc(yu + j*dy, t)
|
||||||
|
elif i == nx - 1:
|
||||||
|
for j in 0 ..< ny:
|
||||||
|
row[j] = randd(yu + j*dy, t)
|
||||||
|
else:
|
||||||
|
row[0] = randa(xu + i*dx, t)
|
||||||
|
for j in 1 ..< ny - 1:
|
||||||
|
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
|
||||||
|
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
|
||||||
|
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
|
||||||
|
row[ny - 1] = randb(xu + i*dx, t)
|
||||||
|
|
||||||
|
proc initTest() =
|
||||||
|
nx = n
|
||||||
|
ny = 1024
|
||||||
|
nt = 100
|
||||||
|
xu = 0.0
|
||||||
|
xo = 1.570796326794896558
|
||||||
|
yu = 0.0
|
||||||
|
yo = 1.570796326794896558
|
||||||
|
tu = 0.0
|
||||||
|
to = 0.0000001
|
||||||
|
|
||||||
|
dx = (xo - xu) / float64(nx - 1)
|
||||||
|
dy = (yo - yu) / float64(ny - 1)
|
||||||
|
dt = (to - tu) / float64(nt)
|
||||||
|
|
||||||
|
dtdxsq = dt / (dx * dx)
|
||||||
|
dtdysq = dt / (dy * dy)
|
||||||
|
|
||||||
|
even = newMatrix[float64](nx, ny)
|
||||||
|
odd = newMatrix[float64](nx, ny)
|
||||||
|
|
||||||
|
proc prep() =
|
||||||
|
heat(even, 0, nx)
|
||||||
|
|
||||||
|
proc test() =
|
||||||
|
var t = tu
|
||||||
|
|
||||||
|
for _ in countup(1, nt.int, 2):
|
||||||
|
# nt included
|
||||||
|
t += dt
|
||||||
|
diffuse(odd, even, 0, nx, t)
|
||||||
|
t += dt
|
||||||
|
diffuse(even, odd, 0, nx, t)
|
||||||
|
|
||||||
|
if nt mod 2 != 0:
|
||||||
|
t += dt
|
||||||
|
diffuse(odd, even, 0, nx, t)
|
||||||
|
|
||||||
|
proc verify() =
|
||||||
|
var
|
||||||
|
mat: Matrix[float64]
|
||||||
|
mae: float64
|
||||||
|
mre: float64
|
||||||
|
me: float64
|
||||||
|
|
||||||
|
mat = if nt mod 2 != 0: odd else: even
|
||||||
|
|
||||||
|
for a in 0 ..< nx:
|
||||||
|
for b in 0 ..< ny:
|
||||||
|
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
|
||||||
|
if tmp > 1e-3:
|
||||||
|
echo "nx: ", nx, " - ny: ", ny
|
||||||
|
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
me += tmp
|
||||||
|
if tmp > mae: mae = tmp
|
||||||
|
if mat[a, b] != 0.0: tmp /= mat[a, b]
|
||||||
|
if tmp > mre: mre = tmp
|
||||||
|
|
||||||
|
me /= nx * ny
|
||||||
|
|
||||||
|
if mae > 1e-12:
|
||||||
|
echo &"Local maximal absolute error {mae:1.3e}"
|
||||||
|
quit 1
|
||||||
|
if mre > 1e-12:
|
||||||
|
echo &"Local maximal relative error {mre:1.3e}"
|
||||||
|
quit 1
|
||||||
|
if me > 1e-12:
|
||||||
|
echo &"Global mean absolute error {me:1.3e}"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
echo "Verification successful"
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
var nthreads: int
|
||||||
|
if existsEnv"TASKPOOL_NUM_THREADS":
|
||||||
|
nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
|
||||||
|
else:
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
var ru: Rusage
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
var
|
||||||
|
rss = ru.ru_maxrss
|
||||||
|
flt = ru.ru_minflt
|
||||||
|
|
||||||
|
initTest()
|
||||||
|
|
||||||
|
# Fibril initializes before benching
|
||||||
|
tp = Taskpool.new(numThreads = nthreads)
|
||||||
|
|
||||||
|
prep()
|
||||||
|
when not defined(windows):
|
||||||
|
let start = wtime_usec()
|
||||||
|
test()
|
||||||
|
when not defined(windows):
|
||||||
|
let stop = wtime_usec()
|
||||||
|
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
rss = ru.ru_maxrss - rss
|
||||||
|
flt = ru.ru_minflt - flt
|
||||||
|
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
verify()
|
||||||
|
delete(even)
|
||||||
|
delete(odd)
|
||||||
|
|
||||||
|
echo "Scheduler: Taskpools"
|
||||||
|
echo "Benchmark: heat"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
when not defined(windows):
|
||||||
|
echo "Time(us) ", stop - start
|
||||||
|
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||||
|
echo "Runtime RSS (KB): ", rss
|
||||||
|
echo "# of page faults: ", flt
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Cache-Oblivious Matrix Multiplication
|
||||||
|
|
||||||
|
From Staccato and Cilk
|
||||||
|
|
||||||
|
https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk
|
||||||
|
See the paper ``Cache-Oblivious Algorithms'', by
|
||||||
|
Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
|
||||||
|
Sridhar Ramachandran, FOCS 1999, for an explanation of
|
||||||
|
why this algorithm is good for caches.
|
||||||
|
|
||||||
|
Note that the benchmarks output incorrect matrix traces
|
||||||
|
according to the check ...
|
|
@ -0,0 +1,213 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# Rectangular matrix multiplication.
|
||||||
|
#
|
||||||
|
# Adapted from Cilk 5.4.3 example
|
||||||
|
#
|
||||||
|
# https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk;
|
||||||
|
# See the paper ``Cache-Oblivious Algorithms'', by
|
||||||
|
# Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
|
||||||
|
# Sridhar Ramachandran, FOCS 1999, for an explanation of
|
||||||
|
# why this algorithm is good for caches.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Stdlib
|
||||||
|
strformat, os, strutils, math, system/ansi_c,
|
||||||
|
cpuinfo,
|
||||||
|
# Taskpool
|
||||||
|
../../taskpools,
|
||||||
|
# bench
|
||||||
|
../wtime, ../resources
|
||||||
|
|
||||||
|
# Helpers
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
# We need a thin wrapper around raw pointers for matrices,
|
||||||
|
# we can't pass "var" to other threads
|
||||||
|
type
|
||||||
|
Matrix[T: SomeFloat] = object
|
||||||
|
buffer: ptr UncheckedArray[T]
|
||||||
|
ld: int
|
||||||
|
|
||||||
|
var tp: Taskpool
|
||||||
|
|
||||||
|
func newMatrixNxN[T](n: int): Matrix[T] {.inline.} =
|
||||||
|
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t n*n*sizeof(T)))
|
||||||
|
result.ld = n
|
||||||
|
|
||||||
|
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
|
||||||
|
# row-major storage
|
||||||
|
assert row < mat.ld, $i & " < " & $mat.ld
|
||||||
|
assert col < mat.ld, $i & " < " & $mat.ld
|
||||||
|
mat.buffer[row * mat.ld + col]
|
||||||
|
|
||||||
|
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
|
||||||
|
assert row < mat.ld, $i & " < " & $mat.ld
|
||||||
|
assert col < mat.ld, $i & " < " & $mat.ld
|
||||||
|
mat.buffer[row * mat.ld + col] = value
|
||||||
|
|
||||||
|
func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}=
|
||||||
|
## Returns a new view offset by the row and column stride
|
||||||
|
result.buffer = cast[ptr UncheckedArray[T]](
|
||||||
|
addr mat.buffer[row*mat.ld + col]
|
||||||
|
)
|
||||||
|
|
||||||
|
func delete[T](mat: sink Matrix[T]) =
|
||||||
|
c_free(mat.buffer)
|
||||||
|
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
proc xorshiftRand(): uint32 =
|
||||||
|
var x {.global.} = uint32(2463534242)
|
||||||
|
x = x xor (x shr 13)
|
||||||
|
x = x xor (x shl 17)
|
||||||
|
x = x xor (x shr 5)
|
||||||
|
return x
|
||||||
|
|
||||||
|
func zero[T](A: Matrix[T]) =
|
||||||
|
# zeroing is not timed
|
||||||
|
zeroMem(A.buffer, A.ld * A.ld * sizeof(T))
|
||||||
|
|
||||||
|
proc fill[T](A: Matrix[T]) =
|
||||||
|
for i in 0 ..< A.ld:
|
||||||
|
for j in 0 ..< A.ld:
|
||||||
|
A[i, j] = T(xorshiftRand() mod A.ld.uint32)
|
||||||
|
|
||||||
|
func maxError(A, B: Matrix): float64 =
|
||||||
|
assert A.ld == B.ld
|
||||||
|
for i in 0 ..< A.ld:
|
||||||
|
for j in 0 ..< A.ld:
|
||||||
|
var diff = (A[i, j] - B[i, j]) / A[i, j]
|
||||||
|
if diff < 0:
|
||||||
|
diff = -diff
|
||||||
|
if diff > result:
|
||||||
|
result = diff
|
||||||
|
|
||||||
|
func check[T](A, B, C: Matrix[T], n: int): bool =
|
||||||
|
var
|
||||||
|
tr_C = 0.T
|
||||||
|
tr_AB = 0.T
|
||||||
|
for i in 0 ..< n:
|
||||||
|
for j in 0 ..< n:
|
||||||
|
tr_AB += A[i, j] * B[j, i]
|
||||||
|
tr_C += C[i, i]
|
||||||
|
|
||||||
|
# Note, all benchmarks return false ‾\_(ツ)_/‾
|
||||||
|
return abs(tr_AB - tr_C) < 1e-3
|
||||||
|
|
||||||
|
proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool =
|
||||||
|
# The original bench passes around a ``ld`` parameter (leading dimension?),
|
||||||
|
# we store it in the matrices
|
||||||
|
# We return a dummy bool to allow waiting on the matmul
|
||||||
|
|
||||||
|
# Threshold
|
||||||
|
if (m + n + p) <= 64:
|
||||||
|
if add:
|
||||||
|
for i in 0 ..< m:
|
||||||
|
for k in 0 ..< p:
|
||||||
|
var c = 0.T
|
||||||
|
for j in 0 ..< n:
|
||||||
|
c += A[i, j] * B[j, k]
|
||||||
|
C[i, k] += c
|
||||||
|
else:
|
||||||
|
for i in 0 ..< m:
|
||||||
|
for k in 0 ..< p:
|
||||||
|
var c = 0.T
|
||||||
|
for j in 0 ..< n:
|
||||||
|
c += A[i, j] * B[j, k]
|
||||||
|
C[i, k] = c
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
var h0, h1: FlowVar[bool]
|
||||||
|
## Each half of the computation
|
||||||
|
|
||||||
|
# matrix is larger than threshold
|
||||||
|
if m >= n and n >= p:
|
||||||
|
let m1 = m shr 1 # divide by 2
|
||||||
|
h0 = tp.spawn matmul(A, B, C, m1, n, p, add)
|
||||||
|
h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add)
|
||||||
|
elif n >= m and n >= p:
|
||||||
|
let n1 = n shr 1 # divide by 2
|
||||||
|
h0 = tp.spawn matmul(A, B, C, m, n1, p, add)
|
||||||
|
h1 = tp.spawn matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true)
|
||||||
|
else:
|
||||||
|
let p1 = p shr 1
|
||||||
|
h0 = tp.spawn matmul(A, B, C, m, n, p1, add)
|
||||||
|
h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add)
|
||||||
|
|
||||||
|
discard sync(h0)
|
||||||
|
discard sync(h1)
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
echo "Warning the benchmark seems to not be correct."
|
||||||
|
var
|
||||||
|
n = 3000
|
||||||
|
nthreads: int
|
||||||
|
|
||||||
|
if existsEnv"TASKPOOL_NUM_THREADS":
|
||||||
|
nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
|
||||||
|
else:
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
if paramCount() == 0:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <n (matrix size):{n}>"
|
||||||
|
echo &"Running with default config n = {n}"
|
||||||
|
elif paramCount() == 1:
|
||||||
|
n = paramStr(1).parseInt()
|
||||||
|
else:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <n (matrix size):{n}>"
|
||||||
|
echo &"Up to 1 parameter is valid. Received {paramCount()}"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
var A = newMatrixNxN[float32](n)
|
||||||
|
var B = newMatrixNxN[float32](n)
|
||||||
|
var C = newMatrixNxN[float32](n)
|
||||||
|
|
||||||
|
fill(A)
|
||||||
|
fill(B)
|
||||||
|
zero(C)
|
||||||
|
|
||||||
|
var ru: Rusage
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
var
|
||||||
|
rss = ru.ru_maxrss
|
||||||
|
flt = ru.ru_minflt
|
||||||
|
|
||||||
|
# Staccato benches runtime init and exit as well
|
||||||
|
let start = wtime_msec()
|
||||||
|
|
||||||
|
tp = Taskpool.new(numThreads = nthreads)
|
||||||
|
discard sync tp.spawn matmul(A, B, C, n, n, n, add = false)
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
let stop = wtime_msec()
|
||||||
|
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
rss = ru.ru_maxrss - rss
|
||||||
|
flt = ru.ru_minflt - flt
|
||||||
|
|
||||||
|
echo "Scheduler: Taskpool"
|
||||||
|
echo "Benchmark: Matrix Multiplication (cache oblivious)"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
echo "Time(ms) ", stop - start
|
||||||
|
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||||
|
echo "Runtime RSS (KB): ", rss
|
||||||
|
echo "# of page faults: ", flt
|
||||||
|
echo "Input: ", n
|
||||||
|
echo "Error: ", check(A, B, C, n)
|
||||||
|
|
||||||
|
delete A
|
||||||
|
delete B
|
||||||
|
delete C
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,187 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
#
|
||||||
|
# Original code licenses
|
||||||
|
# ------------------------------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# /**********************************************************************************************/
|
||||||
|
# /* This program is part of the Barcelona OpenMP Tasks Suite */
|
||||||
|
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
|
||||||
|
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
|
||||||
|
# /* */
|
||||||
|
# /* This program is free software; you can redistribute it and/or modify */
|
||||||
|
# /* it under the terms of the GNU General Public License as published by */
|
||||||
|
# /* the Free Software Foundation; either version 2 of the License, or */
|
||||||
|
# /* (at your option) any later version. */
|
||||||
|
# /* */
|
||||||
|
# /* This program is distributed in the hope that it will be useful, */
|
||||||
|
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
||||||
|
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
||||||
|
# /* GNU General Public License for more details. */
|
||||||
|
# /* */
|
||||||
|
# /* You should have received a copy of the GNU General Public License */
|
||||||
|
# /* along with this program; if not, write to the Free Software */
|
||||||
|
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
|
||||||
|
# /**********************************************************************************************/
|
||||||
|
#
|
||||||
|
# /*
|
||||||
|
# * Original code from the Cilk project (by Keith Randall)
|
||||||
|
# *
|
||||||
|
# * Copyright (c) 2000 Massachusetts Institute of Technology
|
||||||
|
# * Copyright (c) 2000 Matteo Frigo
|
||||||
|
# */
|
||||||
|
|
||||||
|
import
|
||||||
|
# Stdlib
|
||||||
|
system/ansi_c, strformat, os, strutils,
|
||||||
|
threadpool,
|
||||||
|
# bench
|
||||||
|
../wtime
|
||||||
|
|
||||||
|
# This deadlocks :/
|
||||||
|
|
||||||
|
# Nim helpers
|
||||||
|
# -------------------------------------------------
|
||||||
|
|
||||||
|
when defined(windows):
|
||||||
|
proc alloca(size: csize): pointer {.header: "<malloc.h>".}
|
||||||
|
else:
|
||||||
|
proc alloca(size: csize): pointer {.header: "<alloca.h>".}
|
||||||
|
|
||||||
|
template alloca*(T: typedesc): ptr T =
|
||||||
|
cast[ptr T](alloca(sizeof(T)))
|
||||||
|
|
||||||
|
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
|
||||||
|
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
|
||||||
|
|
||||||
|
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
|
||||||
|
when defined(WV_useNimAlloc):
|
||||||
|
cast[type result](createSharedU(T, len))
|
||||||
|
else:
|
||||||
|
cast[type result](c_malloc(csize_t len*sizeof(T)))
|
||||||
|
|
||||||
|
proc wv_free*[T: ptr](p: T) {.inline.} =
|
||||||
|
when defined(WV_useNimAlloc):
|
||||||
|
freeShared(p)
|
||||||
|
else:
|
||||||
|
c_free(p)
|
||||||
|
|
||||||
|
# We assume that Nim zeroMem vs C memset
|
||||||
|
# and Nim copyMem vs C memcpy have no difference
|
||||||
|
# Nim does have extra checks to handle GC-ed types
|
||||||
|
# but they should be eliminated by the Nim compiler.
|
||||||
|
|
||||||
|
# -------------------------------------------------
|
||||||
|
|
||||||
|
type CharArray = ptr UncheckedArray[char]
|
||||||
|
|
||||||
|
var example_solution: ptr UncheckedArray[char]
|
||||||
|
|
||||||
|
func isValid(n: int32, a: CharArray): bool =
|
||||||
|
## `a` contains an array of `n` queen positions.
|
||||||
|
## Returns true if none of the queens conflict and 0 otherwise.
|
||||||
|
|
||||||
|
for i in 0'i32 ..< n:
|
||||||
|
let p = cast[int32](a[i])
|
||||||
|
|
||||||
|
for j in i+1 ..< n:
|
||||||
|
let q = cast[int32](a[j])
|
||||||
|
if q == p or q == p - (j-i) or q == p + (j-i):
|
||||||
|
return false
|
||||||
|
return true
|
||||||
|
|
||||||
|
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
|
||||||
|
# Serial nqueens
|
||||||
|
if n == j:
|
||||||
|
# Good solution count it
|
||||||
|
if example_solution.isNil:
|
||||||
|
example_solution = wv_alloc(char, n)
|
||||||
|
copyMem(example_solution, a, n * sizeof(char))
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Try each possible position for queen `j`
|
||||||
|
for i in 0 ..< n:
|
||||||
|
a[j] = cast[char](i)
|
||||||
|
if isValid(j+1, a):
|
||||||
|
result += nqueens_ser(n, j+1, a)
|
||||||
|
|
||||||
|
proc nqueens_par(n, j: int32, a: CharArray): int32 =
|
||||||
|
|
||||||
|
if n == j:
|
||||||
|
# Good solution, count it
|
||||||
|
return 1
|
||||||
|
|
||||||
|
var localCounts = alloca(Flowvar[int32], n)
|
||||||
|
zeroMem(localCounts, n * sizeof(Flowvar[int32]))
|
||||||
|
|
||||||
|
# Try each position for queen `j`
|
||||||
|
for i in 0 ..< n:
|
||||||
|
var b = alloca(char, j+1)
|
||||||
|
copyMem(b, a, j * sizeof(char))
|
||||||
|
b[j] = cast[char](i)
|
||||||
|
if isValid(j+1, b):
|
||||||
|
localCounts[i] = spawn nqueens_par(n, j+1, b)
|
||||||
|
|
||||||
|
for i in 0 ..< n:
|
||||||
|
if not localCounts[i].isNil():
|
||||||
|
result += ^localCounts[i]
|
||||||
|
|
||||||
|
const solutions = [
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
2,
|
||||||
|
10, # 5x5
|
||||||
|
4,
|
||||||
|
10,
|
||||||
|
92, # 8x8
|
||||||
|
352,
|
||||||
|
724, # 10x10
|
||||||
|
2680,
|
||||||
|
14200,
|
||||||
|
73712,
|
||||||
|
365596,
|
||||||
|
2279184, # 15x15
|
||||||
|
14772512
|
||||||
|
]
|
||||||
|
|
||||||
|
proc verifyQueens(n, res: int32) =
|
||||||
|
if n > solutions.len:
|
||||||
|
echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
|
||||||
|
return
|
||||||
|
|
||||||
|
if res != solutions[n-1]:
|
||||||
|
echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
if paramCount() != 1:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <n: number of queens on a nxn board>"
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
let n = paramStr(1).parseInt.int32
|
||||||
|
|
||||||
|
if n notin 1 .. solutions.len:
|
||||||
|
echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
|
||||||
|
let start = wtime_msec()
|
||||||
|
let count = nqueens_par(n, 0, alloca(char, n))
|
||||||
|
let stop = wtime_msec()
|
||||||
|
|
||||||
|
verifyQueens(n, count)
|
||||||
|
|
||||||
|
if not example_solution.isNil:
|
||||||
|
stdout.write("Example solution: ")
|
||||||
|
for i in 0 ..< n:
|
||||||
|
c_printf("%2d ", example_solution[i])
|
||||||
|
stdout.write('\n')
|
||||||
|
|
||||||
|
echo &"Elapsed wall time: {stop-start:2.4f} ms"
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,229 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
#
|
||||||
|
# Original code licenses
|
||||||
|
# ------------------------------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# /**********************************************************************************************/
|
||||||
|
# /* This program is part of the Barcelona OpenMP Tasks Suite */
|
||||||
|
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
|
||||||
|
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
|
||||||
|
# /* */
|
||||||
|
# /* This program is free software; you can redistribute it and/or modify */
|
||||||
|
# /* it under the terms of the GNU General Public License as published by */
|
||||||
|
# /* the Free Software Foundation; either version 2 of the License, or */
|
||||||
|
# /* (at your option) any later version. */
|
||||||
|
# /* */
|
||||||
|
# /* This program is distributed in the hope that it will be useful, */
|
||||||
|
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
||||||
|
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
||||||
|
# /* GNU General Public License for more details. */
|
||||||
|
# /* */
|
||||||
|
# /* You should have received a copy of the GNU General Public License */
|
||||||
|
# /* along with this program; if not, write to the Free Software */
|
||||||
|
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
|
||||||
|
# /**********************************************************************************************/
|
||||||
|
#
|
||||||
|
# /*
|
||||||
|
# * Original code from the Cilk project (by Keith Randall)
|
||||||
|
# *
|
||||||
|
# * Copyright (c) 2000 Massachusetts Institute of Technology
|
||||||
|
# * Copyright (c) 2000 Matteo Frigo
|
||||||
|
# */
|
||||||
|
|
||||||
|
import
|
||||||
|
# Stdlib
|
||||||
|
system/ansi_c, strformat, os, strutils, cpuinfo,
|
||||||
|
# Taskpools
|
||||||
|
../../taskpools
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
# bench
|
||||||
|
import ../wtime, ../resources
|
||||||
|
|
||||||
|
# Nim helpers
|
||||||
|
# -------------------------------------------------
|
||||||
|
|
||||||
|
when defined(windows):
|
||||||
|
proc alloca(size: int): pointer {.header: "<malloc.h>".}
|
||||||
|
else:
|
||||||
|
proc alloca(size: int): pointer {.header: "<alloca.h>".}
|
||||||
|
|
||||||
|
template alloca*(T: typedesc): ptr T =
|
||||||
|
cast[ptr T](alloca(sizeof(T)))
|
||||||
|
|
||||||
|
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
|
||||||
|
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
|
||||||
|
|
||||||
|
proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
|
||||||
|
when defined(TP_useNimAlloc):
|
||||||
|
cast[type result](createSharedU(T, len))
|
||||||
|
else:
|
||||||
|
cast[type result](c_malloc(csize_t len*sizeof(T)))
|
||||||
|
|
||||||
|
proc tp_free*[T: ptr](p: T) {.inline.} =
|
||||||
|
when defined(TP_useNimAlloc):
|
||||||
|
freeShared(p)
|
||||||
|
else:
|
||||||
|
c_free(p)
|
||||||
|
|
||||||
|
# We assume that Nim zeroMem vs C memset
|
||||||
|
# and Nim copyMem vs C memcpy have no difference
|
||||||
|
# Nim does have extra checks to handle GC-ed types
|
||||||
|
# but they should be eliminated by the Nim compiler.
|
||||||
|
|
||||||
|
# -------------------------------------------------
|
||||||
|
|
||||||
|
type CharArray = ptr UncheckedArray[char]
|
||||||
|
|
||||||
|
var tp: Taskpool
|
||||||
|
var example_solution: ptr UncheckedArray[char]
|
||||||
|
|
||||||
|
func isValid(n: int32, a: CharArray): bool =
|
||||||
|
## `a` contains an array of `n` queen positions.
|
||||||
|
## Returns true if none of the queens conflict and 0 otherwise.
|
||||||
|
|
||||||
|
for i in 0'i32 ..< n:
|
||||||
|
let p = cast[int32](a[i])
|
||||||
|
|
||||||
|
for j in i+1 ..< n:
|
||||||
|
let q = cast[int32](a[j])
|
||||||
|
if q == p or q == p - (j-i) or q == p + (j-i):
|
||||||
|
return false
|
||||||
|
return true
|
||||||
|
|
||||||
|
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
|
||||||
|
# Serial nqueens
|
||||||
|
if n == j:
|
||||||
|
# Good solution count it
|
||||||
|
if example_solution.isNil:
|
||||||
|
example_solution = tp_alloc(char, n)
|
||||||
|
copyMem(example_solution, a, n * sizeof(char))
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Try each possible position for queen `j`
|
||||||
|
for i in 0 ..< n:
|
||||||
|
a[j] = cast[char](i)
|
||||||
|
if isValid(j+1, a):
|
||||||
|
result += nqueens_ser(n, j+1, a)
|
||||||
|
|
||||||
|
proc nqueens_par(n, j: int32, a: CharArray): int32 =
|
||||||
|
|
||||||
|
if n == j:
|
||||||
|
# Good solution, count it
|
||||||
|
return 1
|
||||||
|
|
||||||
|
var localCounts = alloca(Flowvar[int32], n)
|
||||||
|
zeroMem(localCounts, n * sizeof(Flowvar[int32]))
|
||||||
|
|
||||||
|
# Try each position for queen `j`
|
||||||
|
for i in 0 ..< n:
|
||||||
|
var b = alloca(char, j+1)
|
||||||
|
copyMem(b, a, j * sizeof(char))
|
||||||
|
b[j] = cast[char](i)
|
||||||
|
if isValid(j+1, b):
|
||||||
|
localCounts[i] = tp.spawn nqueens_par(n, j+1, b)
|
||||||
|
|
||||||
|
for i in 0 ..< n:
|
||||||
|
if localCounts[i].isSpawned():
|
||||||
|
result += sync(localCounts[i])
|
||||||
|
|
||||||
|
const solutions = [
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
2,
|
||||||
|
10, # 5x5
|
||||||
|
4,
|
||||||
|
10,
|
||||||
|
92, # 8x8
|
||||||
|
352,
|
||||||
|
724, # 10x10
|
||||||
|
2680,
|
||||||
|
14200,
|
||||||
|
73712,
|
||||||
|
365596,
|
||||||
|
2279184, # 15x15
|
||||||
|
14772512
|
||||||
|
]
|
||||||
|
|
||||||
|
proc verifyQueens(n, res: int32) =
|
||||||
|
if n > solutions.len:
|
||||||
|
echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
|
||||||
|
return
|
||||||
|
|
||||||
|
if res != solutions[n-1]:
|
||||||
|
echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
var
|
||||||
|
n = 11'i32
|
||||||
|
nthreads: int
|
||||||
|
|
||||||
|
if existsEnv"TASKPOOL_NUM_THREADS":
|
||||||
|
nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
|
||||||
|
else:
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
if paramCount() == 0:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <N:{n}>"
|
||||||
|
echo &"Running with default config N = {n}\n"
|
||||||
|
|
||||||
|
if paramCount() >= 1:
|
||||||
|
n = paramStr(1).parseInt.int32
|
||||||
|
|
||||||
|
if n notin 1 .. solutions.len:
|
||||||
|
echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
var ru: Rusage
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
var
|
||||||
|
rss = ru.ru_maxrss
|
||||||
|
flt = ru.ru_minflt
|
||||||
|
|
||||||
|
tp = Taskpool.new(numThreads = nthreads)
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
let start = wtime_msec()
|
||||||
|
|
||||||
|
let count = nqueens_par(n, 0, alloca(char, n))
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
let stop = wtime_msec()
|
||||||
|
|
||||||
|
when not defined(windows):
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
rss = ru.ru_maxrss - rss
|
||||||
|
flt = ru.ru_minflt - flt
|
||||||
|
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
verifyQueens(n, count)
|
||||||
|
|
||||||
|
if not example_solution.isNil:
|
||||||
|
stdout.write("Example solution: ")
|
||||||
|
for i in 0 ..< n:
|
||||||
|
c_printf("%2d ", example_solution[i])
|
||||||
|
stdout.write('\n')
|
||||||
|
|
||||||
|
echo "Scheduler: Taskpool"
|
||||||
|
echo "Benchmark: N-queens"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
when not defined(windows):
|
||||||
|
echo "Time(us) ", stop - start
|
||||||
|
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||||
|
echo "Runtime RSS (KB): ", rss
|
||||||
|
echo "# of page faults: ", flt
|
||||||
|
echo "Problem size: ", n,"x",n, " board with ",n, " queens"
|
||||||
|
echo "Solutions found: ", count
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,24 @@
|
||||||
|
type
|
||||||
|
Timeval {.importc: "timeval", header:"<sys/time.h>", bycopy.} = object
|
||||||
|
|
||||||
|
Rusage* {.importc: "struct rusage", header:"<sys/resource.h>", bycopy.} = object
|
||||||
|
ru_utime {.importc.}: Timeval
|
||||||
|
ru_stime {.importc.}: Timeval
|
||||||
|
ru_maxrss* {.importc.}: int32 # Maximum resident set size
|
||||||
|
# ...
|
||||||
|
ru_minflt* {.importc.}: int32 # page reclaims (soft page faults)
|
||||||
|
|
||||||
|
RusageWho* {.size: sizeof(cint).} = enum
|
||||||
|
RusageChildren = -1
|
||||||
|
RusageSelf = 0
|
||||||
|
RusageThread = 1
|
||||||
|
|
||||||
|
when defined(debug):
|
||||||
|
var H_RUSAGE_SELF{.importc, header:"<sys/resource.h>".}: cint
var H_RUSAGE_CHILDREN{.importc, header:"<sys/resource.h>".}: cint
var H_RUSAGE_THREAD{.importc, header:"<sys/resource.h>".}: cint
assert H_RUSAGE_SELF == ord(RusageSelf)
assert H_RUSAGE_CHILDREN == ord(RusageChildren)
assert H_RUSAGE_THREAD == ord(RusageThread)
|
||||||
|
|
||||||
|
proc getrusage*(who: RusageWho, usage: var Rusage) {.importc, header: "<sys/resource.h>".}
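
# Sanity check
# ------------------------------------------------------------------------------
# Minimal usage sketch: query the current process and print its peak RSS.
when isMainModule:
  var ru: Rusage
  getrusage(RusageSelf, ru)
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Soft page faults: ", ru.ru_minflt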
|
|
@ -0,0 +1,7 @@
|
||||||
|
# Simple single-producer multiple consumers benchmarks
|
||||||
|
|
||||||
|
SPC: a Simple Producer-Consumer benchmark.
|
||||||
|
|
||||||
|
A single worker produces n tasks,
|
||||||
|
each running for t microseconds. This benchmark allows us to test how many
|
||||||
|
concurrent consumers a single producer can sustain.
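
The structure of the benchmark can be sketched with the taskpools API (a minimal sketch, assuming the library is importable as `taskpools`); the instrumented benchmark in this repository additionally measures wall-clock time and RSS.

```nim
import taskpools

proc consume() =
  # Stand-in for ~t microseconds of work (iterative fib(30), like the benchmark's dummy computation)
  var f0 = 0
  var f1 = 1
  for i in 2 .. 30:
    let f2 = f0 + f1
    f0 = f1
    f1 = f2

proc produce(tp: Taskpool, n: int) =
  # The single producer spawns n consumer tasks
  for i in 0 ..< n:
    tp.spawn consume()

var tp = Taskpool.new()
tp.produce(1000)
tp.syncAll()   # wait for every consumer to finish
tp.shutdown()
```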
|
|
@ -0,0 +1,145 @@
|
||||||
|
import
|
||||||
|
# STD lib
|
||||||
|
os, strutils, system/ansi_c, cpuinfo, strformat, math,
|
||||||
|
# Library
|
||||||
|
../../taskpools,
|
||||||
|
# bench
|
||||||
|
../wtime, ../resources
|
||||||
|
|
||||||
|
var NumTasksTotal: int32
|
||||||
|
var TaskGranularity: int32 # microsecond
|
||||||
|
var PollInterval: float64 # microsecond
|
||||||
|
|
||||||
|
var tp: Taskpool
|
||||||
|
|
||||||
|
var global_poll_elapsed {.threadvar.}: float64
|
||||||
|
|
||||||
|
template dummy_cpt(): untyped =
|
||||||
|
# Dummy computation
|
||||||
|
# Calculate fib(30) iteratively
|
||||||
|
var
|
||||||
|
fib = 0
|
||||||
|
f2 = 0
|
||||||
|
f1 = 1
|
||||||
|
for i in 2 .. 30:
|
||||||
|
fib = f1 + f2
|
||||||
|
f2 = f1
|
||||||
|
f1 = fib
|
||||||
|
|
||||||
|
proc spc_consume(usec: int32) =
|
||||||
|
|
||||||
|
var pollElapsed = 0'f64
|
||||||
|
|
||||||
|
let start = wtime_usec()
|
||||||
|
let stop = usec.float64
|
||||||
|
global_poll_elapsed = PollInterval
|
||||||
|
|
||||||
|
while true:
|
||||||
|
var elapsed = wtime_usec() - start
|
||||||
|
elapsed = elapsed - pollElapsed
|
||||||
|
if elapsed >= stop:
|
||||||
|
break
|
||||||
|
|
||||||
|
dummy_cpt()
|
||||||
|
|
||||||
|
# if elapsed >= global_poll_elapsed:
|
||||||
|
# let pollStart = wtime_usec()
|
||||||
|
# loadBalance(Weave)
|
||||||
|
# pollElapsed += wtime_usec() - pollStart
|
||||||
|
# global_poll_elapsed += PollInterval
|
||||||
|
|
||||||
|
# c_printf("Elapsed: %.2lfus\n", elapsed)
|
||||||
|
|
||||||
|
proc spc_consume_nopoll(usec: int32) =
|
||||||
|
|
||||||
|
let start = wtime_usec()
|
||||||
|
let stop = usec.float64
|
||||||
|
|
||||||
|
while true:
|
||||||
|
var elapsed = wtime_usec() - start
|
||||||
|
if elapsed >= stop:
|
||||||
|
break
|
||||||
|
|
||||||
|
dummy_cpt()
|
||||||
|
|
||||||
|
# c_printf("Elapsed: %.2lfus\n", elapsed)
|
||||||
|
|
||||||
|
proc spc_produce(n: int32) =
|
||||||
|
for i in 0 ..< n:
|
||||||
|
tp.spawn spc_consume(TaskGranularity)
|
||||||
|
|
||||||
|
proc spc_produce_seq(n: int32) =
|
||||||
|
for i in 0 ..< n:
|
||||||
|
spc_consume_nopoll(TaskGranularity)
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
NumTasksTotal = 1000000
|
||||||
|
TaskGranularity = 10
|
||||||
|
PollInterval = 10
|
||||||
|
|
||||||
|
if paramCount() == 0:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
|
||||||
|
&"<task granularity (us): {TaskGranularity}> " &
|
||||||
|
&"[polling interval (us): task granularity]"
|
||||||
|
echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
|
||||||
|
if paramCount() >= 1:
|
||||||
|
NumTasksTotal = paramStr(1).parseInt.int32
|
||||||
|
if paramCount() >= 2:
|
||||||
|
TaskGranularity = paramStr(2).parseInt.int32
|
||||||
|
if paramCount() == 3:
|
||||||
|
PollInterval = paramStr(3).parseInt.float64
|
||||||
|
else:
|
||||||
|
PollInterval = TaskGranularity.float64
|
||||||
|
if paramCount() > 3:
|
||||||
|
let exeName = getAppFilename().extractFilename()
|
||||||
|
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
|
||||||
|
&"<task granularity (us): {TaskGranularity}> " &
|
||||||
|
&"[polling interval (us): task granularity]"
|
||||||
|
quit 1
|
||||||
|
|
||||||
|
var nthreads: int
|
||||||
|
if existsEnv"WEAVE_NUM_THREADS":
|
||||||
|
nthreads = getEnv"WEAVE_NUM_THREADS".parseInt()
|
||||||
|
else:
|
||||||
|
nthreads = countProcessors()
|
||||||
|
|
||||||
|
tp = Taskpool.new(numThreads = nthreads)
|
||||||
|
|
||||||
|
# measure overhead during tasking
|
||||||
|
var ru: Rusage
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
var
|
||||||
|
rss = ru.ru_maxrss
|
||||||
|
flt = ru.ru_minflt
|
||||||
|
|
||||||
|
let start = wtime_msec()
|
||||||
|
|
||||||
|
# spc_produce_seq(NumTasksTotal)
|
||||||
|
spc_produce(NumTasksTotal)
|
||||||
|
tp.syncAll()
|
||||||
|
|
||||||
|
let stop = wtime_msec()
|
||||||
|
|
||||||
|
getrusage(RusageSelf, ru)
|
||||||
|
rss = ru.ru_maxrss - rss
|
||||||
|
flt = ru.ru_minflt - flt
|
||||||
|
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
echo "--------------------------------------------------------------------------"
|
||||||
|
echo "Scheduler: Taskpool"
|
||||||
|
echo "Benchmark: SPC (Single task Producer - multi Consumer)"
|
||||||
|
echo "Threads: ", nthreads
|
||||||
|
echo "Time(ms) ", round(stop - start, 3)
|
||||||
|
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||||
|
echo "Runtime RSS (KB): ", rss
|
||||||
|
echo "# of page faults: ", flt
|
||||||
|
echo "--------------------------------------------------------------------------"
|
||||||
|
echo "# of tasks: ", NumTasksTotal
|
||||||
|
echo "Task granularity (us): ", TaskGranularity
|
||||||
|
echo "Polling / manual load balancing interval (us): ", PollInterval
|
||||||
|
|
||||||
|
quit 0
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,53 @@
|
||||||
|
#ifndef WTIME_H
|
||||||
|
#define WTIME_H
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
// Number of seconds since the Epoch
|
||||||
|
static inline double Wtime_sec(void)
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return tv.tv_sec + tv.tv_usec / 1e6;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of milliseconds since the Epoch
|
||||||
|
static inline double Wtime_msec(void)
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return tv.tv_sec * 1e3 + tv.tv_usec / 1e3;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of microseconds since the Epoch
|
||||||
|
static inline double Wtime_usec(void)
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return tv.tv_sec * 1e6 + tv.tv_usec;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read time stamp counter on x86
|
||||||
|
static inline unsigned long long readtsc(void)
|
||||||
|
{
|
||||||
|
unsigned int lo, hi;
|
||||||
|
// RDTSC copies contents of 64-bit TSC into EDX:EAX
|
||||||
|
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||||
|
return (unsigned long long)hi << 32 | lo;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define WTIME_unique_var_name_paste(id, n) id ## n
|
||||||
|
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
|
||||||
|
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)
|
||||||
|
|
||||||
|
// Convenience macro for time measurement
|
||||||
|
#define WTIME(unit) \
|
||||||
|
double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
|
||||||
|
int WTIME_unique_var(_i_) = 0; \
|
||||||
|
for (; WTIME_unique_var(_i_) == 0 || \
|
||||||
|
(printf("Elapsed wall time: %.2lf "#unit"\n", \
|
||||||
|
Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
|
||||||
|
WTIME_unique_var(_i_)++)
|
||||||
|
|
||||||
|
#endif // WTIME_H
|
|
@ -0,0 +1,10 @@
|
||||||
|
|
||||||
|
import strutils, os
|
||||||
|
|
||||||
|
const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0]
|
||||||
|
const cHeader = cSourcesPath / "wtime.h"
|
||||||
|
|
||||||
|
{.passC: "-I" & cSourcesPath .}
|
||||||
|
|
||||||
|
proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.}
|
||||||
|
proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.}
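
# Minimal usage sketch: time a small computation with the helpers above.
when isMainModule:
  let start = wtime_usec()
  var acc = 0
  for i in 0 ..< 1_000_000:
    acc += i
  echo "acc = ", acc                         # keep the loop from being optimized away
  echo "Elapsed (us): ", wtime_usec() - start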
|
|
@ -0,0 +1,17 @@
|
||||||
|
# Taskpools architecture
|
||||||
|
|
||||||
|
The taskpools architecture is a simple threadpool, augmented with work-stealing to handle unbalanced workloads.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Processing steps
|
||||||
|
|
||||||
|
1. On a `spawn` expression, thread i packages the function call in a task.
2. It enqueues the task in its own deque.
3. It signals (`notify_one`) a condition variable on which all sleeping threads wait.
4. A notified thread wakes up and
5. randomly tries to steal a task from another worker's deque.
6. If no task is found, it goes back to sleep.
7. Otherwise it runs the task.
8. On a `sync` statement, the thread runs tasks from its own deque or steals tasks from other workers.
9. Once the awaited task is ready, execution continues with the following statements (continuation).
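
In code, the spawn/sync flow above corresponds to the following minimal sketch of the public API (the enqueueing, wake-up and stealing all happen inside the library):

```nim
import taskpools

proc double(x: int): int = 2 * x

proc main() =
  var tp = Taskpool.new(numThreads = 4)
  let fut = tp.spawn double(21) # steps 1-3: package, enqueue, wake a worker
  echo sync(fut)                # steps 8-9: help with pending tasks, then continue (prints 42)
  tp.shutdown()

main()
```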
|
|
@ -0,0 +1,43 @@
|
||||||
|
import ../taskpools/taskpools
|
||||||
|
import std/macros
|
||||||
|
|
||||||
|
block: # Async without result
|
||||||
|
|
||||||
|
proc display_int(x: int) =
|
||||||
|
stdout.write(x)
|
||||||
|
stdout.write(" - SUCCESS\n")
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
echo "\nSanity check 1: Printing 123456 654321 in parallel"
|
||||||
|
|
||||||
|
var tp = Taskpool.new(numThreads = 4)
|
||||||
|
tp.spawn display_int(123456)
|
||||||
|
tp.spawn display_int(654321)
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
main()
|
||||||
|
|
||||||
|
block: # Async/Await
|
||||||
|
|
||||||
|
var tp: Taskpool
|
||||||
|
|
||||||
|
|
||||||
|
proc async_fib(n: int): int =
|
||||||
|
if n < 2:
|
||||||
|
return n
|
||||||
|
|
||||||
|
let x = tp.spawn async_fib(n-1)
|
||||||
|
let y = async_fib(n-2)
|
||||||
|
|
||||||
|
result = sync(x) + y
|
||||||
|
|
||||||
|
proc main2() =
|
||||||
|
echo "\nSanity check 2: fib(20)"
|
||||||
|
|
||||||
|
tp = Taskpool.new()
|
||||||
|
let f = async_fib(20)
|
||||||
|
tp.shutdown()
|
||||||
|
|
||||||
|
doAssert f == 6765
|
||||||
|
|
||||||
|
main2()
|
|
@ -0,0 +1,9 @@
|
||||||
|
# Nim-Taskpools
|
||||||
|
# Copyright (c) 2021 Status Research & Development GmbH
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import taskpools/taskpools
|
||||||
|
export taskpools
|
|
@ -0,0 +1,33 @@
|
||||||
|
# Nim-Taskpools
|
||||||
|
# Copyright (c) 2021 Status Research & Development GmbH
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import macros
|
||||||
|
|
||||||
|
template letsGoDeeper =
|
||||||
|
var rTree = node.kind.newTree()
|
||||||
|
for child in node:
|
||||||
|
rTree.add inspect(child)
|
||||||
|
return rTree
|
||||||
|
|
||||||
|
proc replaceSymsByIdents*(ast: NimNode): NimNode =
|
||||||
|
proc inspect(node: NimNode): NimNode =
|
||||||
|
case node.kind:
|
||||||
|
of {nnkIdent, nnkSym}:
|
||||||
|
return ident($node)
|
||||||
|
of nnkEmpty:
|
||||||
|
return node
|
||||||
|
of nnkLiterals:
|
||||||
|
return node
|
||||||
|
of nnkHiddenStdConv:
|
||||||
|
if node[1].kind == nnkIntLit:
|
||||||
|
return node[1]
|
||||||
|
else:
|
||||||
|
expectKind(node[1], nnkSym)
|
||||||
|
return ident($node[1])
|
||||||
|
else:
|
||||||
|
letsGoDeeper()
|
||||||
|
result = inspect(ast)
|
|
@ -0,0 +1,178 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
std/atomics,
|
||||||
|
./instrumentation/[contracts, loggers]
|
||||||
|
|
||||||
|
type
|
||||||
|
ChannelSPSCSingle* = object
|
||||||
|
## A type-erased SPSC channel.
|
||||||
|
##
|
||||||
|
## Wait-free bounded single-producer single-consumer channel
|
||||||
|
## that can only buffer a single item
|
||||||
|
## Properties:
|
||||||
|
## - wait-free
|
||||||
|
## - supports weak memory models
|
||||||
|
## - buffers a single item
|
||||||
|
## - Padded to avoid false sharing in collections
|
||||||
|
## - No extra indirection to access the item, the buffer is inline the channel
|
||||||
|
## - Linearizable
|
||||||
|
## - Default usable size is 254 bytes (WV_MemBlockSize - 2).
|
||||||
|
## If used in an intrusive manner, it's 126 bytes due to the default 128 bytes padding.
|
||||||
|
##
|
||||||
|
## The channel should be the last field of an object if used in an intrusive manner
|
||||||
|
##
|
||||||
|
## Motivations for type erasure
|
||||||
|
## - when LazyFlowvar needs to be converted
|
||||||
|
## from stack-allocated memory to the heap to extend their lifetime,
|
||||||
|
## we have no type information at all as the whole runtime
|
||||||
|
## and especially tasks do not retain it.
|
||||||
|
##
|
||||||
|
## - When a task depends on a future that was generated from lazy loop-splitting
|
||||||
|
## we don't have type information either.
|
||||||
|
##
|
||||||
|
## - An extra benefit is probably easier embedding, or calling
|
||||||
|
## from C or JIT code.
|
||||||
|
full{.align: 64.}: Atomic[bool]
|
||||||
|
itemSize*: uint8
|
||||||
|
buffer*{.align: 8.}: UncheckedArray[byte]
|
||||||
|
|
||||||
|
proc `=`(
|
||||||
|
dest: var ChannelSPSCSingle,
|
||||||
|
source: ChannelSPSCSingle
|
||||||
|
) {.error: "A channel cannot be copied".}
|
||||||
|
|
||||||
|
proc initialize*(chan: var ChannelSPSCSingle, itemsize: SomeInteger) {.inline.} =
|
||||||
|
## If ChannelSPSCSingle is used intrusively in another data structure,
## be aware that it must be the last field, since it ends with an UncheckedArray.
|
||||||
|
preCondition: itemsize.int in 0 .. int high(uint8)
|
||||||
|
|
||||||
|
chan.itemSize = uint8 itemsize
|
||||||
|
chan.full.store(false, moRelaxed)
|
||||||
|
|
||||||
|
func isEmpty*(chan: var ChannelSPSCSingle): bool {.inline.} =
|
||||||
|
not chan.full.load(moAcquire)
|
||||||
|
|
||||||
|
func tryRecv*[T](chan: var ChannelSPSCSingle, dst: var T): bool {.inline.} =
|
||||||
|
## Try receiving the item buffered in the channel
|
||||||
|
## Returns true if successful (channel was not empty)
|
||||||
|
##
|
||||||
|
## ⚠ Use only in the consumer thread that reads from the channel.
|
||||||
|
preCondition: (sizeof(T) == chan.itemsize.int) or
|
||||||
|
# Support dummy object
|
||||||
|
(sizeof(T) == 0 and chan.itemsize == 1)
|
||||||
|
|
||||||
|
let full = chan.full.load(moAcquire)
|
||||||
|
if not full:
|
||||||
|
return false
|
||||||
|
dst = cast[ptr T](chan.buffer.addr)[]
|
||||||
|
chan.full.store(false, moRelease)
|
||||||
|
return true
|
||||||
|
|
||||||
|
func trySend*[T](chan: var ChannelSPSCSingle, src: sink T): bool {.inline.} =
|
||||||
|
## Try sending an item into the channel
|
||||||
|
## Returns true if successful (channel was empty)
|
||||||
|
##
|
||||||
|
## ⚠ Use only in the producer thread that writes to the channel.
|
||||||
|
preCondition: (sizeof(T) == chan.itemsize.int) or
|
||||||
|
# Support dummy object
|
||||||
|
(sizeof(T) == 0 and chan.itemsize == 1)
|
||||||
|
|
||||||
|
let full = chan.full.load(moAcquire)
|
||||||
|
if full:
|
||||||
|
return false
|
||||||
|
cast[ptr T](chan.buffer.addr)[] = src
|
||||||
|
chan.full.store(true, moRelease)
|
||||||
|
return true
|
||||||
|
|
||||||
|
# Sanity checks
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
when isMainModule:
|
||||||
|
import ../memory/memory_pools
|
||||||
|
|
||||||
|
when not compileOption("threads"):
|
||||||
|
{.error: "This requires --threads:on compilation flag".}
|
||||||
|
|
||||||
|
template sendLoop[T](chan: var ChannelSPSCSingle,
|
||||||
|
data: sink T,
|
||||||
|
body: untyped): untyped =
|
||||||
|
while not chan.trySend(data):
|
||||||
|
body
|
||||||
|
|
||||||
|
template recvLoop[T](chan: var ChannelSPSCSingle,
|
||||||
|
data: var T,
|
||||||
|
body: untyped): untyped =
|
||||||
|
while not chan.tryRecv(data):
|
||||||
|
body
|
||||||
|
|
||||||
|
type
|
||||||
|
ThreadArgs = object
|
||||||
|
ID: WorkerKind
|
||||||
|
chan: ptr ChannelSPSCSingle
|
||||||
|
|
||||||
|
WorkerKind = enum
|
||||||
|
Sender
|
||||||
|
Receiver
|
||||||
|
|
||||||
|
template Worker(id: WorkerKind, body: untyped): untyped {.dirty.} =
|
||||||
|
if args.ID == id:
|
||||||
|
body
|
||||||
|
|
||||||
|
proc thread_func(args: ThreadArgs) =
|
||||||
|
|
||||||
|
# Worker RECEIVER:
|
||||||
|
# ---------
|
||||||
|
# <- chan
|
||||||
|
# <- chan
|
||||||
|
# <- chan
|
||||||
|
#
|
||||||
|
# Worker SENDER:
|
||||||
|
# ---------
|
||||||
|
# chan <- 42
|
||||||
|
# chan <- 53
|
||||||
|
# chan <- 64
|
||||||
|
Worker(Receiver):
|
||||||
|
var val: int
|
||||||
|
for j in 0 ..< 10:
|
||||||
|
args.chan[].recvLoop(val):
|
||||||
|
# Busy loop, in prod we might want to yield the core/thread timeslice
|
||||||
|
discard
|
||||||
|
echo " Receiver got: ", val
|
||||||
|
doAssert val == 42 + j*11
|
||||||
|
|
||||||
|
Worker(Sender):
|
||||||
|
doAssert args.chan.full.load(moRelaxed) == false
|
||||||
|
for j in 0 ..< 10:
|
||||||
|
let val = 42 + j*11
|
||||||
|
args.chan[].sendLoop(val):
|
||||||
|
# Busy loop, in prod we might want to yield the core/thread timeslice
|
||||||
|
discard
|
||||||
|
echo "Sender sent: ", val
|
||||||
|
|
||||||
|
proc main() =
|
||||||
|
echo "Testing if 2 threads can send data"
|
||||||
|
echo "-----------------------------------"
|
||||||
|
var threads: array[2, Thread[ThreadArgs]]
|
||||||
|
var pool: TLPoolAllocator
|
||||||
|
pool.initialize()
|
||||||
|
|
||||||
|
var chan = pool.borrow(ChannelSPSCSingle)
|
||||||
|
chan[].initialize(itemSize = sizeof(int))
|
||||||
|
|
||||||
|
createThread(threads[0], thread_func, ThreadArgs(ID: Receiver, chan: chan))
|
||||||
|
createThread(threads[1], thread_func, ThreadArgs(ID: Sender, chan: chan))
|
||||||
|
|
||||||
|
joinThread(threads[0])
|
||||||
|
joinThread(threads[1])
|
||||||
|
|
||||||
|
recycle(chan)
|
||||||
|
|
||||||
|
echo "-----------------------------------"
|
||||||
|
echo "Success"
|
||||||
|
|
||||||
|
main()
|
|
@ -0,0 +1,181 @@
|
||||||
|
# Nim-Taskpools
|
||||||
|
# Copyright (c) 2021 Status Research & Development GmbH
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# chase_lev_deques.nim
|
||||||
|
# --------------------
|
||||||
|
# This file implements a Chase-Lev deque
|
||||||
|
# This is a single-producer, multi-consumer concurrent queue
|
||||||
|
# for work-stealing schedulers.
|
||||||
|
#
|
||||||
|
# Papers:
|
||||||
|
# - Dynamic Circular Work-Stealing Deque
|
||||||
|
# David Chase, Yossi Lev, 2005
|
||||||
|
# https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf
|
||||||
|
#
|
||||||
|
# - Correct and Efficient Work-Stealing for Weak Memory Models
|
||||||
|
# Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013
|
||||||
|
# https://fzn.fr/readings/ppopp13.pdf
|
||||||
|
#
|
||||||
|
# We straight translate the second paper which includes formal proofs of correctness,
|
||||||
|
# and uses modern C++11 code.
|
||||||
|
#
|
||||||
|
# A Chase-Lev deque implements the following operations: push, pop, steal.
|
||||||
|
#
|
||||||
|
# top bottom
|
||||||
|
# ---------------------------------
|
||||||
|
# | | | | <- push()
|
||||||
|
# steal() <- | Task 0 | Task 1 | Task 2 | -> pop()
|
||||||
|
# any thread | | | | owner-only
|
||||||
|
# ---------------------------------
|
||||||
|
#
|
||||||
|
# To reduce contention, stealing is done on the opposite end from push/pop
|
||||||
|
# so that there is a race only for the very last task.
|
||||||
|
|
||||||
|
{.push raises: [].}
|
||||||
|
|
||||||
|
import
|
||||||
|
system/ansi_c,
|
||||||
|
std/[locks, typetraits, atomics],
|
||||||
|
./instrumentation/[contracts, loggers]
|
||||||
|
|
||||||
|
type
|
||||||
|
Buf[T] = object
|
||||||
|
## Backend buffer of a ChaseLevDeque
|
||||||
|
## `capacity` MUST be a power of 2
|
||||||
|
capacity: int
|
||||||
|
mask: int # == capacity-1 implies (i and mask) == (i mod capacity)
|
||||||
|
rawBuffer: UncheckedArray[Atomic[T]]
|
||||||
|
|
||||||
|
ChaseLevDeque*[T] = object
|
||||||
|
## This implements a lock-free, growable, work-stealing deque.
|
||||||
|
## The owning thread enqueues and dequeues at the bottom
|
||||||
|
## Foreign threads steal at the top.
|
||||||
|
##
|
||||||
|
## Default queue size is 8
|
||||||
|
## The queue can grow to handle up to 34 359 738 368 tasks in flight
|
||||||
|
## TODO:
|
||||||
|
## with --gc:arc / --gc:orc, use a seq instead of a fixed max size.
|
||||||
|
top {.align: 64.}: Atomic[int]
|
||||||
|
bottom: Atomic[int]
|
||||||
|
buf: Atomic[ptr Buf[T]]
|
||||||
|
garbage: array[32, ptr Buf[T]] # up to 34 359 738 368 sized buffer
|
||||||
|
garbageUsed: uint8
|
||||||
|
|
||||||
|
func isPowerOfTwo(n: int): bool {.inline.} =
|
||||||
|
(n and (n - 1)) == 0 and (n != 0)
|
||||||
|
|
||||||
|
proc newBuf(T: typedesc, capacity: int): ptr Buf[T] =
|
||||||
|
# Tasks have a destructor
|
||||||
|
# static:
|
||||||
|
# doAssert supportsCopyMem(T), $T & " must be a (POD) plain-old-data type: no seq, string, ref."
|
||||||
|
|
||||||
|
preCondition: capacity.isPowerOfTwo()
|
||||||
|
|
||||||
|
result = cast[ptr Buf[T]](
|
||||||
|
c_malloc(csize_t 2*sizeof(int) + sizeof(T)*capacity)
|
||||||
|
)
|
||||||
|
|
||||||
|
result.capacity = capacity
|
||||||
|
result.mask = capacity - 1
|
||||||
|
result.rawBuffer.addr.zeroMem(sizeof(T)*capacity)
|
||||||
|
|
||||||
|
proc `[]=`[T](buf: var Buf[T], index: int, item: T) {.inline.} =
|
||||||
|
buf.rawBuffer[index and buf.mask].store(item, moRelaxed)
|
||||||
|
|
||||||
|
proc `[]`[T](buf: var Buf[T], index: int): T {.inline.} =
|
||||||
|
result = buf.rawBuffer[index and buf.mask].load(moRelaxed)
|
||||||
|
|
||||||
|
proc grow[T](deque: var ChaseLevDeque[T], buf: var ptr Buf[T], top, bottom: int) {.inline.} =
|
||||||
|
## Double the buffer size
|
||||||
|
## bottom is the last item index
|
||||||
|
##
|
||||||
|
## To handle race-conditions the current "top", "bottom" and "buf"
|
||||||
|
## have to be saved before calling this procedure.
|
||||||
|
## It reads and writes the "deque.buf", "deque.garbage" and "deque.garbageUsed"
|
||||||
|
|
||||||
|
# Read -> Copy -> Update
|
||||||
|
var tmp = newBuf(T, buf.capacity*2)
|
||||||
|
for i in top ..< bottom:
|
||||||
|
tmp[][i] = buf[][i]
|
||||||
|
|
||||||
|
# This requires 68+ billion tasks in flight (per thread)
|
||||||
|
ascertain: deque.garbageUsed.int < deque.garbage.len
|
||||||
|
|
||||||
|
deque.garbage[deque.garbageUsed] = buf
deque.garbageUsed += 1
swap(buf, tmp)
deque.buf.store(buf, moRelaxed)
|
||||||
|
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------
|
||||||
|
|
||||||
|
proc init*[T](deque: var ChaseLevDeque[T]) =
|
||||||
|
## Initializes a new Chase-lev work-stealing deque.
|
||||||
|
deque.reset()
|
||||||
|
deque.buf.store(newBuf(T, 8), moRelaxed)
|
||||||
|
|
||||||
|
proc teardown*[T](deque: var ChaseLevDeque[T]) =
|
||||||
|
## Teardown a Chase-lev work-stealing deque.
|
||||||
|
for i in 0 ..< deque.garbageUsed.int:
|
||||||
|
c_free(deque.garbage[i])
|
||||||
|
c_free(deque.buf.load(moRelaxed))
|
||||||
|
|
||||||
|
proc push*[T](deque: var ChaseLevDeque[T], item: T) =
|
||||||
|
## Enqueue an item at the bottom
|
||||||
|
## The item should not be used afterwards.
|
||||||
|
|
||||||
|
let # Handle race conditions
|
||||||
|
b = deque.bottom.load(moRelaxed)
|
||||||
|
t = deque.top.load(moAcquire)
|
||||||
|
var a = deque.buf.load(moRelaxed)
|
||||||
|
|
||||||
|
if b-t > a.capacity - 1:
|
||||||
|
# Full queue
|
||||||
|
deque.grow(a, t, b)
|
||||||
|
|
||||||
|
a[][b] = item
|
||||||
|
fence(moRelease)
|
||||||
|
deque.bottom.store(b+1, moRelaxed)
|
||||||
|
|
||||||
|
proc pop*[T](deque: var ChaseLevDeque[T]): T =
|
||||||
|
## Dequeue an item from the bottom
|
||||||
|
|
||||||
|
let # Handle race conditions
|
||||||
|
b = deque.bottom.load(moRelaxed) - 1
|
||||||
|
a = deque.buf.load(moRelaxed)
|
||||||
|
|
||||||
|
deque.bottom.store(b, moRelaxed)
|
||||||
|
fence(moSequentiallyConsistent)
|
||||||
|
var t = deque.top.load(moRelaxed)
|
||||||
|
|
||||||
|
if t <= b:
|
||||||
|
# Non-empty queue.
|
||||||
|
result = a[][b]
|
||||||
|
if t == b:
|
||||||
|
# Single last element in queue.
|
||||||
|
if not compare_exchange(deque.top, t, t+1, moSequentiallyConsistent, moRelaxed):
|
||||||
|
# Failed race.
|
||||||
|
result = default(T)
|
||||||
|
deque.bottom.store(b+1, moRelaxed)
|
||||||
|
else:
|
||||||
|
# Empty queue.
|
||||||
|
result = default(T)
|
||||||
|
deque.bottom.store(b+1, moRelaxed)
|
||||||
|
|
||||||
|
proc steal*[T](deque: var ChaseLevDeque[T]): T =
|
||||||
|
## Dequeue an item from the top
|
||||||
|
var t = deque.top.load(moAcquire)
|
||||||
|
fence(moSequentiallyConsistent)
|
||||||
|
let b = deque.bottom.load(moAcquire)
|
||||||
|
result = default(T)
|
||||||
|
|
||||||
|
if t < b:
|
||||||
|
# Non-empty queue.
|
||||||
|
let a = deque.buf.load(moConsume)
|
||||||
|
result = a[][t]
|
||||||
|
if not compare_exchange(deque.top, t, t+1, moSequentiallyConsistent, moRelaxed):
|
||||||
|
# Failed race.
|
||||||
|
return default(T)
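
# Sanity checks
# ---------------------------------------------------
# Minimal single-threaded sketch of the deque API.
# The concurrent behaviour (owner pushes/pops at the bottom, thieves steal
# at the top) is exercised by the taskpool scheduler itself.
when isMainModule:
  var dq: ChaseLevDeque[int]
  dq.init()

  for i in 1 .. 10:         # forces one growth of the initial 8-slot buffer
    dq.push(i)

  doAssert dq.pop() == 10   # owner end (bottom, LIFO)
  doAssert dq.steal() == 1  # thief end (top, FIFO)

  dq.teardown()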
|
|
@ -0,0 +1,82 @@
|
||||||
|
# Nim-Taskpools
|
||||||
|
# Copyright (c) 2021 Status Research & Development GmbH
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# event_notifier.nim
|
||||||
|
# ------------------
|
||||||
|
# This file implements an event notifier.
|
||||||
|
# It allows putting idle threads to sleep or waking them up.
|
||||||
|
|
||||||
|
# Design
|
||||||
|
# Currently it is a shared lock + condition variable (a.k.a. a semaphore)
|
||||||
|
#
|
||||||
|
# In the future an eventcount might be considered, an event count significantly
|
||||||
|
# reduces scheduler overhead by removing lock acquisition from critical path.
|
||||||
|
# See overview and implementations at
|
||||||
|
# https://gist.github.com/mratsim/04a29bdd98d6295acda4d0677c4d0041
|
||||||
|
#
|
||||||
|
# Weave "one event-notifier per thread" further reduces overhead
|
||||||
|
# but requires the threadpool to be message-passing based.
|
||||||
|
# https://github.com/mratsim/weave/blob/a230cce98a8524b2680011e496ec17de3c1039f2/weave/cross_thread_com/event_notifiers.nim
|
||||||
|
|
||||||
|
import
|
||||||
|
std/locks,
|
||||||
|
./instrumentation/contracts
|
||||||
|
|
||||||
|
type
|
||||||
|
EventNotifier* = object
|
||||||
|
## This data structure allows threads to be parked when no events are pending
|
||||||
|
## and woken up when a new event is.
|
||||||
|
# Lock must be aligned to a cache-line to avoid false-sharing.
|
||||||
|
lock{.align: 64.}: Lock
|
||||||
|
cond: Cond
|
||||||
|
parked: int
|
||||||
|
signals: int
|
||||||
|
|
||||||
|
func initialize*(en: var EventNotifier) {.inline.} =
|
||||||
|
## Initialize the event notifier
|
||||||
|
en.lock.initLock()
|
||||||
|
en.cond.initCond()
|
||||||
|
en.parked = 0
|
||||||
|
en.signals = 0
|
||||||
|
|
||||||
|
func `=destroy`*(en: var EventNotifier) {.inline.} =
|
||||||
|
en.cond.deinitCond()
|
||||||
|
en.lock.deinitLock()
|
||||||
|
|
||||||
|
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
|
||||||
|
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
|
||||||
|
|
||||||
|
proc park*(en: var EventNotifier) {.inline.} =
|
||||||
|
## Wait until we are signaled of an event
|
||||||
|
## Thread is parked and does not consume CPU resources
|
||||||
|
en.lock.acquire()
|
||||||
|
preCondition: en.signals == 0
|
||||||
|
|
||||||
|
en.parked += 1
|
||||||
|
while en.signals == 0: # handle spurious wakeups
|
||||||
|
en.cond.wait(en.lock)
|
||||||
|
en.parked -= 1
|
||||||
|
en.signals -= 1
|
||||||
|
|
||||||
|
postCondition: en.signals >= 0
|
||||||
|
en.lock.release()
|
||||||
|
|
||||||
|
proc notify*(en: var EventNotifier) {.inline.} =
|
||||||
|
## Unpark a thread if any is available
|
||||||
|
en.lock.acquire()
|
||||||
|
|
||||||
|
if en.parked > 0:
|
||||||
|
en.signals += 1
|
||||||
|
en.cond.signal()
|
||||||
|
|
||||||
|
en.lock.release()
|
||||||
|
|
||||||
|
proc getParked*(en: var EventNotifier): int {.inline.} =
|
||||||
|
## Get the number of parked threads
|
||||||
|
en.lock.acquire()
|
||||||
|
result = en.parked
|
||||||
|
en.lock.release()
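
# Sanity checks
# ------------------------------------------------------------------------------
# Minimal sketch (compile with --threads:on): one thread parks,
# the main thread waits until it is parked and then wakes it up.
when isMainModule:
  import std/os

  var en: EventNotifier
  en.initialize()

  proc waiter(n: ptr EventNotifier) {.thread.} =
    n[].park()
    echo "waiter: woken up"

  var t: Thread[ptr EventNotifier]
  createThread(t, waiter, en.addr)

  while en.getParked() == 0:
    sleep(1)              # wait until the thread is really parked
  en.notify()
  joinThread(t)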
|
|
@ -0,0 +1,71 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
./channels_spsc_single,
|
||||||
|
system/ansi_c,
|
||||||
|
./instrumentation/contracts,
|
||||||
|
std/os
|
||||||
|
|
||||||
|
{.push gcsafe.}
|
||||||
|
|
||||||
|
type
|
||||||
|
Flowvar*[T] = object
|
||||||
|
## A Flowvar is a placeholder for a future result that may be computed in parallel
|
||||||
|
# Flowvar are optimized when containing a ptr type.
|
||||||
|
# They take less size in memory by testing isNil
|
||||||
|
# instead of having an extra atomic bool
|
||||||
|
# They also use type-erasure to avoid having duplicate code
|
||||||
|
# due to generic monomorphization.
|
||||||
|
chan: ptr ChannelSPSCSingle
|
||||||
|
|
||||||
|
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
|
||||||
|
#
|
||||||
|
# Unfortunately we cannot prevent this easily as internally
|
||||||
|
# we need a copy:
|
||||||
|
# - nim-taskpools level when doing toTask(fnCall(args, fut)) and then returning fut. (Can be worked around with copyMem)
|
||||||
|
# - in std/tasks (need upstream workaround)
|
||||||
|
|
||||||
|
proc newFlowVar*(T: typedesc): Flowvar[T] {.inline.} =
|
||||||
|
let size = 2 + sizeof(T) # full flag + item size + buffer
|
||||||
|
result.chan = cast[ptr ChannelSPSCSingle](c_calloc(1, csize_t size))
|
||||||
|
result.chan[].initialize(sizeof(T))
|
||||||
|
|
||||||
|
proc cleanup(fv: sink Flowvar) {.inline.} =
|
||||||
|
if not fv.chan.isNil:
|
||||||
|
c_free(fv.chan)
|
||||||
|
|
||||||
|
func isSpawned*(fv: Flowvar): bool {.inline.} =
|
||||||
|
## Returns true if a flowvar is spawned
|
||||||
|
## This may be useful for recursive algorithms that
|
||||||
|
## may or may not spawn a flowvar depending on a condition.
|
||||||
|
## This is similar to Option or Maybe types
|
||||||
|
return not fv.chan.isNil
|
||||||
|
|
||||||
|
proc readyWith*[T](fv: Flowvar[T], childResult: T) {.inline.} =
|
||||||
|
## Send the Flowvar result from the child thread processing the task
|
||||||
|
## to its parent thread.
|
||||||
|
let resultSent {.used.} = fv.chan[].trySend(childResult)
|
||||||
|
postCondition: resultSent
|
||||||
|
|
||||||
|
template tryComplete*[T](fv: Flowvar, parentResult: var T): bool =
|
||||||
|
fv.chan[].tryRecv(parentResult)
|
||||||
|
|
||||||
|
func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
|
||||||
|
## Returns true if the result of a Flowvar is ready.
|
||||||
|
## In that case `sync` will not block.
|
||||||
|
## Otherwise the current thread will block and help with the pending tasks
|
||||||
|
## until the Flowvar is ready.
|
||||||
|
not fv.chan[].isEmpty()
|
||||||
|
|
||||||
|
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
|
||||||
|
## Blocks the current thread until the flowvar is available
|
||||||
|
## and returned.
|
||||||
|
## The thread is not idle and will complete pending tasks.
|
||||||
|
mixin forceFuture
|
||||||
|
forceFuture(fv, result)
|
||||||
|
cleanup(fv)
|
|
@ -0,0 +1,113 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import macros, os, strutils
|
||||||
|
|
||||||
|
{.used.}
|
||||||
|
|
||||||
|
# A simple design-by-contract API
|
||||||
|
# ----------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Everything should be a template that doesn't produce any code
|
||||||
|
# when WV_Asserts is not defined.
|
||||||
|
# Those checks are controlled by a custom flag instead of
|
||||||
|
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
|
||||||
|
# Furthermore, we want them to be very lightweight on performance
|
||||||
|
|
||||||
|
# TODO auto-add documentation
|
||||||
|
|
||||||
|
proc inspectInfix(node: NimNode): NimNode =
|
||||||
|
## Inspect an expression,
|
||||||
|
## Returns the AST as string with runtime values inlined
|
||||||
|
## from infix operators inlined.
|
||||||
|
# TODO: pointer and custom type need a default repr
|
||||||
|
# otherwise we can only resolve simple expressions
|
||||||
|
proc inspect(node: NimNode): NimNode =
|
||||||
|
case node.kind:
|
||||||
|
of nnkInfix:
|
||||||
|
return newCall(
|
||||||
|
bindSym"&",
|
||||||
|
newCall(
|
||||||
|
bindSym"&",
|
||||||
|
newCall(ident"$", inspect(node[1])),
|
||||||
|
newLit(" " & $node[0] & " ")
|
||||||
|
),
|
||||||
|
newCall(ident"$", inspect(node[2]))
|
||||||
|
)
|
||||||
|
of {nnkIdent, nnkSym}:
|
||||||
|
return node
|
||||||
|
of nnkDotExpr:
|
||||||
|
return quote do:
|
||||||
|
when `node` is pointer or
|
||||||
|
`node` is ptr or
|
||||||
|
`node` is (proc):
|
||||||
|
toHex(cast[ByteAddress](`node`) and 0xffff_ffff)
|
||||||
|
else:
|
||||||
|
$(`node`)
|
||||||
|
of nnkPar:
|
||||||
|
result = nnkPar.newTree()
|
||||||
|
for sub in node:
|
||||||
|
result.add inspect(sub)
|
||||||
|
else:
|
||||||
|
return node.toStrLit()
|
||||||
|
return inspect(node)
|
||||||
|
|
||||||
|
macro assertContract(
|
||||||
|
checkName: static string,
|
||||||
|
predicate: untyped) =
|
||||||
|
let lineinfo = lineinfoObj(predicate)
|
||||||
|
let file = extractFilename(lineinfo.filename)
|
||||||
|
|
||||||
|
var strippedPredicate: NimNode
|
||||||
|
if predicate.kind == nnkStmtList:
|
||||||
|
assert predicate.len == 1, "Only one-liner conditions are supported"
|
||||||
|
strippedPredicate = predicate[0]
|
||||||
|
else:
|
||||||
|
strippedPredicate = predicate
|
||||||
|
|
||||||
|
let debug = "\n Contract violated for " & checkName & " at " & file & ":" & $lineinfo.line &
|
||||||
|
"\n " & $strippedPredicate.toStrLit &
|
||||||
|
"\n The following values are contrary to expectations:" &
|
||||||
|
"\n "
|
||||||
|
let values = inspectInfix(strippedPredicate)
|
||||||
|
let myID = quote do:
|
||||||
|
when declared(myID):
|
||||||
|
$myID()
|
||||||
|
else:
|
||||||
|
"N/A"
|
||||||
|
|
||||||
|
result = quote do:
|
||||||
|
{.noSideEffect.}:
|
||||||
|
when compileOption("assertions"):
|
||||||
|
assert(`predicate`, `debug` & $`values` & " [Worker " & `myID` & "]\n")
|
||||||
|
elif defined(WV_Asserts):
|
||||||
|
if unlikely(not(`predicate`)):
|
||||||
|
raise newException(AssertionError, `debug` & $`values` & '\n')
|
||||||
|
|
||||||
|
# A way to get the caller function would be nice.
|
||||||
|
|
||||||
|
template preCondition*(require: untyped) =
|
||||||
|
## Optional runtime check at the start of a function
|
||||||
|
assertContract("pre-condition", require)
|
||||||
|
|
||||||
|
template postCondition*(ensure: untyped) =
|
||||||
|
## Optional runtime check before returning from a function
|
||||||
|
assertContract("post-condition", ensure)
|
||||||
|
|
||||||
|
template ascertain*(check: untyped) =
|
||||||
|
## Optional runtime check in the middle of processing
|
||||||
|
assertContract("transient condition", check)
|
||||||
|
|
||||||
|
# Sanity checks
|
||||||
|
# ----------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
when isMainModule:
|
||||||
|
proc assertGreater(x, y: int) =
|
||||||
|
postcondition(x > y)
|
||||||
|
|
||||||
|
# We should get a nicely formatted exception
|
||||||
|
assertGreater(10, 12)
|
|
@ -0,0 +1,22 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import system/ansi_c
|
||||||
|
|
||||||
|
{.used.}
|
||||||
|
|
||||||
|
template log*(args: varargs[untyped]): untyped =
|
||||||
|
c_printf(args)
|
||||||
|
flushFile(stdout)
|
||||||
|
|
||||||
|
template debugTermination*(body: untyped): untyped =
|
||||||
|
when defined(TP_DebugTermination) or defined(TP_Debug):
|
||||||
|
{.noSideEffect, gcsafe.}: body
|
||||||
|
|
||||||
|
template debug*(body: untyped): untyped =
|
||||||
|
when defined(TP_Debug):
|
||||||
|
{.noSideEffect, gcsafe.}: body
|
|
@ -0,0 +1,52 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# Thread primitives
|
||||||
|
# ----------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
type
|
||||||
|
Pthread {.importc: "pthread_t", header: "<sys/types.h>".} = distinct culong
|
||||||
|
CpuSet {.byref, importc: "cpu_set_t", header: "<sched.h>".} = object
|
||||||
|
|
||||||
|
proc pthread_self(): Pthread {.header: "<pthread.h>".}
|
||||||
|
|
||||||
|
proc pthread_setaffinity_np(
|
||||||
|
thread: Pthread,
|
||||||
|
cpuset_size: int,
|
||||||
|
cpuset: CpuSet
|
||||||
|
) {.header: "<pthread.h>".}
|
||||||
|
## Limit specified `thread` to run only on the processors
|
||||||
|
## represented in `cpuset`
|
||||||
|
|
||||||
|
# Note CpuSet is always passed by (hidden) pointer
|
||||||
|
|
||||||
|
proc cpu_zero(cpuset: var CpuSet) {.importc: "CPU_ZERO", header: "<sched.h>".}
|
||||||
|
## Clears the set so that it contains no CPU
|
||||||
|
proc cpu_set(cpu: cint, cpuset: var CpuSet) {.importc: "CPU_SET", header: "<sched.h>".}
|
||||||
|
## Add CPU to set
|
||||||
|
|
||||||
|
# Affinity
|
||||||
|
# ----------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Nim doesn't allow the main thread to set its own affinity
|
||||||
|
|
||||||
|
proc set_thread_affinity(t: Pthread, cpu: int32) {.inline.}=
|
||||||
|
when defined(osx) or defined(android):
|
||||||
|
{.warning: "To improve performance we should pin threads to cores.\n" &
|
||||||
|
"This is not possible with MacOS or Android.".}
|
||||||
|
# Note: on Android it's even more complex due to the big.LITTLE architecture
|
||||||
|
# with cores with different performance profiles to save on battery
|
||||||
|
else:
|
||||||
|
var cpuset {.noinit.}: CpuSet
|
||||||
|
|
||||||
|
cpu_zero(cpuset)
|
||||||
|
cpu_set(cpu, cpuset)
|
||||||
|
pthread_setaffinity_np(t, sizeof(CpuSet), cpuset)
|
||||||
|
|
||||||
|
proc pinToCpu*(cpu: int32) {.inline.} =
|
||||||
|
## Set the affinity of the main thread (the calling thread)
|
||||||
|
set_thread_affinity(pthread_self(), cpu)
|
|
@ -0,0 +1,18 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import winlean
|
||||||
|
|
||||||
|
when not compileOption("threads"):
|
||||||
|
{.error: "This requires --threads:on compilation flag".}
|
||||||
|
|
||||||
|
proc setThreadAffinityMask(hThread: Handle, dwThreadAffinityMask: uint) {.
|
||||||
|
importc: "SetThreadAffinityMask", stdcall, header: "<windows.h>".}
|
||||||
|
|
||||||
|
proc pinToCpu*(cpu: int32) {.inline.} =
|
||||||
|
## Set the affinity of the main thread (the calling thread)
|
||||||
|
setThreadAffinityMask(getThreadID(), uint(1 shl cpu))
|
|
@ -0,0 +1,53 @@
|
||||||
|
# Synchronization Barriers
|
||||||
|
|
||||||
|
OSX does not implement pthread_barrier, as it is an optional part
of the POSIX standard, and Apple probably wants to drive people to libdispatch/Grand Central Dispatch.
|
||||||
|
|
||||||
|
So we need to roll our own with a POSIX compatible API.
|
||||||
|
|
||||||
|
## Glibc barriers, design bug and implementation
|
||||||
|
|
||||||
|
> Note: due to GPL licensing, do not lift the code.
|
||||||
|
> Not that we could, as it is heavily dependent on futexes,
> which are not available on OSX.
|
||||||
|
|
||||||
|
We need to make sure that we don't hit the same bug
|
||||||
|
as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065
|
||||||
|
which seems to be an issue in some of the barrier implementations
|
||||||
|
in the wild.
|
||||||
|
|
||||||
|
The design of Glibc barriers is here:
|
||||||
|
https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD
|
||||||
|
|
||||||
|
And implementation:
|
||||||
|
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD
|
||||||
|
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD`
|
||||||
|
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD
|
||||||
|
|
||||||
|
## Synchronization barrier techniques
|
||||||
|
|
||||||
|
This article goes over the techniques of
|
||||||
|
"pool barrier" and "ticket barrier"
|
||||||
|
https://locklessinc.com/articles/barriers/
|
||||||
|
to reach 2x to 20x the speed of pthreads barrier
|
||||||
|
|
||||||
|
This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf
|
||||||
|
goes over
|
||||||
|
- centralized barrier with sense reversal
|
||||||
|
- combining tree barrier
|
||||||
|
- dissemination barrier
|
||||||
|
- tournament barrier
|
||||||
|
- scalable tree barrier
|
||||||
|
More courses:
|
||||||
|
- http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf
|
||||||
|
|
||||||
|
It however requires lightweight mutexes like Linux futexes
|
||||||
|
that OSX lacks.
|
||||||
|
|
||||||
|
This post goes over lightweight mutexes like Benaphores (from BeOS)
|
||||||
|
https://preshing.com/20120226/roll-your-own-lightweight-mutex/
|
||||||
|
|
||||||
|
This gives a few barrier implementations
|
||||||
|
http://gallium.inria.fr/~maranget/MPRI/02.pdf
|
||||||
|
and refers to the Cubicle paper on formally verifying synchronization barriers
|
||||||
|
http://cubicle.lri.fr/papers/jfla2014.pdf (in French)
|
|
@ -0,0 +1,69 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
when defined(windows):
|
||||||
|
import ./barriers_windows
|
||||||
|
when compileOption("assertions"):
|
||||||
|
import os
|
||||||
|
|
||||||
|
type SyncBarrier* = SynchronizationBarrier
|
||||||
|
|
||||||
|
proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.inline.} =
|
||||||
|
## Initialize a synchronization barrier that will block ``threadCount`` threads
|
||||||
|
## before release.
|
||||||
|
let err {.used.} = InitializeSynchronizationBarrier(syncBarrier, threadCount, -1)
|
||||||
|
when compileOption("assertions"):
|
||||||
|
if err != 1:
|
||||||
|
assert err == 0
|
||||||
|
raiseOSError(osLastError())
|
||||||
|
|
||||||
|
proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
|
||||||
|
## Blocks thread at a synchronization barrier.
|
||||||
|
## Returns true for one of the threads (the last one on Windows, undefined on Posix)
|
||||||
|
## and false for the others.
|
||||||
|
result = bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE)
|
||||||
|
|
||||||
|
proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
|
||||||
|
## Deletes a synchronization barrier.
|
||||||
|
## This assumes no race between waiting at a barrier and deleting it,
|
||||||
|
## and reuse of the barrier requires initialization.
|
||||||
|
DeleteSynchronizationBarrier(syncBarrier.addr)
|
||||||
|
|
||||||
|
else:
|
||||||
|
import ./barriers_posix
|
||||||
|
when compileOption("assertions"):
|
||||||
|
import os
|
||||||
|
|
||||||
|
type SyncBarrier* = PthreadBarrier
|
||||||
|
|
||||||
|
proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.inline.} =
|
||||||
|
## Initialize a synchronization barrier that will block ``threadCount`` threads
|
||||||
|
## before release.
|
||||||
|
let err {.used.} = pthread_barrier_init(syncBarrier, nil, threadCount)
|
||||||
|
when compileOption("assertions"):
|
||||||
|
if err != 0:
|
||||||
|
raiseOSError(OSErrorCode(err))
|
||||||
|
|
||||||
|
proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
|
||||||
|
## Blocks thread at a synchronization barrier.
|
||||||
|
## Returns true for one of the threads (the last one on Windows, undefined on Posix)
|
||||||
|
## and false for the others.
|
||||||
|
let err {.used.} = pthread_barrier_wait(syncBarrier)
|
||||||
|
when compileOption("assertions"):
|
||||||
|
if err != PTHREAD_BARRIER_SERIAL_THREAD and err < 0:
|
||||||
|
raiseOSError(OSErrorCode(err))
|
||||||
|
result = if err == PTHREAD_BARRIER_SERIAL_THREAD: true
|
||||||
|
else: false
|
||||||
|
|
||||||
|
proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
|
||||||
|
## Deletes a synchronization barrier.
|
||||||
|
## This assumes no race between waiting at a barrier and deleting it,
|
||||||
|
## and reuse of the barrier requires initialization.
|
||||||
|
let err {.used.} = pthread_barrier_destroy(syncBarrier)
|
||||||
|
when compileOption("assertions"):
|
||||||
|
if err < 0:
|
||||||
|
raiseOSError(OSErrorCode(err))
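
# Sanity checks
# -------------------------------------------------------
# Minimal sketch (compile with --threads:on): two threads meet at the barrier,
# exactly one of them observes wait() == true.
when isMainModule:
  var barrier: SyncBarrier
  barrier.init(2)

  proc worker() {.thread.} =
    if barrier.wait():
      echo "passed the barrier as the serial thread"

  var workers: array[2, Thread[void]]
  for w in workers.mitems:
    createThread(w, worker)
  for w in workers.mitems:
    joinThread(w)

  delete(barrier)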
|
|
@ -0,0 +1,88 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# OSX doesn't implement pthread_barrier_t
|
||||||
|
# It's an optional part of the POSIX standard
|
||||||
|
#
|
||||||
|
# This is a manual implementation of a sense reversing barrier
|
||||||
|
|
||||||
|
import locks
|
||||||
|
|
||||||
|
type
|
||||||
|
Natural32 = range[0'i32..high(int32)]
|
||||||
|
|
||||||
|
Errno* = cint
|
||||||
|
|
||||||
|
PthreadAttr* = object
|
||||||
|
## Dummy
|
||||||
|
PthreadBarrier* = object
|
||||||
|
## Implementation of a sense reversing barrier
|
||||||
|
## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit)
|
||||||
|
|
||||||
|
lock: Lock # Alternatively spinlock on Atomic
|
||||||
|
cond {.guard: lock.}: Cond
|
||||||
|
sense {.guard: lock.}: bool # Choose int32 to avoid zero-expansion cost in registers?
|
||||||
|
left {.guard: lock.}: Natural32 # Number of threads missing at the barrier before opening
|
||||||
|
count: Natural32 # Total number of threads that need to arrive before opening the barrier
|
||||||
|
|
||||||
|
const
|
||||||
|
PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1)
|
||||||
|
|
||||||
|
proc pthread_cond_broadcast(cond: var Cond): Errno {.header:"<pthread.h>".}
|
||||||
|
## Nim's locks module only signals one thread;
## we need to unblock all of them
|
||||||
|
|
||||||
|
proc broadcast(cond: var Cond) {.inline.}=
|
||||||
|
discard pthread_cond_broadcast(cond)
|
||||||
|
|
||||||
|
func pthread_barrier_init*(
|
||||||
|
barrier: var PthreadBarrier,
|
||||||
|
attr: ptr PthreadAttr,
|
||||||
|
count: range[0'i32..high(int32)]
|
||||||
|
): Errno =
|
||||||
|
barrier.lock.initLock()
|
||||||
|
{.locks: [barrier.lock].}:
|
||||||
|
barrier.cond.initCond()
|
||||||
|
barrier.left = count
|
||||||
|
barrier.count = count
|
||||||
|
# barrier.sense = false
|
||||||
|
|
||||||
|
proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno =
|
||||||
|
## Wait on `barrier`
|
||||||
|
## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
|
||||||
|
## Returns 0 for the others
|
||||||
|
## Returns Errno if there is an error
|
||||||
|
barrier.lock.acquire()
|
||||||
|
{.locks: [barrier.lock].}:
|
||||||
|
var local_sense = barrier.sense # Thread local sense
|
||||||
|
dec barrier.left
|
||||||
|
|
||||||
|
if barrier.left == 0:
|
||||||
|
# Last thread to arrive at the barrier
|
||||||
|
# Reverse phase and release it
|
||||||
|
barrier.left = barrier.count
|
||||||
|
barrier.sense = not barrier.sense
|
||||||
|
barrier.cond.broadcast()
|
||||||
|
barrier.lock.release()
|
||||||
|
return PTHREAD_BARRIER_SERIAL_THREAD
|
||||||
|
|
||||||
|
while barrier.sense == local_sense:
|
||||||
|
# We are waiting for threads
|
||||||
|
# Wait for the sense to reverse
|
||||||
|
# while loop because we might have spurious wakeups
|
||||||
|
barrier.cond.wait(barrier.lock)
|
||||||
|
|
||||||
|
# Reversed, we can leave the barrier
|
||||||
|
barrier.lock.release()
|
||||||
|
return Errno(0)
|
||||||
|
|
||||||
|
proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno =
|
||||||
|
{.locks: [barrier.lock].}:
|
||||||
|
barrier.cond.deinitCond()
|
||||||
|
barrier.lock.deinitLock()
|
||||||
|
|
||||||
|
# TODO: tests
|
|
@ -0,0 +1,51 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# Abstractions over POSIX barriers (non-)implementations
|
||||||
|
|
||||||
|
when not compileOption("threads"):
|
||||||
|
{.error: "This requires --threads:on compilation flag".}
|
||||||
|
|
||||||
|
# Types
|
||||||
|
# -------------------------------------------------------
|
||||||
|
|
||||||
|
when defined(osx):
|
||||||
|
import ./barriers_macos
|
||||||
|
export PthreadAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD
|
||||||
|
else:
|
||||||
|
type
|
||||||
|
PthreadAttr* {.byref, importc: "pthread_attr_t", header: "<sys/types.h>".} = object
|
||||||
|
PthreadBarrier* {.byref, importc: "pthread_barrier_t", header: "<sys/types.h>".} = object
|
||||||
|
|
||||||
|
Errno* = cint
|
||||||
|
|
||||||
|
var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno
|
||||||
|
|
||||||
|
# Pthread
|
||||||
|
# -------------------------------------------------------
|
||||||
|
when defined(osx):
|
||||||
|
export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy
|
||||||
|
else:
|
||||||
|
proc pthread_barrier_init*(
|
||||||
|
barrier: PthreadBarrier,
|
||||||
|
attr: PthreadAttr or ptr PthreadAttr,
|
||||||
|
count: range[0'i32..high(int32)]
|
||||||
|
): Errno {.header: "<pthread.h>".}
|
||||||
|
## Initialize `barrier` with the attributes `attr`.
|
||||||
|
## The barrier is opened when `count` waiters arrived.
|
||||||
|
|
||||||
|
proc pthread_barrier_destroy*(
|
||||||
|
barrier: sink PthreadBarrier): Errno {.header: "<pthread.h>".}
|
||||||
|
## Destroy a previously dynamically initialized `barrier`.
|
||||||
|
|
||||||
|
proc pthread_barrier_wait*(
|
||||||
|
barrier: var PthreadBarrier
|
||||||
|
): Errno {.header: "<pthread.h>".}
|
||||||
|
## Wait on `barrier`
|
||||||
|
## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
|
||||||
|
## Returns 0 for the others
|
||||||
|
## Returns Errno if there is an error
|
|
@ -0,0 +1,31 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import winlean
|
||||||
|
|
||||||
|
# Technically in <synchapi.h> but MSVC complains with
|
||||||
|
# @m..@s..@sweave@sscheduler.nim.cpp
|
||||||
|
# C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture
|
||||||
|
|
||||||
|
type
|
||||||
|
SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"<windows.h>".} = object
|
||||||
|
|
||||||
|
var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "<windows.h>".}: DWORD
|
||||||
|
## Skip expensive checks on barrier enter if a barrier is never deleted.
|
||||||
|
|
||||||
|
proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "<windows.h>".}
|
||||||
|
proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "<windows.h>".}
|
||||||
|
proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "<windows.h>".}
|
||||||
|
|
||||||
|
when isMainModule:
|
||||||
|
import os
|
||||||
|
|
||||||
|
var x{.noinit.}: SynchronizationBarrier
|
||||||
|
let err = InitializeSynchronizationBarrier(x, 2, -1)
|
||||||
|
if err != 1:
|
||||||
|
assert err == 0
|
||||||
|
raiseOSError(osLastError())
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Versions
|
||||||
|
|
||||||
|
## std/tasks
|
||||||
|
- https://github.com/nim-lang/Nim/blob/3619a5a2aa1c7387ec7df01b195bc683943654ff/lib/std/tasks.nim
|
||||||
|
|
||||||
|
We don't support aborting if there is a closure, as this requires [#17501](https://github.com/nim-lang/Nim/pull/17501/files)
|
||||||
|
|
||||||
|
## std/isolation
|
||||||
|
- https://github.com/nim-lang/Nim/blob/603af22b7ca46ac566f8c7c15402028f3f976a4e/lib/std/isolation.nim
|
||||||
|
|
||||||
|
## std/effecttraits
|
||||||
|
- https://github.com/nim-lang/Nim/blob/603af22b7ca46ac566f8c7c15402028f3f976a4e/lib/std/effecttraits.nim
|
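## How the shims are selected

These vendored modules are only used on Nim versions that predate their standard
library counterparts. A minimal sketch of the compile-time selection pattern used
in `taskpools.nim`:

```nim
when (NimMajor, NimMinor, NimPatch) >= (1, 6, 0):
  import std/tasks
else:
  import ./shims_pre_1_6/tasks
```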
|
@ -0,0 +1,54 @@
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Nim's Runtime Library
|
||||||
|
# (c) Copyright 2020 Nim contributors
|
||||||
|
#
|
||||||
|
# See the file "copying.txt", included in this
|
||||||
|
# distribution, for details about the copyright.
|
||||||
|
#
|
||||||
|
|
||||||
|
## This module provides access to the inferred .raises effects
|
||||||
|
## for Nim's macro system.
|
||||||
|
## **Since**: Version 1.4.
|
||||||
|
##
|
||||||
|
## One can test for the existence of this standard module
|
||||||
|
## via `defined(nimHasEffectTraitsModule)`.
|
||||||
|
|
||||||
|
import macros
|
||||||
|
|
||||||
|
proc getRaisesListImpl(n: NimNode): NimNode = discard "see compiler/vmops.nim"
|
||||||
|
proc getTagsListImpl(n: NimNode): NimNode = discard "see compiler/vmops.nim"
|
||||||
|
proc isGcSafeImpl(n: NimNode): bool = discard "see compiler/vmops.nim"
|
||||||
|
proc hasNoSideEffectsImpl(n: NimNode): bool = discard "see compiler/vmops.nim"
|
||||||
|
|
||||||
|
proc getRaisesList*(fn: NimNode): NimNode =
|
||||||
|
## Extracts the `.raises` list of the func/proc/etc `fn`.
|
||||||
|
## `fn` has to be a resolved symbol of kind `nnkSym`. This
|
||||||
|
## implies that the macro that calls this proc should accept `typed`
|
||||||
|
## arguments and not `untyped` arguments.
|
||||||
|
expectKind fn, nnkSym
|
||||||
|
result = getRaisesListImpl(fn)
|
||||||
|
|
||||||
|
proc getTagsList*(fn: NimNode): NimNode =
|
||||||
|
## Extracts the `.tags` list of the func/proc/etc `fn`.
|
||||||
|
## `fn` has to be a resolved symbol of kind `nnkSym`. This
|
||||||
|
## implies that the macro that calls this proc should accept `typed`
|
||||||
|
## arguments and not `untyped` arguments.
|
||||||
|
expectKind fn, nnkSym
|
||||||
|
result = getTagsListImpl(fn)
|
||||||
|
|
||||||
|
proc isGcSafe*(fn: NimNode): bool =
|
||||||
|
## Return true if the func/proc/etc `fn` is `gcsafe`.
|
||||||
|
## `fn` has to be a resolved symbol of kind `nnkSym`. This
|
||||||
|
## implies that the macro that calls this proc should accept `typed`
|
||||||
|
## arguments and not `untyped` arguments.
|
||||||
|
expectKind fn, nnkSym
|
||||||
|
result = isGcSafeImpl(fn)
|
||||||
|
|
||||||
|
proc hasNoSideEffects*(fn: NimNode): bool =
|
||||||
|
## Return true if the func/proc/etc `fn` has `noSideEffect`.
|
||||||
|
## `fn` has to be a resolved symbol of kind `nnkSym`. This
|
||||||
|
## implies that the macro that calls this proc should accept `typed`
|
||||||
|
## arguments and not `untyped` arguments.
|
||||||
|
expectKind fn, nnkSym
|
||||||
|
result = hasNoSideEffectsImpl(fn)
|
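# Illustrative sketch (not part of the original module): a macro consuming these
# queries. `fn` must be a resolved symbol, so the macro takes a `typed` argument.
#
#   macro assertRaisesNothing(fn: typed): untyped =
#     let raised = getRaisesList(fn)
#     if raised.len > 0:
#       error("expected no raised exceptions, got: " & raised.repr, fn)
#     result = newEmptyNode()
#
#   proc safe() {.raises: [].} = discard
#   assertRaisesNothing(safe)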
|
@ -0,0 +1,50 @@
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Nim's Runtime Library
|
||||||
|
# (c) Copyright 2020 Nim contributors
|
||||||
|
#
|
||||||
|
# See the file "copying.txt", included in this
|
||||||
|
# distribution, for details about the copyright.
|
||||||
|
#
|
||||||
|
|
||||||
|
## This module implements the `Isolated[T]` type for
|
||||||
|
## safe construction of isolated subgraphs that can be
|
||||||
|
## passed efficiently to different channels and threads.
|
||||||
|
##
|
||||||
|
## .. warning:: This module is experimental and its interface may change.
|
||||||
|
##
|
||||||
|
|
||||||
|
type
|
||||||
|
Isolated*[T] = object ## Isolated data can only be moved, not copied.
|
||||||
|
value: T
|
||||||
|
|
||||||
|
proc `=copy`*[T](dest: var Isolated[T]; src: Isolated[T]) {.error.}
|
||||||
|
|
||||||
|
proc `=sink`*[T](dest: var Isolated[T]; src: Isolated[T]) {.inline.} =
|
||||||
|
# delegate to value's sink operation
|
||||||
|
`=sink`(dest.value, src.value)
|
||||||
|
|
||||||
|
proc `=destroy`*[T](dest: var Isolated[T]) {.inline.} =
|
||||||
|
# delegate to value's destroy operation
|
||||||
|
`=destroy`(dest.value)
|
||||||
|
|
||||||
|
# XXX: removed the {.magic: "Isolate".}
|
||||||
|
func isolate*[T](value: sink T): Isolated[T] =
|
||||||
|
## Creates an isolated subgraph from the expression `value`.
|
||||||
|
## Isolation is checked at compile time.
|
||||||
|
##
|
||||||
|
## Please read https://github.com/nim-lang/RFCs/issues/244
|
||||||
|
## for more details.
|
||||||
|
Isolated[T](value: value)
|
||||||
|
|
||||||
|
func unsafeIsolate*[T](value: sink T): Isolated[T] =
|
||||||
|
## Creates an isolated subgraph from the expression `value`.
|
||||||
|
##
|
||||||
|
## .. warning:: The proc doesn't check whether `value` is isolated.
|
||||||
|
##
|
||||||
|
Isolated[T](value: value)
|
||||||
|
|
||||||
|
func extract*[T](src: var Isolated[T]): T =
|
||||||
|
## Returns the internal value of `src`.
|
||||||
|
## The value is moved from `src`.
|
||||||
|
result = move(src.value)
|
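# Illustrative usage sketch (not part of the original module). Note that this shim
# removed the `Isolate` magic, so the compile-time isolation check of upstream
# `std/isolation` is not enforced here:
#
#   var iso = isolate("hello " & "world") # construct an isolated subgraph
#   # `iso` cannot be copied (`=copy` is overridden with {.error.}), only moved,
#   # e.g. into a channel consumed by another thread.
#   var s = extract(iso)                  # moves the value out of `iso`
#   assert s == "hello world"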
|
@ -0,0 +1,284 @@
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Nim's Runtime Library
|
||||||
|
# (c) Copyright 2021 Nim contributors
|
||||||
|
#
|
||||||
|
# See the file "copying.txt", included in this
|
||||||
|
# distribution, for details about the copyright.
|
||||||
|
#
|
||||||
|
|
||||||
|
## This module provides basic primitives for creating parallel programs.
|
||||||
|
## A `Task` should only be owned by a single thread; it cannot be shared between threads.
|
||||||
|
|
||||||
|
import std/[macros, typetraits]
|
||||||
|
import system/ansi_c
|
||||||
|
|
||||||
|
import ./isolation
|
||||||
|
export isolation
|
||||||
|
|
||||||
|
when compileOption("threads"):
|
||||||
|
from ./effecttraits import isGcSafe
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# proc hello(a: int, b: string) =
|
||||||
|
# echo $a & b
|
||||||
|
#
|
||||||
|
# let literal = "Nim"
|
||||||
|
# let t = toTask(hello(521, literal))
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# is roughly converted to
|
||||||
|
#
|
||||||
|
# type
|
||||||
|
# ScratchObj_369098780 = object
|
||||||
|
# a: int
|
||||||
|
# b: string
|
||||||
|
#
|
||||||
|
# let scratch_369098762 = cast[ptr ScratchObj_369098780](c_calloc(csize_t 1,
|
||||||
|
# csize_t sizeof(ScratchObj_369098780)))
|
||||||
|
# if scratch_369098762.isNil:
|
||||||
|
# raise newException(OutOfMemDefect, "Could not allocate memory")
|
||||||
|
# block:
|
||||||
|
# var isolate_369098776 = isolate(521)
|
||||||
|
# scratch_369098762.a = extract(isolate_369098776)
|
||||||
|
# var isolate_369098778 = isolate(literal)
|
||||||
|
# scratch_369098762.b = extract(isolate_369098778)
|
||||||
|
# proc hello_369098781(args`gensym3: pointer) {.nimcall.} =
|
||||||
|
# let objTemp_369098775 = cast[ptr ScratchObj_369098780](args`gensym3)
|
||||||
|
# let :tmp_369098777 = objTemp_369098775.a
|
||||||
|
# let :tmp_369098779 = objTemp_369098775.b
|
||||||
|
# hello(a = :tmp_369098777, b = :tmp_369098779)
|
||||||
|
#
|
||||||
|
# proc destroyScratch_369098782(args`gensym3: pointer) {.nimcall.} =
|
||||||
|
# let obj_369098783 = cast[ptr ScratchObj_369098780](args`gensym3)
|
||||||
|
# =destroy(obj_369098783[])
|
||||||
|
# let t = Task(callback: hello_369098781, args: scratch_369098762, destroy: destroyScratch_369098782)
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
type
|
||||||
|
Task* = object ## `Task` contains the callback and its arguments.
|
||||||
|
callback: proc (args: pointer) {.nimcall, gcsafe.}
|
||||||
|
args: pointer
|
||||||
|
destroy: proc (args: pointer) {.nimcall.}
|
||||||
|
|
||||||
|
|
||||||
|
proc `=copy`*(x: var Task, y: Task) {.error.}
|
||||||
|
|
||||||
|
proc `=destroy`*(t: var Task) {.inline.} =
|
||||||
|
## Frees the resources allocated for a `Task`.
|
||||||
|
if t.args != nil:
|
||||||
|
if t.destroy != nil:
|
||||||
|
t.destroy(t.args)
|
||||||
|
c_free(t.args)
|
||||||
|
|
||||||
|
proc invoke*(task: Task) {.inline, gcsafe.} =
|
||||||
|
## Invokes the `task`.
|
||||||
|
assert task.callback != nil
|
||||||
|
task.callback(task.args)
|
||||||
|
|
||||||
|
template checkIsolate(scratchAssignList: seq[NimNode], procParam, scratchDotExpr: NimNode) =
|
||||||
|
# block:
|
||||||
|
# var isoTempA = isolate(521)
|
||||||
|
# scratch.a = extract(isolateA)
|
||||||
|
# var isoTempB = isolate(literal)
|
||||||
|
# scratch.b = extract(isolateB)
|
||||||
|
let isolatedTemp = genSym(nskTemp, "isoTemp")
|
||||||
|
|
||||||
|
# XXX: Fix sym bindings
|
||||||
|
# scratchAssignList.add newVarStmt(isolatedTemp, newCall(newidentNode("isolate"), procParam))
|
||||||
|
# scratchAssignList.add newAssignment(scratchDotExpr,
|
||||||
|
# newcall(newIdentNode("extract"), isolatedTemp))
|
||||||
|
scratchAssignList.add newVarStmt(isolatedTemp, newCall(bindSym("isolate"), procParam))
|
||||||
|
scratchAssignList.add newAssignment(scratchDotExpr,
|
||||||
|
newcall(bindSym("extract"), isolatedTemp))
|
||||||
|
|
||||||
|
template addAllNode(assignParam: NimNode, procParam: NimNode) =
|
||||||
|
let scratchDotExpr = newDotExpr(scratchIdent, formalParams[i][0])
|
||||||
|
|
||||||
|
checkIsolate(scratchAssignList, procParam, scratchDotExpr)
|
||||||
|
|
||||||
|
let tempNode = genSym(kind = nskTemp, ident = formalParams[i][0].strVal)
|
||||||
|
callNode.add nnkExprEqExpr.newTree(formalParams[i][0], tempNode)
|
||||||
|
tempAssignList.add newLetStmt(tempNode, newDotExpr(objTemp, formalParams[i][0]))
|
||||||
|
scratchRecList.add newIdentDefs(newIdentNode(formalParams[i][0].strVal), assignParam)
|
||||||
|
|
||||||
|
macro toTask*(e: typed{nkCall | nkInfix | nkPrefix | nkPostfix | nkCommand | nkCallStrLit}): Task =
|
||||||
|
## Converts the call and its arguments to `Task`.
|
||||||
|
runnableExamples("--gc:orc"):
|
||||||
|
proc hello(a: int) = echo a
|
||||||
|
|
||||||
|
let b = toTask hello(13)
|
||||||
|
assert b is Task
|
||||||
|
|
||||||
|
doAssert getTypeInst(e).typeKind == ntyVoid
|
||||||
|
|
||||||
|
# requires 1.6
|
||||||
|
# when compileOption("threads"):
|
||||||
|
# if not isGcSafe(e[0]):
|
||||||
|
# error("'toTask' takes a GC safe call expression")
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
# https://github.com/nim-lang/Nim/pull/17501/files
|
||||||
|
#
|
||||||
|
# if hasClosure(e[0]):
|
||||||
|
# error("closure call is not allowed")
|
||||||
|
|
||||||
|
if e.len > 1:
|
||||||
|
let scratchIdent = genSym(kind = nskTemp, ident = "scratch")
|
||||||
|
let impl = e[0].getTypeInst
|
||||||
|
|
||||||
|
when defined(nimTasksDebug):
|
||||||
|
echo impl.treeRepr
|
||||||
|
echo e.treeRepr
|
||||||
|
let formalParams = impl[0]
|
||||||
|
|
||||||
|
var
|
||||||
|
scratchRecList = newNimNode(nnkRecList)
|
||||||
|
scratchAssignList: seq[NimNode]
|
||||||
|
tempAssignList: seq[NimNode]
|
||||||
|
callNode: seq[NimNode]
|
||||||
|
|
||||||
|
let
|
||||||
|
objTemp = genSym(nskTemp, ident = "objTemp")
|
||||||
|
|
||||||
|
for i in 1 ..< formalParams.len:
|
||||||
|
var param = formalParams[i][1]
|
||||||
|
|
||||||
|
if param.kind == nnkBracketExpr and param[0].eqIdent("sink"):
|
||||||
|
param = param[1] # unwrap 'sink T' to its inner type T
|
||||||
|
|
||||||
|
if param.typeKind in {ntyExpr, ntyStmt}:
|
||||||
|
error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter")
|
||||||
|
|
||||||
|
case param.kind
|
||||||
|
of nnkVarTy:
|
||||||
|
error("'toTask'ed function cannot have a 'var' parameter")
|
||||||
|
of nnkBracketExpr:
|
||||||
|
if param[0].typeKind == ntyTypeDesc:
|
||||||
|
callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
|
||||||
|
elif param[0].typeKind in {ntyVarargs, ntyOpenArray}:
|
||||||
|
if param[1].typeKind in {ntyExpr, ntyStmt}:
|
||||||
|
error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter")
|
||||||
|
let
|
||||||
|
seqType = nnkBracketExpr.newTree(newIdentNode("seq"), param[1])
|
||||||
|
seqCallNode = newcall("@", e[i])
|
||||||
|
addAllNode(seqType, seqCallNode)
|
||||||
|
else:
|
||||||
|
addAllNode(param, e[i])
|
||||||
|
of nnkBracket, nnkObjConstr:
|
||||||
|
# These are literal constructions (array/object constructors), treated as static parameters,
|
||||||
|
# so we pass them directly instead of through the scratch object
|
||||||
|
callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
|
||||||
|
of nnkSym, nnkPtrTy:
|
||||||
|
addAllNode(param, e[i])
|
||||||
|
of nnkCharLit..nnkNilLit:
|
||||||
|
callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
|
||||||
|
else:
|
||||||
|
error("not supported type kinds")
|
||||||
|
|
||||||
|
let scratchObjType = genSym(kind = nskType, ident = "ScratchObj")
|
||||||
|
let scratchObj = nnkTypeSection.newTree(
|
||||||
|
nnkTypeDef.newTree(
|
||||||
|
scratchObjType,
|
||||||
|
newEmptyNode(),
|
||||||
|
nnkObjectTy.newTree(
|
||||||
|
newEmptyNode(),
|
||||||
|
newEmptyNode(),
|
||||||
|
scratchRecList
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
let scratchObjPtrType = quote do:
|
||||||
|
cast[ptr `scratchObjType`](c_calloc(csize_t 1, csize_t sizeof(`scratchObjType`)))
|
||||||
|
|
||||||
|
let scratchLetSection = newLetStmt(
|
||||||
|
scratchIdent,
|
||||||
|
scratchObjPtrType
|
||||||
|
)
|
||||||
|
|
||||||
|
let scratchCheck = quote do:
|
||||||
|
if `scratchIdent`.isNil:
|
||||||
|
# Renamed in 1.4
|
||||||
|
# raise newException(OutOfMemDefect, "Could not allocate memory")
|
||||||
|
raise newException(OutOfMemError, "Could not allocate memory")
|
||||||
|
|
||||||
|
var stmtList = newStmtList()
|
||||||
|
stmtList.add(scratchObj)
|
||||||
|
stmtList.add(scratchLetSection)
|
||||||
|
stmtList.add(scratchCheck)
|
||||||
|
stmtList.add(nnkBlockStmt.newTree(newEmptyNode(), newStmtList(scratchAssignList)))
|
||||||
|
|
||||||
|
var functionStmtList = newStmtList()
|
||||||
|
let funcCall = newCall(e[0], callNode)
|
||||||
|
functionStmtList.add tempAssignList
|
||||||
|
functionStmtList.add funcCall
|
||||||
|
|
||||||
|
let funcName = genSym(nskProc, e[0].strVal)
|
||||||
|
let destroyName = genSym(nskProc, "destroyScratch")
|
||||||
|
let objTemp2 = genSym(ident = "obj")
|
||||||
|
let tempNode = quote("@") do:
|
||||||
|
`=destroy`(@objTemp2[])
|
||||||
|
|
||||||
|
result = quote do:
|
||||||
|
`stmtList`
|
||||||
|
|
||||||
|
proc `funcName`(args: pointer) {.gcsafe, nimcall.} =
|
||||||
|
let `objTemp` = cast[ptr `scratchObjType`](args)
|
||||||
|
`functionStmtList`
|
||||||
|
|
||||||
|
proc `destroyName`(args: pointer) {.nimcall.} =
|
||||||
|
let `objTemp2` = cast[ptr `scratchObjType`](args)
|
||||||
|
`tempNode`
|
||||||
|
|
||||||
|
Task(callback: `funcName`, args: `scratchIdent`, destroy: `destroyName`)
|
||||||
|
else:
|
||||||
|
let funcCall = newCall(e[0])
|
||||||
|
let funcName = genSym(nskProc, e[0].strVal)
|
||||||
|
|
||||||
|
result = quote do:
|
||||||
|
proc `funcName`(args: pointer) {.gcsafe, nimcall.} =
|
||||||
|
`funcCall`
|
||||||
|
|
||||||
|
Task(callback: `funcName`, args: nil)
|
||||||
|
|
||||||
|
when defined(nimTasksDebug):
|
||||||
|
echo result.repr
|
||||||
|
|
||||||
|
runnableExamples("--gc:orc"):
|
||||||
|
block:
|
||||||
|
var num = 0
|
||||||
|
proc hello(a: int) = inc num, a
|
||||||
|
|
||||||
|
let b = toTask hello(13)
|
||||||
|
b.invoke()
|
||||||
|
assert num == 13
|
||||||
|
# A task can be invoked multiple times
|
||||||
|
b.invoke()
|
||||||
|
assert num == 26
|
||||||
|
|
||||||
|
block:
|
||||||
|
type
|
||||||
|
Runnable = ref object
|
||||||
|
data: int
|
||||||
|
|
||||||
|
var data: int
|
||||||
|
proc hello(a: Runnable) {.nimcall.} =
|
||||||
|
a.data += 2
|
||||||
|
data = a.data
|
||||||
|
|
||||||
|
|
||||||
|
when false:
|
||||||
|
# the parameters of call must be isolated.
|
||||||
|
let x = Runnable(data: 12)
|
||||||
|
let b = toTask hello(x) # error ----> expression cannot be isolated: x
|
||||||
|
b.invoke()
|
||||||
|
|
||||||
|
let b = toTask(hello(Runnable(data: 12)))
|
||||||
|
b.invoke()
|
||||||
|
assert data == 14
|
||||||
|
b.invoke()
|
||||||
|
assert data == 16
|
|
@ -0,0 +1,151 @@
|
||||||
|
# Weave
|
||||||
|
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
std/random,
|
||||||
|
system/ansi_c,
|
||||||
|
./instrumentation/contracts
|
||||||
|
|
||||||
|
const TP_MaxWorkers = 255
|
||||||
|
type Setuint = uint8 # We support at most 255 threads (0xFF is kept as a special value to signify absence in the set)
|
||||||
|
|
||||||
|
const Empty = high(Setuint)
|
||||||
|
|
||||||
|
type
|
||||||
|
SparseSet* = object
|
||||||
|
## Efficiently stores a set of integers in the range [0 .. Capacity)
|
||||||
|
## Supports:
|
||||||
|
## - O(1) inclusion, exclusion and contains
|
||||||
|
## - O(1) random pick
|
||||||
|
## - O(1) length
|
||||||
|
## - O(length) iteration
|
||||||
|
##
|
||||||
|
## Space: Capacity * sizeof(words)
|
||||||
|
##
|
||||||
|
## This is in contrast to bitsets, which require:
|
||||||
|
## - random picking: multiple random "contains" + a fallback to uncompressing the set
|
||||||
|
## - O(Capacity/sizeof(words)) length (via popcounts)
|
||||||
|
## - O(capacity) iteration
|
||||||
|
indices: ptr UncheckedArray[Setuint]
|
||||||
|
values: ptr UncheckedArray[Setuint]
|
||||||
|
rawBuffer: ptr UncheckedArray[Setuint]
|
||||||
|
len*: Setuint
|
||||||
|
capacity*: Setuint
|
||||||
|
|
||||||
|
func allocate*(s: var SparseSet, capacity: SomeInteger) {.inline.} =
|
||||||
|
preCondition: capacity <= TP_MaxWorkers
|
||||||
|
|
||||||
|
s.capacity = Setuint capacity
|
||||||
|
s.rawBuffer = cast[ptr UncheckedArray[Setuint]](c_calloc(csize_t 2*capacity, csize_t sizeof(Setuint)))
|
||||||
|
s.indices = s.rawBuffer
|
||||||
|
s.values = cast[ptr UncheckedArray[Setuint]](s.rawBuffer[capacity].addr)
|
||||||
|
|
||||||
|
func delete*(s: var SparseSet) {.inline.} =
|
||||||
|
s.indices = nil
|
||||||
|
s.values = nil
|
||||||
|
c_free(s.rawBuffer)
|
||||||
|
|
||||||
|
func refill*(s: var SparseSet) {.inline.} =
|
||||||
|
## Reset the sparseset by including all integers
|
||||||
|
## in the range [0 .. Capacity)
|
||||||
|
preCondition: not s.indices.isNil
|
||||||
|
preCondition: not s.values.isNil
|
||||||
|
preCondition: not s.rawBuffer.isNil
|
||||||
|
preCondition: s.capacity != 0
|
||||||
|
|
||||||
|
s.len = s.capacity
|
||||||
|
|
||||||
|
for i in Setuint(0) ..< s.len:
|
||||||
|
s.indices[i] = i
|
||||||
|
s.values[i] = i
|
||||||
|
|
||||||
|
func isEmpty*(s: SparseSet): bool {.inline.} =
|
||||||
|
s.len == 0
|
||||||
|
|
||||||
|
func contains*(s: SparseSet, n: SomeInteger): bool {.inline.} =
|
||||||
|
assert n.int != Empty.int
|
||||||
|
s.indices[n] != Empty
|
||||||
|
|
||||||
|
func incl*(s: var SparseSet, n: SomeInteger) {.inline.} =
|
||||||
|
preCondition: n < Empty
|
||||||
|
|
||||||
|
if n in s: return
|
||||||
|
|
||||||
|
preCondition: s.len < s.capacity
|
||||||
|
|
||||||
|
s.indices[n] = s.len
|
||||||
|
s.values[s.len] = n
|
||||||
|
s.len += 1
|
||||||
|
|
||||||
|
func peek*(s: SparseSet): int32 {.inline.} =
|
||||||
|
## Returns the last item in the set
|
||||||
|
## Note: if an item has been deleted, this is not necessarily the last inserted item
|
||||||
|
preCondition: s.len.int > 0
|
||||||
|
int32 s.values[s.len - 1]
|
||||||
|
|
||||||
|
func excl*(s: var SparseSet, n: SomeInteger) {.inline.} =
|
||||||
|
if n notin s: return
|
||||||
|
|
||||||
|
# We do constant time deletion by replacing the deleted
|
||||||
|
# integer by the last value in the array of values
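# Worked example (illustrative, not in the original): with values = [3, 7, 5],
# len = 3 and indices[3] = 0, indices[7] = 1, indices[5] = 2, excluding 7 gives
# delIdx = 1, len = 2, lastVal = 5; we then set indices[5] = 1, values[1] = 5
# and mark indices[7] = Empty, leaving the live prefix of values as [3, 5].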
|
||||||
|
|
||||||
|
let delIdx = s.indices[n]
|
||||||
|
|
||||||
|
s.len -= 1
|
||||||
|
let lastVal = s.values[s.len]
|
||||||
|
|
||||||
|
s.indices[lastVal] = delIdx # Last value now points to deleted index
|
||||||
|
s.values[delIdx] = lastVal # The deleted slot now holds the last value
|
||||||
|
|
||||||
|
# Erase the item
|
||||||
|
s.indices[n] = Empty
|
||||||
|
|
||||||
|
func randomPick*(s: SparseSet, rng: var Rand): int {.inline.} =
|
||||||
|
## Randomly pick from the set.
|
||||||
|
# The value is NOT removed from it.
|
||||||
|
let pickIdx = rng.rand(s.len-1)
|
||||||
|
result = s.values[pickIdx].int
|
||||||
|
|
||||||
|
func `$`*(s: SparseSet): string =
|
||||||
|
$toOpenArray(s.values, 0, s.len.int - 1)
|
||||||
|
|
||||||
|
# Sanity checks
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
when isMainModule:
|
||||||
|
|
||||||
|
const Size = 10
|
||||||
|
const Picked = 5
|
||||||
|
|
||||||
|
var S: SparseSet
|
||||||
|
S.allocate(Size)
|
||||||
|
S.refill()
|
||||||
|
echo S
|
||||||
|
|
||||||
|
var rngState = initRand(123)
|
||||||
|
var picked: seq[int]
|
||||||
|
|
||||||
|
for _ in 0 ..< Picked:
|
||||||
|
let p = S.randomPick(rngState)
|
||||||
|
picked.add p
|
||||||
|
S.excl p
|
||||||
|
echo "---"
|
||||||
|
echo "picked: ", p
|
||||||
|
echo "S indices: ", toOpenArray(S.indices, 0, S.capacity.int - 1)
|
||||||
|
|
||||||
|
echo "---"
|
||||||
|
echo "picked: ", picked
|
||||||
|
echo "S: ", S
|
||||||
|
echo "S indices: ", toOpenArray(S.indices, 0, S.capacity.int - 1)
|
||||||
|
|
||||||
|
for x in 0 ..< Size:
|
||||||
|
if x notin picked:
|
||||||
|
echo x, " notin picked -> in S"
|
||||||
|
doAssert x in S
|
||||||
|
else:
|
||||||
|
echo x, " in picked -> notin S"
|
||||||
|
doAssert x notin S
|
|
@ -0,0 +1,530 @@
|
||||||
|
# Nim-Taskpools
|
||||||
|
# Copyright (c) 2021 Status Research & Development GmbH
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# Taskpools
|
||||||
|
#
|
||||||
|
# This file implements a taskpool
|
||||||
|
#
|
||||||
|
# Implementation:
|
||||||
|
#
|
||||||
|
# It is a simple shared memory based work-stealing threadpool.
|
||||||
|
# The primary focus is:
|
||||||
|
# - Delegate compute intensive tasks to the threadpool.
|
||||||
|
# - Simple to audit by staying close to foundational papers
|
||||||
|
# and using simple datastructures otherwise.
|
||||||
|
# - Low energy consumption:
|
||||||
|
# threads should be put to sleep ASAP
|
||||||
|
# instead of polling/spinning (energy vs latency tradeoff)
|
||||||
|
# - Decent performance:
|
||||||
|
# Work-stealing has optimal asymptotic parallel speedup.
|
||||||
|
# Work-stealing has significantly reduced contention
|
||||||
|
# when many tasks are created,
|
||||||
|
# for example by divide-and-conquer algorithms, compared to a global task queue
|
||||||
|
#
|
||||||
|
# Not a priority:
|
||||||
|
# - Handling trillions of very short tasks (less than 100µs).
|
||||||
|
# - Advanced task dependencies or events API.
|
||||||
|
# - Unbalanced parallel-for loops.
|
||||||
|
# - Handling services that should run for the lifetime of the program.
|
||||||
|
#
|
||||||
|
# Doing IO on a compute threadpool should be avoided
|
||||||
|
# If a thread is blocked on IO, other threads can steal the pending tasks from its queue.
|
||||||
|
# If all threads are blocked on IO, the threadpool will not make any progress and becomes soft-locked.
|
||||||
|
|
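# Illustrative usage sketch (not part of the original file), based on the API
# defined below (Taskpool.new, spawn, Flowvar sync, syncAll, shutdown):
#
#   proc fib(n: int): int =
#     if n < 2: n
#     else: fib(n-1) + fib(n-2)
#
#   var tp = Taskpool.new(numThreads = 4)
#   let fv = tp.spawn fib(25)   # returns a Flowvar[int]
#   echo sync(fv)               # blocks until the result is available
#   tp.syncAll()                # waits for all remaining tasks
#   tp.shutdown()               # joins and frees the worker threads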
||||||
|
{.push raises: [].}
|
||||||
|
|
||||||
|
import
|
||||||
|
system/ansi_c,
|
||||||
|
std/[random, cpuinfo, atomics, macros],
|
||||||
|
./channels_spsc_single,
|
||||||
|
./chase_lev_deques,
|
||||||
|
./event_notifiers,
|
||||||
|
./primitives/barriers,
|
||||||
|
./instrumentation/[contracts, loggers],
|
||||||
|
./sparsesets,
|
||||||
|
./flowvars,
|
||||||
|
./ast_utils
|
||||||
|
|
||||||
|
export
|
||||||
|
# flowvars
|
||||||
|
Flowvar, isSpawned, isReady, sync
|
||||||
|
|
||||||
|
when defined(windows):
|
||||||
|
import ./primitives/affinity_windows
|
||||||
|
else:
|
||||||
|
import ./primitives/affinity_posix
|
||||||
|
|
||||||
|
when (NimMajor,NimMinor,NimPatch) >= (1,6,0):
|
||||||
|
import std/tasks
|
||||||
|
else:
|
||||||
|
import ./shims_pre_1_6/tasks
|
||||||
|
|
||||||
|
type
|
||||||
|
WorkerID = int32
|
||||||
|
|
||||||
|
TaskNode = ptr object
|
||||||
|
# Linked list of tasks
|
||||||
|
parent: TaskNode
|
||||||
|
task: Task
|
||||||
|
|
||||||
|
Signal = object
|
||||||
|
terminate {.align: 64.}: Atomic[bool]
|
||||||
|
|
||||||
|
WorkerContext = object
|
||||||
|
## Thread-local worker context
|
||||||
|
|
||||||
|
# Params
|
||||||
|
id: WorkerID
|
||||||
|
taskpool: Taskpool
|
||||||
|
|
||||||
|
# Tasks
|
||||||
|
taskDeque: ptr ChaseLevDeque[TaskNode] # owned task deque
|
||||||
|
currentTask: TaskNode
|
||||||
|
|
||||||
|
# Synchronization
|
||||||
|
eventNotifier: ptr EventNotifier # shared event notifier
|
||||||
|
signal: ptr Signal # owned signal
|
||||||
|
|
||||||
|
# Thefts
|
||||||
|
rng: Rand # RNG state to select victims
|
||||||
|
numThreads: int
|
||||||
|
otherDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
|
||||||
|
victims: SparseSet
|
||||||
|
|
||||||
|
Taskpool* = ptr object
|
||||||
|
barrier: SyncBarrier
|
||||||
|
## Barrier for initialization and teardown
|
||||||
|
eventNotifier: EventNotifier
|
||||||
|
## Puts thread to sleep
|
||||||
|
|
||||||
|
numThreads{.align: 64.}: int
|
||||||
|
workerDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
|
||||||
|
## Direct access for task stealing
|
||||||
|
workers: ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]
|
||||||
|
workerSignals: ptr UncheckedArray[Signal]
|
||||||
|
## Access to each worker's termination signal
|
||||||
|
|
||||||
|
# Thread-local config
|
||||||
|
# ---------------------------------------------
|
||||||
|
|
||||||
|
var workerContext {.threadvar.}: WorkerContext
|
||||||
|
## Thread-local Worker context
|
||||||
|
|
||||||
|
proc setupWorker() =
|
||||||
|
## Initialize the thread-local context of a worker
|
||||||
|
## Requires the ID and taskpool fields to be initialized
|
||||||
|
template ctx: untyped = workerContext
|
||||||
|
|
||||||
|
preCondition: not ctx.taskpool.isNil()
|
||||||
|
preCondition: 0 <= ctx.id and ctx.id < ctx.taskpool.numThreads
|
||||||
|
preCondition: not ctx.taskpool.workerDeques.isNil()
|
||||||
|
preCondition: not ctx.taskpool.workerSignals.isNil()
|
||||||
|
|
||||||
|
# Thefts
|
||||||
|
ctx.rng = initRand(0xEFFACED + ctx.id)
|
||||||
|
ctx.numThreads = ctx.taskpool.numThreads
|
||||||
|
ctx.otherDeques = ctx.taskpool.workerDeques
|
||||||
|
ctx.victims.allocate(ctx.taskpool.numThreads)
|
||||||
|
|
||||||
|
# Synchronization
|
||||||
|
ctx.eventNotifier = addr ctx.taskpool.eventNotifier
|
||||||
|
ctx.signal = addr ctx.taskpool.workerSignals[ctx.id]
|
||||||
|
ctx.signal.terminate.store(false, moRelaxed)
|
||||||
|
|
||||||
|
# Tasks
|
||||||
|
ctx.taskDeque = addr ctx.taskpool.workerDeques[ctx.id]
|
||||||
|
ctx.currentTask = nil
|
||||||
|
|
||||||
|
# Init
|
||||||
|
ctx.taskDeque[].init()
|
||||||
|
|
||||||
|
proc teardownWorker() =
|
||||||
|
## Cleanup the thread-local context of a worker
|
||||||
|
template ctx: untyped = workerContext
|
||||||
|
ctx.taskDeque[].teardown()
|
||||||
|
ctx.victims.delete()
|
||||||
|
|
||||||
|
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].}
|
||||||
|
|
||||||
|
proc workerEntryFn(params: tuple[taskpool: Taskpool, id: WorkerID])
|
||||||
|
{.raises: [Exception].} =
|
||||||
|
## On taskpool startup, worker threads execute this loop
|
||||||
|
## until they receive a termination signal
|
||||||
|
# We assume that thread-local variables all start at their binary zero value
|
||||||
|
preCondition: workerContext == default(WorkerContext)
|
||||||
|
|
||||||
|
template ctx: untyped = workerContext
|
||||||
|
|
||||||
|
# If the following crashes, you need --tlsEmulation:off
|
||||||
|
ctx.id = params.id
|
||||||
|
ctx.taskpool = params.taskpool
|
||||||
|
|
||||||
|
setupWorker()
|
||||||
|
|
||||||
|
# 1 matching barrier in Taskpool.new() for root thread
|
||||||
|
discard params.taskpool.barrier.wait()
|
||||||
|
|
||||||
|
{.gcsafe.}: # Not GC-safe when multi-threaded due to thread-local variables
|
||||||
|
ctx.eventLoop()
|
||||||
|
|
||||||
|
debugTermination:
|
||||||
|
log(">>> Worker %2d shutting down <<<\n", ctx.id)
|
||||||
|
|
||||||
|
# 1 matching barrier in taskpool.shutdown() for root thread
|
||||||
|
discard params.taskpool.barrier.wait()
|
||||||
|
|
||||||
|
teardownWorker()
|
||||||
|
|
||||||
|
# Tasks
|
||||||
|
# ---------------------------------------------
|
||||||
|
|
||||||
|
proc new(T: type TaskNode, parent: TaskNode, task: sink Task): T =
|
||||||
|
type TaskNodeObj = typeof(default(T)[])
|
||||||
|
var tn = cast[TaskNode](c_calloc(1, csize_t sizeof(TaskNodeObj)))
|
||||||
|
tn.parent = parent
|
||||||
|
tn.task = task
|
||||||
|
return tn
|
||||||
|
|
||||||
|
proc runTask(tn: var TaskNode) {.raises:[Exception], inline.} =
|
||||||
|
## Runs the task and consumes the taskNode
|
||||||
|
tn.task.invoke()
|
||||||
|
tn.c_free()
|
||||||
|
|
||||||
|
proc schedule(ctx: WorkerContext, tn: sink TaskNode) {.inline.} =
|
||||||
|
## Schedule a task in the taskpool
|
||||||
|
debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
|
||||||
|
ctx.taskDeque[].push(tn)
|
||||||
|
ctx.taskpool.eventNotifier.notify()
|
||||||
|
|
||||||
|
# Scheduler
|
||||||
|
# ---------------------------------------------
|
||||||
|
|
||||||
|
proc trySteal(ctx: var WorkerContext): TaskNode =
|
||||||
|
## Try to steal a task.
|
||||||
|
|
||||||
|
ctx.victims.refill()
|
||||||
|
ctx.victims.excl(ctx.id)
|
||||||
|
|
||||||
|
while not ctx.victims.isEmpty():
|
||||||
|
let target = ctx.victims.randomPick(ctx.rng)
|
||||||
|
|
||||||
|
let stolenTask = ctx.otherDeques[target].steal()
|
||||||
|
if not stolenTask.isNil:
|
||||||
|
return stolenTask
|
||||||
|
|
||||||
|
ctx.victims.excl(target)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
|
||||||
|
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} =
|
||||||
|
## Each worker thread executes this loop over and over.
|
||||||
|
while not ctx.signal.terminate.load(moRelaxed):
|
||||||
|
# 1. Pick from local deque
|
||||||
|
debug: log("Worker %2d: eventLoop 1 - searching task from local deque\n", ctx.id)
|
||||||
|
while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
|
||||||
|
debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
|
||||||
|
taskNode.runTask()
|
||||||
|
|
||||||
|
# 2. Run out of tasks, become a thief
|
||||||
|
debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
|
||||||
|
var stolenTask = ctx.trySteal()
|
||||||
|
if not stolenTask.isNil:
|
||||||
|
# 2.a Run task
|
||||||
|
debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
|
||||||
|
stolenTask.runTask()
|
||||||
|
else:
|
||||||
|
# 2.b Park the thread until a new task enters the taskpool
|
||||||
|
debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
|
||||||
|
ctx.eventNotifier[].park()
|
||||||
|
debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
|
||||||
|
|
||||||
|
# Tasking
|
||||||
|
# ---------------------------------------------
|
||||||
|
|
||||||
|
const RootTask = default(Task) # TODO: sentinel value different from null task
|
||||||
|
|
||||||
|
template isRootTask(task: Task): bool =
|
||||||
|
task == RootTask
|
||||||
|
|
||||||
|
proc forceFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} =
|
||||||
|
## Eagerly complete an awaited FlowVar
|
||||||
|
|
||||||
|
template ctx: untyped = workerContext
|
||||||
|
|
||||||
|
template isFutReady(): untyped =
|
||||||
|
fv.chan[].tryRecv(parentResult)
|
||||||
|
|
||||||
|
if isFutReady():
|
||||||
|
return
|
||||||
|
|
||||||
|
## 1. Process all the children of the current task.
|
||||||
|
## This ensures that we can give control back ASAP.
|
||||||
|
debug: log("Worker %2d: sync 1 - searching task from local deque\n", ctx.id)
|
||||||
|
while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
|
||||||
|
if taskNode.parent != ctx.currentTask:
|
||||||
|
debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
|
||||||
|
ctx.schedule(taskNode)
|
||||||
|
break
|
||||||
|
debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
|
||||||
|
taskNode.runTask()
|
||||||
|
if isFutReady():
|
||||||
|
debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
|
||||||
|
return
|
||||||
|
|
||||||
|
## 2. We ran out of tasks, or out of direct children of our awaited task.
|
||||||
|
## The awaited task is therefore bottlenecked by dependencies in other threads,
|
||||||
|
## hence we abandon our enqueued work and steal from the other workers' queues
|
||||||
|
## in the hope that this advances our awaited task. This prioritizes latency over throughput.
|
||||||
|
debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
|
||||||
|
while not isFutReady():
|
||||||
|
var taskNode = ctx.trySteal()
|
||||||
|
|
||||||
|
if not taskNode.isNil:
|
||||||
|
# We stole a task; hopefully it advances our awaited task
|
||||||
|
debug: log("Worker %2d: sync 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
|
||||||
|
taskNode.runTask()
|
||||||
|
# elif (taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
|
||||||
|
# # We advance our own queue, this increases throughput but may impact latency on the awaited task
|
||||||
|
# debug: log("Worker %2d: sync 2.2 - couldn't steal, running own task\n", ctx.id)
|
||||||
|
# taskNode.runTask()
|
||||||
|
else:
|
||||||
|
# We don't park, as there is no notification for task completion
|
||||||
|
cpuRelax()
|
||||||
|
|
||||||
|
proc syncAll*(pool: Taskpool) {.raises: [Exception].} =
|
||||||
|
## Blocks until all pending tasks are completed
|
||||||
|
## This MUST only be called from
|
||||||
|
## the root scope that created the taskpool
|
||||||
|
template ctx: untyped = workerContext
|
||||||
|
|
||||||
|
debugTermination:
|
||||||
|
log(">>> Worker %2d enters barrier <<<\n", ctx.id)
|
||||||
|
|
||||||
|
preCondition: ctx.id == 0
|
||||||
|
preCondition: ctx.currentTask.task.isRootTask()
|
||||||
|
|
||||||
|
# Empty all tasks
|
||||||
|
var foreignThreadsParked = false
|
||||||
|
while not foreignThreadsParked:
|
||||||
|
# 1. Empty local tasks
|
||||||
|
debug: log("Worker %2d: syncAll 1 - searching task from local deque\n", ctx.id)
|
||||||
|
while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
|
||||||
|
debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
|
||||||
|
taskNode.runTask()
|
||||||
|
|
||||||
|
if ctx.numThreads == 1 or foreignThreadsParked:
|
||||||
|
break
|
||||||
|
|
||||||
|
# 2. Help other threads
|
||||||
|
debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
|
||||||
|
var taskNode = ctx.trySteal()
|
||||||
|
|
||||||
|
if not taskNode.isNil:
|
||||||
|
# 2.1 We stole some task
|
||||||
|
debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
|
||||||
|
taskNode.runTask()
|
||||||
|
else:
|
||||||
|
# 2.2 No task to steal
|
||||||
|
if pool.eventNotifier.getParked() == pool.numThreads - 1:
|
||||||
|
# 2.2.1 all threads besides the current are parked
|
||||||
|
debugTermination:
|
||||||
|
log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
|
||||||
|
foreignThreadsParked = true
|
||||||
|
else:
|
||||||
|
# 2.2.2 We don't park, as there is no notification for task completion
|
||||||
|
cpuRelax()
|
||||||
|
|
||||||
|
debugTermination:
|
||||||
|
log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
|
||||||
|
|
||||||
|
# Runtime
|
||||||
|
# ---------------------------------------------
|
||||||
|
|
||||||
|
proc new*(T: type Taskpool, numThreads = countProcessors()): T {.raises: [Exception].} =
|
||||||
|
## Initialize a threadpool that manages `numThreads` threads.
|
||||||
|
## Defaults to the number of logical processors available.
|
||||||
|
|
||||||
|
var tp = cast[T](c_calloc(1, csize_t sizeof(default(Taskpool)[])))
|
||||||
|
|
||||||
|
tp.barrier.init(numThreads.int32)
|
||||||
|
tp.eventNotifier.initialize()
|
||||||
|
tp.numThreads = numThreads
|
||||||
|
tp.workerDeques = cast[ptr UncheckedArray[ChaseLevDeque[TaskNode]]](c_calloc(csize_t numThreads, csize_t sizeof ChaseLevDeque[TaskNode]))
|
||||||
|
tp.workers = cast[ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]](c_calloc(csize_t numThreads, csize_t sizeof Thread[(Taskpool, WorkerID)]))
|
||||||
|
tp.workerSignals = cast[ptr UncheckedArray[Signal]](c_calloc(csize_t numThreads, csize_t sizeof Signal))
|
||||||
|
|
||||||
|
# Setup master thread
|
||||||
|
workerContext.id = 0
|
||||||
|
workerContext.taskpool = tp
|
||||||
|
when not(defined(cpp) and defined(vcc)):
|
||||||
|
# TODO: Nim casts between Windows Handles but that requires reinterpret cast for C++
|
||||||
|
pinToCpu(0)
|
||||||
|
|
||||||
|
# Start worker threads
|
||||||
|
for i in 1 ..< numThreads:
|
||||||
|
createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
|
||||||
|
# TODO: we might want to take into account Hyper-Threading (HT)
|
||||||
|
# and allow spawning tasks and pinning to cores that are not HT-siblings.
|
||||||
|
# This is important for memory-bound workloads (like copy, addition, ...)
|
||||||
|
# where both sibling cores will compete for L1 and L2 cache, effectively
|
||||||
|
# halving the memory bandwidth or worse, flushing what the other put in cache.
|
||||||
|
# Note that while 2x siblings is common, Xeon Phi has 4x Hyper-Threading.
|
||||||
|
when not(defined(cpp) and defined(vcc)):
|
||||||
|
# TODO: Nim casts between Windows Handles but that requires reinterpret cast for C++
|
||||||
|
pinToCpu(tp.workers[i], i)
|
||||||
|
|
||||||
|
# Root worker
|
||||||
|
setupWorker()
|
||||||
|
|
||||||
|
# Root task: a sentinel task that is never actually run.
|
||||||
|
workerContext.currentTask = TaskNode.new(
|
||||||
|
parent = nil,
|
||||||
|
task = default(Task) # TODO RootTask, somehow this uses `=copy`
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wait for the child threads
|
||||||
|
discard tp.barrier.wait()
|
||||||
|
return tp
|
||||||
|
|
||||||
|
proc cleanup(tp: var Taskpool) {.raises: [OSError].} =
|
||||||
|
## Cleanup all resources allocated by the taskpool
|
||||||
|
preCondition: workerContext.currentTask.task.isRootTask()
|
||||||
|
|
||||||
|
for i in 1 ..< tp.numThreads:
|
||||||
|
joinThread(tp.workers[i])
|
||||||
|
|
||||||
|
tp.workerSignals.c_free()
|
||||||
|
tp.workers.c_free()
|
||||||
|
tp.workerDeques.c_free()
|
||||||
|
`=destroy`(tp.eventNotifier)
|
||||||
|
tp.barrier.delete()
|
||||||
|
|
||||||
|
tp.c_free()
|
||||||
|
|
||||||
|
proc shutdown*(tp: var Taskpool) {.raises:[Exception].} =
|
||||||
|
## Wait until all tasks are processed and then shutdown the taskpool
|
||||||
|
preCondition: workerContext.currentTask.task.isRootTask()
|
||||||
|
tp.syncAll()
|
||||||
|
|
||||||
|
# Signal termination to all threads
|
||||||
|
for i in 0 ..< tp.numThreads:
|
||||||
|
tp.workerSignals[i].terminate.store(true, moRelaxed)
|
||||||
|
|
||||||
|
let parked = tp.eventNotifier.getParked()
|
||||||
|
for i in 0 ..< parked:
|
||||||
|
tp.eventNotifier.notify()
|
||||||
|
|
||||||
|
# 1 matching barrier in worker_entry_fn
|
||||||
|
discard tp.barrier.wait()
|
||||||
|
|
||||||
|
teardownWorker()
|
||||||
|
tp.cleanup()
|
||||||
|
|
||||||
|
# Dealloc dummy task
|
||||||
|
workerContext.currentTask.c_free()
|
||||||
|
|
||||||
|
# Task parallelism
|
||||||
|
# ---------------------------------------------
|
||||||
|
{.pop.} # raises:[]
|
||||||
|
|
||||||
|
macro spawn*(tp: Taskpool, fnCall: typed): untyped =
|
||||||
|
## Spawns the input function call asynchronously, potentially on another thread of execution.
|
||||||
|
##
|
||||||
|
## If the function call returns a result, spawn will wrap it in a Flowvar.
|
||||||
|
## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
|
||||||
|
## You can use `isReady` to check if the result is available and whether a subsequent
|
||||||
|
## `sync` would return immediately.
|
||||||
|
##
|
||||||
|
## Tasks are processed approximately in Last-In-First-Out (LIFO) order
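# Illustrative usage (sketch, not part of the original; `tp` is a Taskpool
# created with Taskpool.new()):
#   proc double(x: int): int = 2*x
#   let fv = tp.spawn double(21)
#   if not fv.isReady():
#     discard   # result not produced yet; sync(fv) will steal/wait until it is available
#   assert sync(fv) == 42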
|
||||||
|
result = newStmtList()
|
||||||
|
|
||||||
|
let fn = fnCall[0]
|
||||||
|
let fnName = $fn
|
||||||
|
|
||||||
|
# Get the return type if any
|
||||||
|
let retType = fnCall[0].getImpl[3][0]
|
||||||
|
let needFuture = retType.kind != nnkEmpty
|
||||||
|
|
||||||
|
# Package in a task
|
||||||
|
let taskNode = ident("taskNode")
|
||||||
|
let task = ident("task")
|
||||||
|
if not needFuture:
|
||||||
|
result.add quote do:
|
||||||
|
let `task` = toTask(`fnCall`)
|
||||||
|
let `taskNode` = TaskNode.new(workerContext.currentTask, `task`)
|
||||||
|
schedule(workerContext, `taskNode`)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Tasks themselves have no return value, so:
|
||||||
|
# 1. We create a channel/flowvar to transmit the return value to awaiter/sync
|
||||||
|
# 2. We create a wrapper async_fn without return value that sends the return value through the channel
|
||||||
|
# 3. We package that wrapper function in a task
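#
# Illustrative sketch (not part of the original): for `tp.spawn foo(x)` with
# `proc foo(x: int): int`, the code below roughly produces (gensym'd names simplified):
#
#   block:
#     let fut = newFlowVar(type int)
#     proc taskpool_foo(x: int, fut: Flowvar[int]) {.nimcall.} =
#       let res = foo(x)
#       readyWith(fut, res)
#     let task = toTask(taskpool_foo(x, fut))
#     let taskNode = TaskNode.new(workerContext.currentTask, task)
#     schedule(workerContext, taskNode)
#     fut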
|
||||||
|
|
||||||
|
# 1. Create the channel
|
||||||
|
let fut = ident("fut")
|
||||||
|
let futTy = nnkBracketExpr.newTree(
|
||||||
|
bindSym"FlowVar",
|
||||||
|
retType
|
||||||
|
)
|
||||||
|
result.add quote do:
|
||||||
|
let `fut` = newFlowVar(type `retType`)
|
||||||
|
|
||||||
|
# 2. Create a wrapper function that sends the result to the channel
|
||||||
|
# TODO, upstream "getImpl" doesn't return the generic params
|
||||||
|
let genericParams = fn.getImpl()[2].replaceSymsByIdents()
|
||||||
|
let formalParams = fn.getImpl()[3].replaceSymsByIdents()
|
||||||
|
|
||||||
|
var asyncParams = nnkFormalParams.newTree(
|
||||||
|
newEmptyNode()
|
||||||
|
)
|
||||||
|
var fnCallIdents = nnkCall.newTree(
|
||||||
|
fnCall[0]
|
||||||
|
)
|
||||||
|
for i in 1 ..< formalParams.len:
|
||||||
|
let ident = formalParams[i].replaceSymsByIdents()
|
||||||
|
asyncParams.add ident
|
||||||
|
for j in 0 ..< ident.len - 2:
|
||||||
|
# Handle "a, b: int"
|
||||||
|
fnCallIdents.add ident[j]
|
||||||
|
|
||||||
|
let futFnParam = ident("fut")
|
||||||
|
asyncParams.add newIdentDefs(futFnParam, futTy)
|
||||||
|
|
||||||
|
let asyncBody = quote do:
|
||||||
|
# XXX: can't test that when the RootTask is default(Task) instead of a sentinel value
|
||||||
|
# preCondition: not isRootTask(workerContext.currentTask.task)
|
||||||
|
|
||||||
|
let res = `fnCallIdents`
|
||||||
|
readyWith(`futFnParam`, res)
|
||||||
|
|
||||||
|
let asyncFn = ident("taskpool_" & fnName)
|
||||||
|
result.add nnkProcDef.newTree(
|
||||||
|
asyncFn,
|
||||||
|
newEmptyNode(),
|
||||||
|
genericParams,
|
||||||
|
asyncParams,
|
||||||
|
nnkPragma.newTree(ident("nimcall")),
|
||||||
|
newEmptyNode(),
|
||||||
|
asyncBody
|
||||||
|
)
|
||||||
|
|
||||||
|
var asyncCall = newCall(asyncFn)
|
||||||
|
for i in 1 ..< fnCall.len:
|
||||||
|
asyncCall.add fnCall[i].replaceSymsByIdents()
|
||||||
|
asyncCall.add fut
|
||||||
|
|
||||||
|
result.add quote do:
|
||||||
|
let `task` = toTask(`asyncCall`)
|
||||||
|
let `taskNode` = TaskNode.new(workerContext.currentTask, `task`)
|
||||||
|
schedule(workerContext, `taskNode`)
|
||||||
|
|
||||||
|
# Return the future / flowvar
|
||||||
|
`fut`
|
||||||
|
|
||||||
|
# Wrap in a block for namespacing
|
||||||
|
result = nnkBlockStmt.newTree(newEmptyNode(), result)
|
||||||
|
# Print the generated code only when compiling with -d:nimTasksDebug
when defined(nimTasksDebug):
  echo result.toStrLit()
|