initial commit
This commit is contained in:
commit
216aabe629
|
@ -0,0 +1,4 @@
|
|||
nimcache/
|
||||
|
||||
# Executables shall be put in an ignored build/ directory
|
||||
build/
|
|
@ -0,0 +1,35 @@
|
|||
# Taskpools
|
||||
|
||||
## API
|
||||
|
||||
The API spec follows https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api
|
||||
|
||||
## Overview
|
||||
|
||||
This implements a lightweight, energy-efficient, easily auditable multithreaded taskpool.
|
||||
|
||||
This taskpool will be used in a highly security-sensitive blockchain application
|
||||
targeted at resource-restricted devices, hence the desirable properties are:
|
||||
|
||||
- Ease of auditing and maintenance.
|
||||
- Formally verified synchronization primitives are highly-sought after.
|
||||
- Otherwise primitives are implemented from papers or ported from proven codebases
|
||||
that can serve as reference for auditors.
|
||||
- Resource-efficient. Threads spin down to save power, low memory use.
|
||||
- Decent performance and scalability. The workloads to parallelize are cryptography-related
|
||||
and require at least 1ms runtime per thread.
|
||||
This means that only a simple scheduler is required.
|
||||
|
||||
Non-goals:
|
||||
- Supporting task priorities
|
||||
- Being distributed
|
||||
- Supporting GC-ed memory on Nim default GC (sequences and strings)
|
||||
- Having async-awaitable tasks
|
||||
|
||||
In particular compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs:
|
||||
- Taskpools only provide spawn/sync (task parallelism).\
|
||||
There is no parallel for (data parallelism)\
|
||||
or precise in/out dependencies (dataflow parallelism).
|
||||
- Weave can handle trillions of small tasks that require only 10µs per task. (Load Balancing overhead)
|
||||
- Weave maintains an adaptive memory pool to reduce memory allocation overhead,
|
||||
whereas Taskpools allocates memory as needed. (Scheduler overhead)
|
|
@ -0,0 +1,11 @@
|
|||
# BPC (Bouncing Producer-Consumer)
|
||||
|
||||
From [tasking-2.0](https://github.com/aprell/tasking-2.0) description
|
||||
|
||||
> **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far
|
||||
> as I know, first described by [Dinan et al][1]. There are two types of
|
||||
> tasks, producer and consumer tasks. Each producer task creates another
|
||||
> producer task followed by *n* consumer tasks, until a certain depth *d* is
|
||||
> reached. Consumer tasks run for *t* microseconds. The smaller the values of
|
||||
> *n* and *t*, the harder it becomes to exploit the available parallelism. A
|
||||
> solid contender for the most antagonistic microbenchmark.
|
|
@ -0,0 +1,156 @@
|
|||
import
|
||||
# STD lib
|
||||
os, strutils, system/ansi_c, cpuinfo, strformat, math,
|
||||
# Library
|
||||
../../taskpools,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
# Benchmark configuration, filled in by `main` from the command line.
var
  # Producer recursion depth, for example 10000
  Depth: int32
  # Consumer tasks spawned by each producer, for example 9
  NumTasksPerDepth: int32
  # Total number of tasks in the BPC benchmark:
  # (NumTasksPerDepth + 1) * Depth
  NumTasksTotal: int32
  # Run time of one consumer task, in microseconds
  TaskGranularity: int32
  # Polling interval, in microseconds
  PollInterval: float64

  # Taskpool on which every task of the benchmark is spawned
  tp: Taskpool

# Per-thread polling deadline, reset at the start of each consumer task
var global_poll_elapsed {.threadvar.}: float64
|
||||
|
||||
template dummy_cpt(): untyped =
  ## Dummy unit of work: computes fib(30) iteratively and drops the result.
  var
    prev = 0
    curr = 1
    next = 0
  for _ in 2 .. 30:
    next = curr + prev
    prev = curr
    curr = next
|
||||
|
||||
proc bpc_consume(usec: int32) =
  ## Consumer task: repeats `dummy_cpt` until `usec` have elapsed, as
  ## measured by `wtime_usec`, not counting time attributed to polling.
  ##
  ## Polling itself is disabled (see the commented-out block), so
  ## `pollElapsed` stays at zero; the per-thread polling deadline is still
  ## reset to `PollInterval` on entry.
  var pollElapsed = 0'f64

  let
    began = wtime_usec()
    budget = float64(usec)
  global_poll_elapsed = PollInterval

  while wtime_usec() - began - pollElapsed < budget:
    dummy_cpt()

    # if elapsed >= global_poll_elapsed:
    #   let pollStart = wtime_usec()
    #   loadBalance(Weave)
    #   pollElapsed += wtime_usec() - pollStart
    #   global_poll_elapsed += PollInterval
|
||||
|
||||
proc bpc_consume_nopoll(usec: int32) =
  ## Consumer task without any polling bookkeeping: repeats `dummy_cpt`
  ## until `usec` have elapsed, as measured by `wtime_usec`.
  let
    began = wtime_usec()
    budget = float64(usec)

  while wtime_usec() - began < budget:
    dummy_cpt()
|
||||
|
||||
proc bpc_produce(n, d: int32) =
  ## Producer task: while depth `d` is positive, spawns the next producer
  ## (with depth `d - 1`) and then `n` consumer tasks of `TaskGranularity`
  ## microseconds each. Does nothing once `d` reaches zero.
  if d <= 0:
    return

  # The next producer is spawned before this level's consumers
  tp.spawn bpc_produce(n, d - 1)

  for _ in 0 ..< n:
    tp.spawn bpc_consume(TaskGranularity)
|
||||
|
||||
proc main() =
  ## Runs the BPC benchmark on a `Taskpool` and prints timing and
  ## resource-usage statistics.
  ##
  ## Command line: `<depth> <tasks per depth> [task granularity (us)]
  ## [polling interval (us)]`. Missing arguments keep the defaults assigned
  ## below; the polling interval defaults to the task granularity. More than
  ## four arguments prints the usage line and exits with status 1.
  ## The thread count is read from `TASKPOOL_NUM_THREADS` when that
  ## environment variable exists, otherwise `countProcessors()` is used.
  Depth = 10000
  NumTasksPerDepth = 999
  TaskGranularity = 1
  # Assigned up front so the "default config" line reports the effective
  # polling interval instead of the zero-initialized global.
  PollInterval = TaskGranularity.float64

  let
    argCount = paramCount()
    exeName = getAppFilename().extractFilename()
    # Built before any argument is parsed, so it always shows the defaults.
    usage = &"Usage: {exeName} <depth: {Depth}> " &
            &"<# of tasks per depth: {NumTasksPerDepth}> " &
            &"[task granularity (us): {TaskGranularity}] " &
            &"[polling interval (us): task granularity]"

  if argCount > 4:
    # Rejected before parsing: a bad invocation gets the usage text rather
    # than a parse error on one of its arguments.
    echo usage
    quit 1

  if argCount == 0:
    echo usage
    echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
  if argCount >= 1:
    Depth = paramStr(1).parseInt.int32
  if argCount >= 2:
    NumTasksPerDepth = paramStr(2).parseInt.int32
  if argCount >= 3:
    TaskGranularity = paramStr(3).parseInt.int32
  PollInterval =
    if argCount == 4: paramStr(4).parseInt.float64
    else: TaskGranularity.float64

  NumTasksTotal = (NumTasksPerDepth + 1) * Depth

  let nthreads =
    if existsEnv"TASKPOOL_NUM_THREADS": getEnv"TASKPOOL_NUM_THREADS".parseInt()
    else: countProcessors()

  tp = Taskpool.new(numThreads = nthreads)

  # measure overhead during tasking
  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  let start = wtime_msec()

  bpc_produce(NumTasksPerDepth, Depth)
  tp.syncAll()

  let stop = wtime_msec()

  # Report the growth in max RSS and minor page faults since the first sample
  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss
  flt = ru.ru_minflt - flt

  tp.shutdown()

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Taskpool"
  echo "Benchmark: BPC (Bouncing Producer-Consumer)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", round(stop - start, 3)
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"
  echo "# of tasks: ", NumTasksTotal
  echo "# of tasks/depth: ", NumTasksPerDepth
  echo "Depth: ", Depth
  echo "Task granularity (us): ", TaskGranularity
  echo "Polling / manual load balancing interval (us): ", PollInterval

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,85 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, strformat, os, strutils, cpuinfo,
|
||||
# Weave
|
||||
../../weave
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime
|
||||
|
||||
proc dfs(depth, breadth: int): uint32 =
  ## Counts the leaves of a tree of the given `depth` in which every inner
  ## node has `breadth` children. Each child subtree is spawned as its own
  ## task and the partial counts are summed once synced.
  if depth == 0:
    return 1

  # We could use alloca to avoid heap allocation here
  var children = newSeq[Flowvar[uint32]](breadth)

  for slot in 0 .. children.high:
    children[slot] = spawn dfs(depth - 1, breadth)

  for slot in 0 .. children.high:
    result += sync(children[slot])
|
||||
|
||||
proc test(depth, breadth: int): uint32 =
  ## Spawns the root `dfs` task and waits for its result.
  let root = spawn dfs(depth, breadth)
  result = sync(root)
|
||||
|
||||
proc main() =
  ## Runs the dfs benchmark on Weave and prints the result.
  ##
  ## Command line: `[depth] [breadth]`, both defaulting to 8; more than two
  ## arguments prints the usage and exits with status 1. The reported thread
  ## count comes from `WEAVE_NUM_THREADS` when that variable exists,
  ## otherwise from `countProcessors()`. Timing covers runtime init and exit
  ## and is skipped on Windows.
  var
    depth = 8
    breadth = 8
    answer: uint32

  let nthreads =
    if existsEnv"WEAVE_NUM_THREADS": getEnv"WEAVE_NUM_THREADS".parseInt()
    else: countProcessors()

  let argCount = paramCount()

  if argCount == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
    echo &"Running with default config depth = {depth} and breadth = {breadth}"

  if argCount >= 1:
    depth = paramStr(1).parseInt()
  if argCount == 2:
    breadth = paramStr(2).parseInt()
  if argCount > 2:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
    echo &"Up to 2 parameters are valid. Received {paramCount()}"
    quit 1

  # Staccato benches runtime init and exit as well
  when not defined(windows):
    let start = wtime_usec()

  init(Weave)
  answer = test(depth, breadth)
  exit(Weave)

  when not defined(windows):
    let stop = wtime_usec()

  const config =
    if defined(WV_LazyFlowvar): " (lazy flowvars)"
    else: " (eager flowvars)"

  echo "Scheduler: Weave", config
  echo "Benchmark: dfs"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(us) ", stop - start
  echo "Output: ", answer

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,300 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# From fibril
|
||||
#
|
||||
# Original license
|
||||
#
|
||||
# /*
|
||||
# * Heat diffusion (Jacobi-type iteration)
|
||||
# *
|
||||
# * Volker Strumpen, Boston August 1996
|
||||
# *
|
||||
# * Copyright (c) 1996 Massachusetts Institute of Technology
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either version 2 of the License, or
|
||||
# * (at your option) any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful,
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with this program; if not, write to the Free Software
|
||||
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
strformat, os, strutils, math, system/ansi_c,
|
||||
cpuinfo, threadpool,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
# This deadlocks :/
|
||||
|
||||
# Helpers
|
||||
# -------------------------------------------------------
|
||||
|
||||
# We need a thin wrapper around raw pointers for matrices,
|
||||
# we can't pass "var seq[seq[float64]]" to other threads
|
||||
# nor "var" for that matter
|
||||
type
  Matrix[T] = object
    ## Thin wrapper around a raw buffer holding an `m` x `n` matrix in
    ## row-major order.
    buffer: ptr UncheckedArray[T]
    m: int
    n: int

type
  Row[T] = object
    ## View over `len` consecutive elements of a buffer.
    buffer: ptr UncheckedArray[T]
    len: int
|
||||
|
||||
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
  ## Allocates an uninitialized `m` x `n` matrix with `c_malloc`.
  ## The buffer must be released with `delete`.
  let bytes = csize_t(m * n * sizeof(T))
  result = Matrix[T](
    buffer: cast[ptr UncheckedArray[T]](c_malloc(bytes)),
    m: m,
    n: n
  )
|
||||
|
||||
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  ## Reads the element at (`row`, `col`); storage is row-major.
  ## Bounds are only checked through `assert`.
  assert(row < mat.m)
  assert(col < mat.n)
  mat.buffer[col + row * mat.n]

template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  ## Writes `value` at (`row`, `col`); storage is row-major.
  ## Bounds are only checked through `assert`.
  assert(row < mat.m)
  assert(col < mat.n)
  mat.buffer[col + row * mat.n] = value
|
||||
|
||||
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a view over row `rowIdx` of `mat`. The view aliases the
  ## matrix buffer; nothing is copied.
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # A row holds one element per column, so its length is `n` (not `m`);
  # this is the bound the Row accessors assert against.
  result.len = mat.n
|
||||
|
||||
template `[]`[T](row: Row[T], idx: Natural): T =
  ## Reads element `idx` of the row view (bounds checked by `assert` only).
  assert(idx < row.len)
  row.buffer[idx]

template `[]=`[T](row: Row[T], idx: Natural, value: T) =
  ## Writes `value` to element `idx` of the row view (bounds checked by
  ## `assert` only).
  assert(idx < row.len)
  row.buffer[idx] = value
|
||||
|
||||
func delete[T](mat: sink Matrix[T]) =
  ## Releases the matrix buffer with `c_free`.
  mat.buffer.c_free()
|
||||
|
||||
# And an auto converter for int32 -> float64 so we don't have to convert
|
||||
# all i, j indices manually
|
||||
|
||||
converter i32toF64(x: int32): float64 {.inline.} =
  ## Implicit int32 -> float64 conversion, used for the grid indices.
  result = x.float64
|
||||
|
||||
# -------------------------------------------------------
|
||||
|
||||
template f(x, y: SomeFloat): SomeFloat =
  ## Value used for interior cells by `heat`: sin(x) * sin(y).
  sin(x) * sin(y)

template randa[T: SomeFloat](x, t: T): T =
  ## Boundary value written to the first column: always zero.
  T(0.0)

proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
  ## Boundary value written to the last column: exp(-2t) * sin(x).
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  result = exp(-2 * t) * sin(x)

template randc[T: SomeFloat](y, t: T): T =
  ## Boundary value written to the first row: always zero.
  T(0.0)

proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
  ## Boundary value written to the last row: exp(-2t) * sin(y).
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  result = exp(-2 * t) * sin(y)

template solu(x, y, t: SomeFloat): SomeFloat =
  ## Reference solution that `verify` compares against:
  ## exp(-2t) * sin(x) * sin(y).
  exp(-2 * t) * sin(x) * sin(y)
|
||||
|
||||
# Default number of grid rows, see `initTest`
const n = 4096'i32

# Problem description, filled in by `initTest`
var
  # Grid size (rows, columns) and number of time steps
  nx, ny, nt: int32
  # Lower (`u`) and upper (`o`) bounds of the x, y and t ranges
  xu, xo, yu, yo, tu, to: float64

  # Step sizes along x, y and t
  dx, dy, dt: float64
  # dt / dx^2 and dt / dy^2
  dtdxsq, dtdysq: float64

  # The two grids that `test` alternates between
  odd: Matrix[float64]
  even: Matrix[float64]
|
||||
|
||||
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.} =
  ## Fills rows `il ..< iu` of `m` with the initial state.
  ##
  ## A range of more than one row is halved: the lower half is spawned, the
  ## upper half runs inline, and the call returns `true` once the spawned
  ## half is done. A single row is filled directly (returning the default
  ## `false`): the first and last rows and columns receive the boundary
  ## values, interior cells receive `f`.
  # TODO to allow awaiting `heat` we return a dummy bool
  # The parallel spawns are updating the same matrix cells otherwise
  if iu - il > 1:
    let mid = (il + iu) div 2

    let lower = spawn heat(m, il, mid)
    heat(m, mid, iu)
    discard ^lower
    return true
  # ------------------------

  let r = il
  let cells = m.getRow(r)

  if r == 0:
    for c in 0 ..< ny:
      cells[c] = randc(yu + c*dy, 0)
  elif r == nx - 1:
    for c in 0 ..< ny:
      cells[c] = randd(yu + c*dy, 0)
  else:
    cells[0] = randa(xu + r*dx, 0)
    for c in 1 ..< ny - 1:
      cells[c] = f(xu + r*dx, yu + c*dy)
    cells[ny - 1] = randb(xu + r*dx, 0)
|
||||
|
||||
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
  ## Computes rows `il ..< iu` of `output` at time `t` from `input`.
  ##
  ## A range of more than one row is halved: the lower half is spawned, the
  ## upper half runs inline, and the call returns `true` once the spawned
  ## half is done. A single row is computed directly (returning the default
  ## `false`): the first and last rows and columns receive the boundary
  ## values, interior cells are updated from the cell and its four
  ## neighbours in `input`.
  # TODO to allow awaiting `diffuse` we return a dummy bool
  # The parallel spawns are updating the same matrix cells otherwise
  if iu - il > 1:
    let mid = (il + iu) div 2

    let lower = spawn diffuse(output, input, il, mid, t)
    diffuse(output, input, mid, iu, t)
    discard ^lower
    return true
  # ------------------------

  let r = il
  let cells = output.getRow(r)

  if r == 0:
    for c in 0 ..< ny:
      cells[c] = randc(yu + c*dy, t)
  elif r == nx - 1:
    for c in 0 ..< ny:
      cells[c] = randd(yu + c*dy, t)
  else:
    cells[0] = randa(xu + r*dx, t)
    for c in 1 ..< ny - 1:
      cells[c] = input[r, c] +
        dtdysq * (input[r, c+1] - 2 * input[r, c] + input[r, c-1]) +
        dtdxsq * (input[r+1, c] - 2 * input[r, c] + input[r-1, c])
    cells[ny - 1] = randb(xu + r*dx, t)
|
||||
|
||||
proc initTest() =
  ## Sets the problem size and ranges, derives the step sizes from them and
  ## allocates the two `nx` x `ny` grids.
  # Grid size and number of time steps
  nx = n
  ny = 1024
  nt = 100

  # Ranges of x, y and t
  xu = 0.0
  xo = 1.570796326794896558
  yu = 0.0
  yo = 1.570796326794896558
  tu = 0.0
  to = 0.0000001

  # Step sizes
  dx = (xo - xu) / float64(nx - 1)
  dy = (yo - yu) / float64(ny - 1)
  dt = (to - tu) / float64(nt)

  dtdxsq = dt / (dx * dx)
  dtdysq = dt / (dy * dy)

  even = newMatrix[float64](nx, ny)
  odd = newMatrix[float64](nx, ny)
|
||||
|
||||
proc prep() =
  ## Writes the initial state into every row of `even`.
  discard heat(even, 0, nx)
|
||||
|
||||
proc test() =
  ## Advances the simulation by exactly `nt` time steps of size `dt`,
  ## alternating between the two grids: odd-numbered steps write `odd` from
  ## `even`, even-numbered steps write `even` from `odd`.
  var t = tu

  # Each pass performs two steps, so `nt div 2` passes are needed. Counting
  # up to `nt` inclusive in steps of two would run one pass too many when
  # `nt` is odd, on top of the trailing single step below.
  for _ in 1 .. nt.int div 2:
    t += dt
    diffuse(odd, even, 0, nx, t)
    t += dt
    diffuse(even, odd, 0, nx, t)

  # An odd `nt` leaves one last step, whose result ends up in `odd`
  if nt mod 2 != 0:
    t += dt
    diffuse(odd, even, 0, nx, t)
|
||||
|
||||
proc verify() =
  ## Compares the final grid with `solu` evaluated at time `to`.
  ##
  ## Exits with status 1 as soon as one cell deviates by more than 1e-3, or
  ## afterwards when the maximal absolute error, the maximal relative error
  ## or the mean absolute error exceeds 1e-12. Otherwise prints a success
  ## message.
  var
    mae: float64 # maximal absolute error
    mre: float64 # maximal relative error
    me: float64  # accumulated, then mean, absolute error

  # The last step wrote `odd` when `nt` is odd, `even` otherwise
  let mat = if nt mod 2 != 0: odd else: even

  for a in 0 ..< nx:
    for b in 0 ..< ny:
      var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
      if tmp > 1e-3:
        echo "nx: ", nx, " - ny: ", ny
        echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
        quit 1

      me += tmp
      if tmp > mae:
        mae = tmp
      # From here on `tmp` is the relative error (when the cell is non-zero)
      if mat[a, b] != 0.0:
        tmp /= mat[a, b]
      if tmp > mre:
        mre = tmp

  me /= nx * ny

  if mae > 1e-12:
    echo &"Local maximal absolute error {mae:1.3e}"
    quit 1
  if mre > 1e-12:
    echo &"Local maximal relative error {mre:1.3e}"
    quit 1
  if me > 1e-12:
    echo &"Global mean absolute error {me:1.3e}"
    quit 1

  echo "Verification successful"
|
||||
|
||||
proc main() =
  ## Runs the heat benchmark on Nim's standard threadpool: initializes the
  ## grids, times `test`, verifies the result and prints timing and
  ## resource-usage statistics.
  let nthreads = countProcessors()

  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  initTest()

  prep()
  let start = wtime_usec()
  test()
  let stop = wtime_usec()

  # Report the growth in max RSS and minor page faults since the first sample
  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss
  flt = ru.ru_minflt - flt

  sync()

  verify()
  delete(even)
  delete(odd)

  echo "Scheduler: Nim threadpool (standard lib)"
  echo "Benchmark: heat"
  echo "Threads: ", nthreads
  echo "Time(us) ", stop - start
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,313 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# From fibril
|
||||
#
|
||||
# Original license
|
||||
#
|
||||
# /*
|
||||
# * Heat diffusion (Jacobi-type iteration)
|
||||
# *
|
||||
# * Volker Strumpen, Boston August 1996
|
||||
# *
|
||||
# * Copyright (c) 1996 Massachusetts Institute of Technology
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either version 2 of the License, or
|
||||
# * (at your option) any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful,
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with this program; if not, write to the Free Software
|
||||
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
strformat, os, strutils, math, system/ansi_c,
|
||||
cpuinfo,
|
||||
# Taskpools
|
||||
../../taskpools
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime, ../resources
|
||||
|
||||
# Helpers
|
||||
# -------------------------------------------------------
|
||||
|
||||
# We need a thin wrapper around raw pointers for matrices,
|
||||
# we can't pass "var seq[seq[float64]]" to other threads
|
||||
# nor "var" for that matter
|
||||
type
  Matrix[T] = object
    ## Thin wrapper around a raw buffer holding an `m` x `n` matrix in
    ## row-major order.
    buffer: ptr UncheckedArray[T]
    m: int
    n: int

type
  Row[T] = object
    ## View over `len` consecutive elements of a buffer.
    buffer: ptr UncheckedArray[T]
    len: int

# Taskpool on which `heat` and `diffuse` spawn their sub-tasks
var tp: Taskpool
|
||||
|
||||
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
  ## Allocates an uninitialized `m` x `n` matrix with `c_malloc`.
  ## The buffer must be released with `delete`.
  let bytes = csize_t(m * n * sizeof(T))
  result = Matrix[T](
    buffer: cast[ptr UncheckedArray[T]](c_malloc(bytes)),
    m: m,
    n: n
  )
|
||||
|
||||
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  ## Reads the element at (`row`, `col`); storage is row-major.
  ## Bounds are only checked through `assert`.
  assert(row < mat.m)
  assert(col < mat.n)
  mat.buffer[col + row * mat.n]

template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  ## Writes `value` at (`row`, `col`); storage is row-major.
  ## Bounds are only checked through `assert`.
  assert(row < mat.m)
  assert(col < mat.n)
  mat.buffer[col + row * mat.n] = value
|
||||
|
||||
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a view over row `rowIdx` of `mat`. The view aliases the
  ## matrix buffer; nothing is copied.
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # A row holds one element per column, so its length is `n` (not `m`);
  # this is the bound the Row accessors assert against.
  result.len = mat.n
|
||||
|
||||
template `[]`[T](row: Row[T], idx: Natural): T =
  ## Reads element `idx` of the row view (bounds checked by `assert` only).
  assert(idx < row.len)
  row.buffer[idx]

template `[]=`[T](row: Row[T], idx: Natural, value: T) =
  ## Writes `value` to element `idx` of the row view (bounds checked by
  ## `assert` only).
  assert(idx < row.len)
  row.buffer[idx] = value
|
||||
|
||||
func delete[T](mat: sink Matrix[T]) =
  ## Releases the matrix buffer with `c_free`.
  mat.buffer.c_free()
|
||||
|
||||
# And an auto converter for int32 -> float64 so we don't have to convert
|
||||
# all i, j indices manually
|
||||
|
||||
converter i32toF64(x: int32): float64 {.inline.} =
  ## Implicit int32 -> float64 conversion, used for the grid indices.
  result = x.float64
|
||||
|
||||
# -------------------------------------------------------
|
||||
|
||||
template f(x, y: SomeFloat): SomeFloat =
  ## Value used for interior cells by `heat`: sin(x) * sin(y).
  sin(x) * sin(y)

template randa[T: SomeFloat](x, t: T): T =
  ## Boundary value written to the first column: always zero.
  T(0.0)

proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
  ## Boundary value written to the last column: exp(-2t) * sin(x).
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  result = exp(-2 * t) * sin(x)

template randc[T: SomeFloat](y, t: T): T =
  ## Boundary value written to the first row: always zero.
  T(0.0)

proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
  ## Boundary value written to the last row: exp(-2t) * sin(y).
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  result = exp(-2 * t) * sin(y)

template solu(x, y, t: SomeFloat): SomeFloat =
  ## Reference solution that `verify` compares against:
  ## exp(-2t) * sin(x) * sin(y).
  exp(-2 * t) * sin(x) * sin(y)
|
||||
|
||||
# Default number of grid rows, see `initTest`
const n = 4096'i32

# Problem description, filled in by `initTest`
var
  # Grid size (rows, columns) and number of time steps
  nx, ny, nt: int32
  # Lower (`u`) and upper (`o`) bounds of the x, y and t ranges
  xu, xo, yu, yo, tu, to: float64

  # Step sizes along x, y and t
  dx, dy, dt: float64
  # dt / dx^2 and dt / dy^2
  dtdxsq, dtdysq: float64

  # The two grids that `test` alternates between
  odd: Matrix[float64]
  even: Matrix[float64]
|
||||
|
||||
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.} =
  ## Fills rows `il ..< iu` of `m` with the initial state.
  ##
  ## A range of more than one row is halved: the lower half is spawned on
  ## `tp`, the upper half runs inline, and the call returns `true` once the
  ## spawned half is synced. A single row is filled directly (returning the
  ## default `false`): the first and last rows and columns receive the
  ## boundary values, interior cells receive `f`.
  # TODO to allow awaiting `heat` we return a dummy bool
  # The parallel spawns are updating the same matrix cells otherwise
  if iu - il > 1:
    let mid = (il + iu) div 2

    let lower = tp.spawn heat(m, il, mid)
    heat(m, mid, iu)
    discard sync(lower)
    return true
  # ------------------------

  let r = il
  let cells = m.getRow(r)

  if r == 0:
    for c in 0 ..< ny:
      cells[c] = randc(yu + c*dy, 0)
  elif r == nx - 1:
    for c in 0 ..< ny:
      cells[c] = randd(yu + c*dy, 0)
  else:
    cells[0] = randa(xu + r*dx, 0)
    for c in 1 ..< ny - 1:
      cells[c] = f(xu + r*dx, yu + c*dy)
    cells[ny - 1] = randb(xu + r*dx, 0)
|
||||
|
||||
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
  ## Computes rows `il ..< iu` of `output` at time `t` from `input`.
  ##
  ## A range of more than one row is halved: the lower half is spawned on
  ## `tp`, the upper half runs inline, and the call returns `true` once the
  ## spawned half is synced. A single row is computed directly (returning
  ## the default `false`): the first and last rows and columns receive the
  ## boundary values, interior cells are updated from the cell and its four
  ## neighbours in `input`.
  # TODO to allow awaiting `diffuse` we return a dummy bool
  # The parallel spawns are updating the same matrix cells otherwise
  if iu - il > 1:
    let mid = (il + iu) div 2

    let lower = tp.spawn diffuse(output, input, il, mid, t)
    diffuse(output, input, mid, iu, t)
    discard sync(lower)
    return true
  # ------------------------

  let r = il
  let cells = output.getRow(r)

  if r == 0:
    for c in 0 ..< ny:
      cells[c] = randc(yu + c*dy, t)
  elif r == nx - 1:
    for c in 0 ..< ny:
      cells[c] = randd(yu + c*dy, t)
  else:
    cells[0] = randa(xu + r*dx, t)
    for c in 1 ..< ny - 1:
      cells[c] = input[r, c] +
        dtdysq * (input[r, c+1] - 2 * input[r, c] + input[r, c-1]) +
        dtdxsq * (input[r+1, c] - 2 * input[r, c] + input[r-1, c])
    cells[ny - 1] = randb(xu + r*dx, t)
|
||||
|
||||
proc initTest() =
  ## Sets the problem size and ranges, derives the step sizes from them and
  ## allocates the two `nx` x `ny` grids.
  # Grid size and number of time steps
  nx = n
  ny = 1024
  nt = 100

  # Ranges of x, y and t
  xu = 0.0
  xo = 1.570796326794896558
  yu = 0.0
  yo = 1.570796326794896558
  tu = 0.0
  to = 0.0000001

  # Step sizes
  dx = (xo - xu) / float64(nx - 1)
  dy = (yo - yu) / float64(ny - 1)
  dt = (to - tu) / float64(nt)

  dtdxsq = dt / (dx * dx)
  dtdysq = dt / (dy * dy)

  even = newMatrix[float64](nx, ny)
  odd = newMatrix[float64](nx, ny)
|
||||
|
||||
proc prep() =
  ## Writes the initial state into every row of `even`.
  discard heat(even, 0, nx)
|
||||
|
||||
proc test() =
  ## Advances the simulation by exactly `nt` time steps of size `dt`,
  ## alternating between the two grids: odd-numbered steps write `odd` from
  ## `even`, even-numbered steps write `even` from `odd`.
  var t = tu

  # Each pass performs two steps, so `nt div 2` passes are needed. Counting
  # up to `nt` inclusive in steps of two would run one pass too many when
  # `nt` is odd, on top of the trailing single step below.
  for _ in 1 .. nt.int div 2:
    t += dt
    diffuse(odd, even, 0, nx, t)
    t += dt
    diffuse(even, odd, 0, nx, t)

  # An odd `nt` leaves one last step, whose result ends up in `odd`
  if nt mod 2 != 0:
    t += dt
    diffuse(odd, even, 0, nx, t)
|
||||
|
||||
proc verify() =
  ## Compares the final grid with `solu` evaluated at time `to`.
  ##
  ## Exits with status 1 as soon as one cell deviates by more than 1e-3, or
  ## afterwards when the maximal absolute error, the maximal relative error
  ## or the mean absolute error exceeds 1e-12. Otherwise prints a success
  ## message.
  var
    mae: float64 # maximal absolute error
    mre: float64 # maximal relative error
    me: float64  # accumulated, then mean, absolute error

  # The last step wrote `odd` when `nt` is odd, `even` otherwise
  let mat = if nt mod 2 != 0: odd else: even

  for a in 0 ..< nx:
    for b in 0 ..< ny:
      var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
      if tmp > 1e-3:
        echo "nx: ", nx, " - ny: ", ny
        echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
        quit 1

      me += tmp
      if tmp > mae:
        mae = tmp
      # From here on `tmp` is the relative error (when the cell is non-zero)
      if mat[a, b] != 0.0:
        tmp /= mat[a, b]
      if tmp > mre:
        mre = tmp

  me /= nx * ny

  if mae > 1e-12:
    echo &"Local maximal absolute error {mae:1.3e}"
    quit 1
  if mre > 1e-12:
    echo &"Local maximal relative error {mre:1.3e}"
    quit 1
  if me > 1e-12:
    echo &"Global mean absolute error {me:1.3e}"
    quit 1

  echo "Verification successful"
|
||||
|
||||
proc main() =
  ## Runs the heat benchmark on a `Taskpool`: initializes the grids, creates
  ## the pool, times `test`, verifies the result and prints the statistics.
  ##
  ## The thread count is read from `TASKPOOL_NUM_THREADS` when that
  ## environment variable exists, otherwise `countProcessors()` is used.
  ## Timing and resource usage are skipped on Windows.
  let nthreads =
    if existsEnv"TASKPOOL_NUM_THREADS": getEnv"TASKPOOL_NUM_THREADS".parseInt()
    else: countProcessors()

  when not defined(windows):
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt

  initTest()

  # Fibril initializes before benching
  tp = Taskpool.new(numThreads = nthreads)

  prep()
  when not defined(windows):
    let start = wtime_usec()
  test()
  when not defined(windows):
    let stop = wtime_usec()

    # Report the growth in max RSS and minor page faults since the first sample
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt

  tp.shutdown()

  verify()
  delete(even)
  delete(odd)

  echo "Scheduler: Taskpools"
  echo "Benchmark: heat"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(us) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,12 @@
|
|||
# Cache-Oblivious Matrix Multiplication
|
||||
|
||||
From Staccato and Cilk
|
||||
|
||||
https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk
|
||||
See the paper ``Cache-Oblivious Algorithms'', by
|
||||
Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
|
||||
Sridhar Ramachandran, FOCS 1999, for an explanation of
|
||||
why this algorithm is good for caches.
|
||||
|
||||
Note that the benchmarks output incorrect matrix traces
|
||||
according to the check ...
|
|
@ -0,0 +1,213 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# Rectangular matrix multiplication.
|
||||
#
|
||||
# Adapted from Cilk 5.4.3 example
|
||||
#
|
||||
# https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk;
|
||||
# See the paper ``Cache-Oblivious Algorithms'', by
|
||||
# Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
|
||||
# Sridhar Ramachandran, FOCS 1999, for an explanation of
|
||||
# why this algorithm is good for caches.
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
strformat, os, strutils, math, system/ansi_c,
|
||||
cpuinfo,
|
||||
# Taskpool
|
||||
../../taskpools,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
# Helpers
|
||||
# -------------------------------------------------------
|
||||
|
||||
# We need a thin wrapper around raw pointers for matrices,
|
||||
# we can't pass "var" to other threads
|
||||
type
  Matrix[T: SomeFloat] = object
    ## Thin wrapper around a raw buffer; `ld` is the number of elements
    ## between the starts of two consecutive rows (row-major storage).
    buffer: ptr UncheckedArray[T]
    ld: int

# Taskpool on which `matmul` spawns its sub-multiplications
var tp: Taskpool
|
||||
|
||||
func newMatrixNxN[T](n: int): Matrix[T] {.inline.} =
  ## Allocates an uninitialized `n` x `n` matrix with `c_malloc`.
  ## The buffer must be released with `delete`.
  let bytes = csize_t(n * n * sizeof(T))
  result = Matrix[T](
    buffer: cast[ptr UncheckedArray[T]](c_malloc(bytes)),
    ld: n
  )
|
||||
|
||||
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  ## Reads the element at (`row`, `col`); storage is row-major with a
  ## stride of `mat.ld` elements between rows.
  ## Bounds are only checked through `assert`.
  # The messages name the template's own parameters: an `i` here would be
  # resolved at the call site, reporting an unrelated value or not compiling.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col]

template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  ## Writes `value` at (`row`, `col`); storage is row-major with a stride
  ## of `mat.ld` elements between rows.
  ## Bounds are only checked through `assert`.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col] = value
|
||||
|
||||
func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}=
  ## Returns a new view offset by the row and column stride.
  ## The view aliases the buffer of `mat`; nothing is copied.
  result.buffer = cast[ptr UncheckedArray[T]](
    addr mat.buffer[row*mat.ld + col]
  )
  # The view keeps the parent's leading dimension. Left at its zero default,
  # every `row * ld + col` lookup through the view would land in its first
  # row and the accessors' `row < ld` asserts would fail.
  result.ld = mat.ld
|
||||
|
||||
func delete[T](mat: sink Matrix[T]) =
  ## Releases the matrix buffer with `c_free`.
  mat.buffer.c_free()
|
||||
|
||||
# -------------------------------------------------------
|
||||
|
||||
proc xorshiftRand(): uint32 =
  ## Returns the next value of a xorshift pseudo-random sequence.
  ## The generator state is a single global seeded with 2463534242, so the
  ## sequence is the same on every run.
  var state {.global.} = uint32(2463534242)
  state = state xor (state shr 13)
  state = state xor (state shl 17)
  state = state xor (state shr 5)
  result = state
|
||||
|
||||
func zero[T](A: Matrix[T]) =
  ## Clears the `ld` x `ld` elements of `A` to zero bytes.
  # zeroing is not timed
  let byteCount = A.ld * A.ld * sizeof(T)
  zeroMem(A.buffer, byteCount)
|
||||
|
||||
proc fill[T](A: Matrix[T]) =
  ## Fills the `ld` x `ld` elements of `A` with pseudo-random values
  ## in `0 ..< ld` drawn from `xorshiftRand`.
  let modulus = A.ld.uint32
  for row in 0 ..< A.ld:
    for col in 0 ..< A.ld:
      A[row, col] = T(xorshiftRand() mod modulus)
|
||||
|
||||
func maxError(A, B: Matrix): float64 =
  ## Largest relative difference `|A[i,j] - B[i,j]| / |A[i,j]|` over the
  ## `ld` x `ld` elements. Both matrices must share the same `ld`.
  assert A.ld == B.ld
  let dim = A.ld
  for row in 0 ..< dim:
    for col in 0 ..< dim:
      var relErr = (A[row, col] - B[row, col]) / A[row, col]
      if relErr < 0:
        relErr = -relErr
      if relErr > result:
        result = relErr
|
||||
|
||||
func check[T](A, B, C: Matrix[T], n: int): bool =
  ## Compares the trace of `A * B` (computed directly from `A` and `B`)
  ## with the trace of `C` over the leading `n` x `n` block.
  ## Returns true when they differ by less than 1e-3.
  var traceC, traceAB: T # both start at zero
  for i in 0 ..< n:
    for j in 0 ..< n:
      traceAB += A[i, j] * B[j, i]
    traceC += C[i, i]

  # Note, all benchmarks return false ‾\_(ツ)_/‾
  result = abs(traceAB - traceC) < 1e-3
|
||||
|
||||
proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool =
  ## Recursive (cache-oblivious) product of the `m` x `n` block of `A` with
  ## the `n` x `p` block of `B`. The result is stored into the `m` x `p`
  ## block of `C` when `add` is false and accumulated into it otherwise.
  ##
  ## The largest of the three dimensions is halved at each level until
  ## `m + n + p <= 64`, where a plain triple loop is used.
  # The original bench passes around a ``ld`` parameter (leading dimension?),
  # we store it in the matrices
  # We return a dummy bool to allow waiting on the matmul

  # Threshold
  if (m + n + p) <= 64:
    if add:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] += c
    else:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] = c

    return

  var h0, h1: FlowVar[bool]
    ## Each half of the computation

  # matrix is larger than threshold
  if m >= n and n >= p:
    # Split the rows: the halves write disjoint rows of C.
    let m1 = m shr 1 # divide by 2
    h0 = tp.spawn matmul(A, B, C, m1, n, p, add)
    h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add)
  elif n >= m and n >= p:
    # Split the inner dimension: both halves target the SAME block of C
    # (the second one accumulates), so they must run one after the other.
    # The Cilk reference has a `sync` between the two spawns; running them
    # concurrently is a data race on C. Nested calls still spawn in parallel.
    let n1 = n shr 1 # divide by 2
    discard matmul(A, B, C, m, n1, p, add)
    discard matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true)
    return
  else:
    # Split the columns: the halves write disjoint columns of C.
    let p1 = p shr 1
    h0 = tp.spawn matmul(A, B, C, m, n, p1, add)
    h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add)

  discard sync(h0)
  discard sync(h1)
|
||||
|
||||
proc main() =
  ## Benchmark driver: reads the matrix size from the command line and the
  ## thread count from `TASKPOOL_NUM_THREADS`, multiplies two random
  ## `n` x `n` matrices on a taskpool and prints timing / memory statistics.
  echo "Warning the benchmark seems to not be correct."
  var n = 3000

  let nthreads =
    if existsEnv"TASKPOOL_NUM_THREADS":
      getEnv"TASKPOOL_NUM_THREADS".parseInt()
    else:
      countProcessors()

  case paramCount()
  of 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Running with default config n = {n}"
  of 1:
    n = paramStr(1).parseInt()
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Up to 1 parameter is valid. Received {paramCount()}"
    quit 1

  var A = newMatrixNxN[float32](n)
  var B = newMatrixNxN[float32](n)
  var C = newMatrixNxN[float32](n)

  fill(A)
  fill(B)
  zero(C)

  # Resource usage snapshot taken before the timed section
  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  # Staccato benches runtime init and exit as well
  let start = wtime_msec()

  tp = Taskpool.new(numThreads = nthreads)
  discard sync tp.spawn matmul(A, B, C, n, n, n, add = false)
  tp.shutdown()

  let stop = wtime_msec()

  # Deltas against the snapshot above
  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss
  flt = ru.ru_minflt - flt

  echo "Scheduler: Taskpool"
  echo "Benchmark: Matrix Multiplication (cache oblivious)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", stop - start
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "Input: ", n
  echo "Error: ", check(A, B, C, n)

  delete A
  delete B
  delete C

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,187 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
#
|
||||
# Original code licenses
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
#
|
||||
# /**********************************************************************************************/
|
||||
# /* This program is part of the Barcelona OpenMP Tasks Suite */
|
||||
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
|
||||
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
|
||||
# /* */
|
||||
# /* This program is free software; you can redistribute it and/or modify */
|
||||
# /* it under the terms of the GNU General Public License as published by */
|
||||
# /* the Free Software Foundation; either version 2 of the License, or */
|
||||
# /* (at your option) any later version. */
|
||||
# /* */
|
||||
# /* This program is distributed in the hope that it will be useful, */
|
||||
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
||||
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
||||
# /* GNU General Public License for more details. */
|
||||
# /* */
|
||||
# /* You should have received a copy of the GNU General Public License */
|
||||
# /* along with this program; if not, write to the Free Software */
|
||||
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
|
||||
# /**********************************************************************************************/
|
||||
#
|
||||
# /*
|
||||
# * Original code from the Cilk project (by Keith Randall)
|
||||
# *
|
||||
# * Copyright (c) 2000 Massachusetts Institute of Technology
|
||||
# * Copyright (c) 2000 Matteo Frigo
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, strformat, os, strutils,
|
||||
threadpool,
|
||||
# bench
|
||||
../wtime
|
||||
|
||||
# This deadlocks :/
|
||||
|
||||
# Nim helpers
|
||||
# -------------------------------------------------
|
||||
|
||||
# Binding to the C `alloca` (stack allocation, released when the calling
# function returns). The header providing it differs between platforms.
# The size is declared as `int` rather than the deprecated `csize` alias.
when defined(windows):
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: int): pointer {.header: "<alloca.h>".}
|
||||
|
||||
template alloca*(T: typedesc): ptr T =
  ## Stack-allocates room for a single `T` in the caller's frame
  ## (a template, so `alloca` runs in the calling proc).
  cast[ptr T](alloca(sizeof(T)))
|
||||
|
||||
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocates room for `len` items of type `T` in the caller's frame
  ## (a template, so `alloca` runs in the calling proc).
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
|
||||
|
||||
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocates an uninitialized array of `len` items of type `T`.
  ## Uses Nim's shared allocator when compiled with `-d:WV_useNimAlloc`,
  ## the C allocator otherwise.
  when defined(WV_useNimAlloc):
    result = cast[type result](createSharedU(T, len))
  else:
    let byteCount = csize_t(len*sizeof(T))
    result = cast[type result](c_malloc(byteCount))
|
||||
|
||||
proc wv_free*[T: ptr](p: T) {.inline.} =
  ## Releases memory obtained from `wv_alloc`, using the allocator selected
  ## by the same `WV_useNimAlloc` compile-time switch.
  when not defined(WV_useNimAlloc):
    c_free(p)
  else:
    freeShared(p)
|
||||
|
||||
# We assume that Nim zeroMem vs C memset
|
||||
# and Nim copyMem vs C memcpy have no difference
|
||||
# Nim does have extra checks to handle GC-ed types
|
||||
# but they should be eliminated by the Nim compiler.
|
||||
|
||||
# -------------------------------------------------
|
||||
|
||||
type CharArray = ptr UncheckedArray[char]
|
||||
|
||||
var example_solution: ptr UncheckedArray[char]
|
||||
|
||||
func isValid(n: int32, a: CharArray): bool =
  ## `a` contains an array of `n` queen positions.
  ## Returns true if none of the queens conflict and false otherwise.
  for i in 0'i32 ..< n:
    let posI = cast[int32](a[i])
    for j in i+1 ..< n:
      let posJ = cast[int32](a[j])
      # Same position, or on a shared diagonal: the position distance
      # equals the index distance.
      if posJ == posI or abs(posJ - posI) == j - i:
        return false
  result = true
|
||||
|
||||
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
  ## Serial nqueens: counts the complete placements reachable from the
  ## first `j` queens stored in `a`. The first complete placement found is
  ## copied into the global `example_solution`.
  if n != j:
    # Try each possible position for queen `j`
    for pos in 0 ..< n:
      a[j] = cast[char](pos)
      if isValid(j+1, a):
        result += nqueens_ser(n, j+1, a)
  else:
    # Good solution count it
    if example_solution.isNil:
      example_solution = wv_alloc(char, n)
      copyMem(example_solution, a, n * sizeof(char))
    result = 1
|
||||
|
||||
proc nqueens_par(n, j: int32, a: CharArray): int32 =
  ## Parallel nqueens: spawns one task per valid position of queen `j` and
  ## sums the counts returned by the tasks.

  if n == j:
    # Good solution, count it
    return 1

  # One (initially nil) future per candidate position, on this stack frame
  var pending = alloca(Flowvar[int32], n)
  zeroMem(pending, n * sizeof(Flowvar[int32]))

  # Try each position for queen `j`
  for pos in 0 ..< n:
    # Private copy of the partial board for the child task; it lives on
    # this frame, which stays alive until all children are awaited below.
    var board = alloca(char, j+1)
    copyMem(board, a, j * sizeof(char))
    board[j] = cast[char](pos)
    if isValid(j+1, board):
      pending[pos] = spawn nqueens_par(n, j+1, board)

  for pos in 0 ..< n:
    if not pending[pos].isNil():
      result += ^pending[pos]
|
||||
|
||||
# Number of solutions of the N-queens problem; entry `i` is for N = i + 1.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  40, # 7x7 (was 10, which made every correct N = 7 run report a failure)
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]
|
||||
|
||||
proc verifyQueens(n, res: int32) =
  ## Prints a message when `res` differs from the known solution count for
  ## an `n` x `n` board, or when `n` is beyond the `solutions` table.
  if n <= solutions.len:
    if res != solutions[n-1]:
      echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
  else:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
|
||||
|
||||
proc main() =
  ## Benchmark driver: expects the board size as the only argument, runs
  ## the parallel solver, verifies the count and prints the elapsed time.
  if paramCount() != 1:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n: number of queens on a nxn board>"
    quit 0

  let n = paramStr(1).parseInt.int32

  if n < 1 or n > solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  let start = wtime_msec()
  # The root board is allocated on main's own stack frame
  let count = nqueens_par(n, 0, alloca(char, n))
  let stop = wtime_msec()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for idx in 0 ..< n:
      c_printf("%2d ", example_solution[idx])
    stdout.write('\n')

  echo &"Elapsed wall time: {stop-start:2.4f} ms"
|
||||
|
||||
main()
|
|
@ -0,0 +1,229 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
#
|
||||
# Original code licenses
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
#
|
||||
# /**********************************************************************************************/
|
||||
# /* This program is part of the Barcelona OpenMP Tasks Suite */
|
||||
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
|
||||
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
|
||||
# /* */
|
||||
# /* This program is free software; you can redistribute it and/or modify */
|
||||
# /* it under the terms of the GNU General Public License as published by */
|
||||
# /* the Free Software Foundation; either version 2 of the License, or */
|
||||
# /* (at your option) any later version. */
|
||||
# /* */
|
||||
# /* This program is distributed in the hope that it will be useful, */
|
||||
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
||||
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
||||
# /* GNU General Public License for more details. */
|
||||
# /* */
|
||||
# /* You should have received a copy of the GNU General Public License */
|
||||
# /* along with this program; if not, write to the Free Software */
|
||||
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
|
||||
# /**********************************************************************************************/
|
||||
#
|
||||
# /*
|
||||
# * Original code from the Cilk project (by Keith Randall)
|
||||
# *
|
||||
# * Copyright (c) 2000 Massachusetts Institute of Technology
|
||||
# * Copyright (c) 2000 Matteo Frigo
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, strformat, os, strutils, cpuinfo,
|
||||
# Taskpools
|
||||
../../taskpools
|
||||
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime, ../resources
|
||||
|
||||
# Nim helpers
|
||||
# -------------------------------------------------
|
||||
|
||||
# Binding to the C `alloca` (stack allocation, released when the calling
# function returns). Only the header providing it differs per platform.
when not defined(windows):
  proc alloca(size: int): pointer {.header: "<alloca.h>".}
else:
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
|
||||
|
||||
template alloca*(T: typedesc): ptr T =
  ## Stack-allocates room for a single `T` in the caller's frame
  ## (a template, so `alloca` runs in the calling proc).
  cast[ptr T](alloca(sizeof(T)))
|
||||
|
||||
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocates room for `len` items of type `T` in the caller's frame
  ## (a template, so `alloca` runs in the calling proc).
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
|
||||
|
||||
proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocates an uninitialized array of `len` items of type `T`.
  ## Uses Nim's shared allocator when compiled with `-d:TP_useNimAlloc`,
  ## the C allocator otherwise.
  when defined(TP_useNimAlloc):
    result = cast[type result](createSharedU(T, len))
  else:
    let byteCount = csize_t(len*sizeof(T))
    result = cast[type result](c_malloc(byteCount))
|
||||
|
||||
proc tp_free*[T: ptr](p: T) {.inline.} =
  ## Releases memory obtained from `tp_alloc`, using the allocator selected
  ## by the same `TP_useNimAlloc` compile-time switch.
  when not defined(TP_useNimAlloc):
    c_free(p)
  else:
    freeShared(p)
|
||||
|
||||
# We assume that Nim zeroMem vs C memset
|
||||
# and Nim copyMem vs C memcpy have no difference
|
||||
# Nim does have extra checks to handle GC-ed types
|
||||
# but they should be eliminated by the Nim compiler.
|
||||
|
||||
# -------------------------------------------------
|
||||
|
||||
type CharArray = ptr UncheckedArray[char]
|
||||
|
||||
var tp: Taskpool
|
||||
var example_solution: ptr UncheckedArray[char]
|
||||
|
||||
func isValid(n: int32, a: CharArray): bool =
  ## `a` contains an array of `n` queen positions.
  ## Returns true if none of the queens conflict and false otherwise.
  for i in 0'i32 ..< n:
    let posI = cast[int32](a[i])
    for j in i+1 ..< n:
      let posJ = cast[int32](a[j])
      # Same position, or on a shared diagonal: the position distance
      # equals the index distance.
      if posJ == posI or abs(posJ - posI) == j - i:
        return false
  result = true
|
||||
|
||||
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
  ## Serial nqueens: counts the complete placements reachable from the
  ## first `j` queens stored in `a`. The first complete placement found is
  ## copied into the global `example_solution`.
  if n != j:
    # Try each possible position for queen `j`
    for pos in 0 ..< n:
      a[j] = cast[char](pos)
      if isValid(j+1, a):
        result += nqueens_ser(n, j+1, a)
  else:
    # Good solution count it
    if example_solution.isNil:
      example_solution = tp_alloc(char, n)
      copyMem(example_solution, a, n * sizeof(char))
    result = 1
|
||||
|
||||
proc nqueens_par(n, j: int32, a: CharArray): int32 =
  ## Parallel nqueens: spawns one taskpool task per valid position of
  ## queen `j` and sums the counts returned by the tasks.

  if n == j:
    # Good solution, count it
    return 1

  # One (initially zeroed) future per candidate position, on this stack frame
  var pending = alloca(Flowvar[int32], n)
  zeroMem(pending, n * sizeof(Flowvar[int32]))

  # Try each position for queen `j`
  for pos in 0 ..< n:
    # Private copy of the partial board for the child task; it lives on
    # this frame, which stays alive until all children are synced below.
    var board = alloca(char, j+1)
    copyMem(board, a, j * sizeof(char))
    board[j] = cast[char](pos)
    if isValid(j+1, board):
      pending[pos] = tp.spawn nqueens_par(n, j+1, board)

  for pos in 0 ..< n:
    if pending[pos].isSpawned():
      result += sync(pending[pos])
|
||||
|
||||
# Number of solutions of the N-queens problem; entry `i` is for N = i + 1.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  40, # 7x7 (was 10, which made every correct N = 7 run report a failure)
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]
|
||||
|
||||
proc verifyQueens(n, res: int32) =
  ## Prints a message when `res` differs from the known solution count for
  ## an `n` x `n` board, or when `n` is beyond the `solutions` table.
  if n <= solutions.len:
    if res != solutions[n-1]:
      echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
  else:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
|
||||
|
||||
proc main() =
  ## Benchmark driver: reads the board size from the command line (default
  ## 11) and the thread count from `TASKPOOL_NUM_THREADS`, runs the parallel
  ## solver on a taskpool, verifies the count and prints statistics.
  ## Timing and resource usage are only collected on non-Windows targets.
  var
    n = 11'i32
    nthreads: int

  if existsEnv"TASKPOOL_NUM_THREADS":
    nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <N:{n}>"
    echo &"Running with default config N = {n}\n"

  if paramCount() >= 1:
    n = paramStr(1).parseInt.int32

  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  when not defined(windows):
    # Resource usage snapshot taken before the timed section
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt

  tp = Taskpool.new(numThreads = nthreads)

  when not defined(windows):
    let start = wtime_msec()

  # The root board is allocated on main's own stack frame
  let count = nqueens_par(n, 0, alloca(char, n))

  when not defined(windows):
    let stop = wtime_msec()

  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt

  tp.shutdown()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')

  echo "Scheduler: Taskpool"
  echo "Benchmark: N-queens"
  echo "Threads: ", nthreads
  when not defined(windows):
    # start/stop come from wtime_msec, so the value is in milliseconds
    # (the label used to claim microseconds).
    echo "Time(ms) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "Problem size: ", n,"x",n, " board with ",n, " queens"
  echo "Solutions found: ", count

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,24 @@
|
|||
type
  Timeval {.importc: "struct timeval", header:"<sys/time.h>", bycopy.} = object
    ## Opaque binding of C `struct timeval`; no field is accessed from Nim.

  Rusage* {.importc: "struct rusage", header:"<sys/resource.h>", bycopy.} = object
    ## Partial binding of C `struct rusage`: only the fields used by the
    ## benchmarks are declared.
    ru_utime {.importc.}: Timeval
    ru_stime {.importc.}: Timeval
    # Both counters are `long` in the C struct; `clong` keeps the Nim type
    # in sync instead of silently narrowing to 32 bits.
    ru_maxrss* {.importc.}: clong # Maximum resident set size
    # ...
    ru_minflt* {.importc.}: clong # page reclaims (soft page faults)

  RusageWho* {.size: sizeof(cint).} = enum
    ## Values accepted as the `who` argument of `getrusage`.
    RusageChildren = -1
    RusageSelf = 0
    RusageThread = 1
|
||||
|
||||
when defined(debug):
  # Cross-check the hard-coded enum values against the C constants.
  # Fixes: the header names were missing their closing '>', the imported
  # names did not exist in C (the constants are RUSAGE_*), and two of the
  # assertions used `=` instead of `==`.
  # NOTE(review): RUSAGE_THREAD is not available on every platform
  # (on Linux it needs _GNU_SOURCE) - confirm for the targets in use.
  var H_RUSAGE_SELF{.importc: "RUSAGE_SELF", header:"<sys/resource.h>".}: cint
  var H_RUSAGE_CHILDREN{.importc: "RUSAGE_CHILDREN", header:"<sys/resource.h>".}: cint
  var H_RUSAGE_THREAD{.importc: "RUSAGE_THREAD", header:"<sys/resource.h>".}: cint
  assert H_RUSAGE_SELF == ord(RusageSelf)
  assert H_RUSAGE_CHILDREN == ord(RusageChildren)
  assert H_RUSAGE_THREAD == ord(RusageThread)
|
||||
|
||||
# Binding to C `getrusage` from <sys/resource.h>: fills `usage` for the entity selected by `who`.
proc getrusage*(who: RusageWho, usage: var Rusage) {.importc, header: "sys/resource.h".}
|
|
@ -0,0 +1,7 @@
|
|||
# Simple single-producer multiple consumers benchmarks
|
||||
|
||||
SPC: a Simple Producer-Consumer benchmark.
|
||||
|
||||
A single worker produces n tasks,
|
||||
each running for t microseconds. This benchmark allows us to test how many
|
||||
concurrent consumers a single producer can sustain.
|
|
@ -0,0 +1,145 @@
|
|||
import
|
||||
# STD lib
|
||||
os, strutils, system/ansi_c, cpuinfo, strformat, math,
|
||||
# Library
|
||||
../../taskpools,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
var NumTasksTotal: int32
|
||||
var TaskGranularity: int32 # microsecond
|
||||
var PollInterval: float64 # microsecond
|
||||
|
||||
var tp: Taskpool
|
||||
|
||||
var global_poll_elapsed {.threadvar.}: float64
|
||||
|
||||
template dummy_cpt(): untyped =
  # Dummy computation
  # Calculate fib(30) iteratively
  var
    current = 0
    beforeLast = 0
    last = 1
  for step in 2 .. 30:
    current = last + beforeLast
    beforeLast = last
    last = current
|
||||
|
||||
proc spc_consume(usec: int32) =
  ## Busy-loops on `dummy_cpt` until `usec` microseconds of wall time have
  ## elapsed (polling time, currently always zero, is excluded).

  var pollElapsed = 0'f64

  let startedAt = wtime_usec()
  let budget = usec.float64
  global_poll_elapsed = PollInterval

  while true:
    let elapsed = (wtime_usec() - startedAt) - pollElapsed
    if elapsed >= budget:
      break

    dummy_cpt()

    # if elapsed >= global_poll_elapsed:
    #   let pollStart = wtime_usec()
    #   loadBalance(Weave)
    #   pollElapsed += wtime_usec() - pollStart
    #   global_poll_elapsed += PollInterval

  # c_printf("Elapsed: %.2lfus\n", elapsed)
|
||||
|
||||
proc spc_consume_nopoll(usec: int32) =
  ## Busy-loops on `dummy_cpt` until `usec` microseconds of wall time have
  ## elapsed.

  let startedAt = wtime_usec()
  let budget = usec.float64

  while true:
    let elapsed = wtime_usec() - startedAt
    if elapsed >= budget:
      break

    dummy_cpt()

  # c_printf("Elapsed: %.2lfus\n", elapsed)
|
||||
|
||||
proc spc_produce(n: int32) =
  ## Single producer: spawns `n` consumer tasks of `TaskGranularity`
  ## microseconds each on the global taskpool.
  for taskIdx in 0 ..< n:
    tp.spawn spc_consume(TaskGranularity)
|
||||
|
||||
proc spc_produce_seq(n: int32) =
  ## Sequential baseline: runs the `n` consumer workloads one after the
  ## other on the calling thread.
  for taskIdx in 0 ..< n:
    spc_consume_no_poll(TaskGranularity)
|
||||
|
||||
proc main() =
  ## Benchmark driver for the single-producer / multi-consumer workload.
  ## Optional arguments: number of tasks, task granularity (us) and polling
  ## interval (us). The thread count comes from `TASKPOOL_NUM_THREADS`
  ## (like the other taskpool benchmarks); `WEAVE_NUM_THREADS` is still
  ## honoured as a fallback.
  NumTasksTotal = 1000000
  TaskGranularity = 10
  PollInterval = 10

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
         &"<task granularity (us): {TaskGranularity}> " &
         &"[polling interval (us): task granularity]"
    echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
  if paramCount() >= 1:
    NumTasksTotal = paramStr(1).parseInt.int32
  if paramCount() >= 2:
    TaskGranularity = paramStr(2).parseInt.int32
  if paramCount() == 3:
    PollInterval = paramStr(3).parseInt.float64
  else:
    # Without an explicit interval, poll once per task granularity
    PollInterval = TaskGranularity.float64
  if paramCount() > 3:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
         &"<task granularity (us): {TaskGranularity}> " &
         &"[polling interval (us): task granularity]"
    quit 1

  var nthreads: int
  if existsEnv"TASKPOOL_NUM_THREADS":
    nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
  elif existsEnv"WEAVE_NUM_THREADS":
    # Legacy name, kept so existing scripts keep working
    nthreads = getEnv"WEAVE_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  tp = Taskpool.new(numThreads = nthreads)

  # measure overhead during tasking
  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  let start = wtime_msec()

  # spc_produce_seq(NumTasksTotal)
  spc_produce(NumTasksTotal)
  tp.syncAll()

  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss
  flt = ru.ru_minflt - flt

  tp.shutdown()

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Taskpool"
  echo "Benchmark: SPC (Single task Producer - multi Consumer)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", round(stop - start, 3)
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"
  echo "# of tasks: ", NumTasksTotal
  echo "Task granularity (us): ", TaskGranularity
  echo "Polling / manual load balancing interval (us): ", PollInterval

  quit 0
|
||||
|
||||
main()
|
|
@ -0,0 +1,53 @@
|
|||
#ifndef WTIME_H
|
||||
#define WTIME_H
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
|
||||
// Number of seconds since the Epoch
|
||||
static inline double Wtime_sec(void)
{
    /* Wall-clock time from gettimeofday(), expressed in seconds. */
    struct timeval now;
    gettimeofday(&now, NULL);
    const double frac_sec = now.tv_usec / 1e6;
    return now.tv_sec + frac_sec;
}
|
||||
|
||||
// Number of milliseconds since the Epoch
|
||||
static inline double Wtime_msec(void)
{
    /* Wall-clock time from gettimeofday(), expressed in milliseconds. */
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_ms = now.tv_sec * 1e3;
    const double frac_ms = now.tv_usec / 1e3;
    return whole_ms + frac_ms;
}
|
||||
|
||||
// Number of microseconds since the Epoch
|
||||
static inline double Wtime_usec(void)
{
    /* Wall-clock time from gettimeofday(), expressed in microseconds. */
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_us = now.tv_sec * 1e6;
    return whole_us + now.tv_usec;
}
|
||||
|
||||
// Read time stamp counter on x86
|
||||
static inline unsigned long long readtsc(void)
{
    /* Returns the 64-bit time stamp counter, assembled from the two 32-bit
       halves produced by the x86 RDTSC instruction (x86 only). */
    unsigned int lo, hi;
    // RDTSC copies contents of 64-bit TSC into EDX:EAX
    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
    return (unsigned long long)hi << 32 | lo;
}
|
||||
|
||||
// Helpers building an identifier that is unique per source line (id ## __LINE__);
// the extra indirection lets __LINE__ expand before token pasting.
#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)

// Convenience macro for time measurement
//
// Usage: WTIME(unit) { ...statements to time... }
// `unit` is pasted into Wtime_<unit>ec, so it is `s`, `ms` or `us`.
// The macro records the start time, then opens a `for` loop that runs the
// following statement exactly once: on the second evaluation of the
// condition the elapsed time is printed and the comma expression yields 0,
// which ends the loop. Because the names depend on __LINE__, at most one
// WTIME per source line is possible. printf requires <stdio.h> to be
// included by the user of this header.
#define WTIME(unit) \
double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
int WTIME_unique_var(_i_) = 0; \
for (; WTIME_unique_var(_i_) == 0 || \
(printf("Elapsed wall time: %.2lf "#unit"\n", \
Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
WTIME_unique_var(_i_)++)
|
||||
|
||||
#endif // WTIME_H
|
|
@ -0,0 +1,10 @@
|
|||
|
||||
import strutils, os
|
||||
|
||||
# Directory containing this module; "wtime.h" is looked up next to it.
const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0]
const cHeader = csourcesPath / "wtime.h"

# Make that directory visible to the C compiler's include search.
{.passC: "-I" & cSourcesPath .}

# Bindings to the C helpers declared in wtime.h.
proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.}
proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.}
|
|
@ -0,0 +1,17 @@
|
|||
# Taskpools architecture
|
||||
|
||||
Taskpools architecture is a simple threadpool with work-stealing to handle unbalanced workloads.
|
||||
|
||||
## Architecture
|
||||
|
||||
### Processing steps
|
||||
|
||||
1. On a `spawn` expression, thread i packages the function call in a task.
|
||||
2. It enqueues the task in its own deque.
|
||||
3. It calls `notify_one` on a condition variable on which all sleeping threads wait.
|
||||
4. The notified thread wakes up.
|
||||
5. The notified thread tries to steal a task from a randomly chosen worker.
|
||||
6. If no tasks are found, it goes back to sleep.
|
||||
7. Otherwise it runs the task.
|
||||
8. On a `sync` statement, the thread runs tasks from its own deque or steals a task from another worker.
|
||||
9. Once the `sync` task is ready, it can run the following statements (continuation).
|
|
@ -0,0 +1,43 @@
|
|||
import ../taskpools/taskpools
|
||||
import std/macros
|
||||
|
||||
block: # Async without result

  proc display_int(x: int) =
    ## Task body: writes `x` followed by " - SUCCESS" to stdout.
    stdout.write(x)
    stdout.write(" - SUCCESS\n")

  proc main() =
    ## Spawns two result-less tasks on a 4-thread pool, then shuts the pool down.
    echo "\nSanity check 1: Printing 123456 654321 in parallel"

    var tp = Taskpool.new(numThreads = 4)
    tp.spawn display_int(123456)
    tp.spawn display_int(654321)
    # NOTE(review): there is no explicit sync before shutdown; presumably
    # shutdown lets pending tasks complete - confirm against Taskpool.
    tp.shutdown()

  main()
|
||||
|
||||
block: # Async/Await

  # Pool shared by the recursive tasks below; created in main2.
  var tp: Taskpool


  proc async_fib(n: int): int =
    ## Recursive Fibonacci: the `n-1` branch is spawned as a task while the
    ## `n-2` branch is computed inline by the current thread.
    if n < 2:
      return n

    let x = tp.spawn async_fib(n-1)
    let y = async_fib(n-2)

    result = sync(x) + y

  proc main2() =
    ## Computes fib(20) on a default-sized pool and checks the result.
    echo "\nSanity check 2: fib(20)"

    tp = Taskpool.new()
    let f = async_fib(20)
    tp.shutdown()

    doAssert f == 6765

  main2()
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,9 @@
|
|||
# Nim-Taskpools
|
||||
# Copyright (c) 2021 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import taskpools/taskpools
|
||||
export taskpools
|
|
@ -0,0 +1,33 @@
|
|||
# Nim-Taskpools
|
||||
# Copyright (c) 2021 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import macros
|
||||
|
||||
template letsGoDeeper =
  # Rebuilds `node` as a node of the same kind whose children are the
  # results of `inspect` on the original children, and returns it from the
  # enclosing proc. `node` and `inspect` come from the instantiation scope.
  var rebuilt = newTree(node.kind)
  for sub in node:
    rebuilt.add(inspect(sub))
  return rebuilt
|
||||
|
||||
proc replaceSymsByIdents*(ast: NimNode): NimNode =
  ## Returns a copy of `ast` in which every symbol or identifier node is
  ## replaced by a fresh identifier with the same name. Hidden standard
  ## conversions are unwrapped to their operand (an integer literal or a
  ## symbol); literals and empty nodes are returned as they are.
  proc inspect(node: NimNode): NimNode =
    case node.kind
    of nnkIdent, nnkSym:
      result = ident($node)
    of nnkEmpty, nnkLiterals:
      result = node
    of nnkHiddenStdConv:
      let operand = node[1]
      if operand.kind == nnkIntLit:
        result = operand
      else:
        expectKind(operand, nnkSym)
        result = ident($operand)
    else:
      letsGoDeeper()
  inspect(ast)
|
|
@ -0,0 +1,178 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/atomics,
|
||||
./instrumentation/[contracts, loggers]
|
||||
|
||||
type
  ChannelSPSCSingle* = object
    ## A type-erased SPSC channel.
    ##
    ## Wait-free bounded single-producer single-consumer channel
    ## that can only buffer a single item
    ## Properties:
    ## - wait-free
    ## - supports weak memory models
    ## - buffers a single item
    ## - Padded to avoid false sharing in collections
    ## - No extra indirection to access the item, the buffer is inline the channel
    ## - Linearizable
    ## - Default usable size is 254 bytes (WV_MemBlockSize - 2).
    ## If used in an intrusive manner, it's 126 bytes due to the default 128 bytes padding.
    ##
    ## The channel should be the last field of an object if used in an intrusive manner
    ##
    ## Motivations for type erasure
    ## - when LazyFlowvar needs to be converted
    ## from stack-allocated memory to heap to extend their lifetime
    ## we have no type information at all as the whole runtime
    ## and especially tasks does not retain it.
    ##
    ## - When a task depends on a future that was generated from lazy loop-splitting
    ## we don't have type information either.
    ##
    ## - An extra benefit is probably easier embedding, or calling
    ## from C or JIT code.
    full{.align: 64.}: Atomic[bool]             # true while an item is buffered
    itemSize*: uint8                            # size in bytes of the buffered item type
    buffer*{.align: 8.}: UncheckedArray[byte]   # inline item storage; must stay the last field
|
||||
|
||||
# Copying a channel is rejected at compile time through the `error` pragma.
proc `=`(
  dest: var ChannelSPSCSingle,
  source: ChannelSPSCSingle
) {.error: "A channel cannot be copied".}
|
||||
|
||||
proc initialize*(chan: var ChannelSPSCSingle, itemsize: SomeInteger) {.inline.} =
  ## Prepares an empty channel for items of `itemsize` bytes
  ## (`itemsize` must fit in a `uint8`).
  ##
  ## If ChannelSPSCSingle is embedded in another data structure, be aware
  ## that it should be the last field because it ends with an UncheckedArray.
  preCondition: itemsize.int in 0 .. int high(uint8)

  chan.itemSize = uint8(itemsize)
  store(chan.full, false, moRelaxed)
|
||||
|
||||
func isEmpty*(chan: var ChannelSPSCSingle): bool {.inline.} =
  ## Returns true when no item is currently buffered in the channel.
  let occupied = chan.full.load(moAcquire)
  result = not occupied
|
||||
|
||||
func tryRecv*[T](chan: var ChannelSPSCSingle, dst: var T): bool {.inline.} =
  ## Try to move the buffered item out of the channel into `dst`.
  ## Returns true if an item was received, false if the channel was empty
  ## (in which case `dst` is left untouched).
  ##
  ## ⚠ Use only in the consumer thread that reads from the channel.
  preCondition: (sizeof(T) == chan.itemsize.int) or
                  # Support dummy object
                  (sizeof(T) == 0 and chan.itemsize == 1)

  # Acquire load: pairs with the release store done by `trySend`.
  if chan.full.load(moAcquire):
    dst = cast[ptr T](chan.buffer.addr)[]
    # Release store: the slot is handed back only after the copy above.
    chan.full.store(false, moRelease)
    result = true
  else:
    result = false
|
||||
|
||||
func trySend*[T](chan: var ChannelSPSCSingle, src: sink T): bool {.inline.} =
  ## Try to place `src` into the channel's single slot.
  ## Returns true if the item was stored, false if the slot was already
  ## occupied (nothing is written in that case).
  ##
  ## ⚠ Use only in the producer thread that writes to the channel.
  preCondition: (sizeof(T) == chan.itemsize.int) or
                  # Support dummy object
                  (sizeof(T) == 0 and chan.itemsize == 1)

  # Acquire load: pairs with the release store done by `tryRecv`.
  if chan.full.load(moAcquire):
    result = false
  else:
    cast[ptr T](chan.buffer.addr)[] = src
    # Release store: publish the flag only after the item has been written.
    chan.full.store(true, moRelease)
    result = true
|
||||
|
||||
# Sanity checks
|
||||
# ------------------------------------------------------------------------------
|
||||
when isMainModule:
|
||||
import ../memory/memory_pools
|
||||
|
||||
when not compileOption("threads"):
|
||||
{.error: "This requires --threads:on compilation flag".}
|
||||
|
||||
template sendLoop[T](chan: var ChannelSPSCSingle,
|
||||
data: sink T,
|
||||
body: untyped): untyped =
|
||||
while not chan.trySend(data):
|
||||
body
|
||||
|
||||
template recvLoop[T](chan: var ChannelSPSCSingle,
|
||||
data: var T,
|
||||
body: untyped): untyped =
|
||||
while not chan.tryRecv(data):
|
||||
body
|
||||
|
||||
type
|
||||
ThreadArgs = object
|
||||
ID: WorkerKind
|
||||
chan: ptr ChannelSPSCSingle
|
||||
|
||||
WorkerKind = enum
|
||||
Sender
|
||||
Receiver
|
||||
|
||||
template Worker(id: WorkerKind, body: untyped): untyped {.dirty.} =
|
||||
if args.ID == id:
|
||||
body
|
||||
|
||||
proc thread_func(args: ThreadArgs) =
|
||||
|
||||
# Worker RECEIVER:
|
||||
# ---------
|
||||
# <- chan
|
||||
# <- chan
|
||||
# <- chan
|
||||
#
|
||||
# Worker SENDER:
|
||||
# ---------
|
||||
# chan <- 42
|
||||
# chan <- 53
|
||||
# chan <- 64
|
||||
Worker(Receiver):
|
||||
var val: int
|
||||
for j in 0 ..< 10:
|
||||
args.chan[].recvLoop(val):
|
||||
# Busy loop, in prod we might want to yield the core/thread timeslice
|
||||
discard
|
||||
echo " Receiver got: ", val
|
||||
doAssert val == 42 + j*11
|
||||
|
||||
Worker(Sender):
|
||||
doAssert args.chan.full.load(moRelaxed) == false
|
||||
for j in 0 ..< 10:
|
||||
let val = 42 + j*11
|
||||
args.chan[].sendLoop(val):
|
||||
# Busy loop, in prod we might want to yield the core/thread timeslice
|
||||
discard
|
||||
echo "Sender sent: ", val
|
||||
|
||||
proc main() =
|
||||
echo "Testing if 2 threads can send data"
|
||||
echo "-----------------------------------"
|
||||
var threads: array[2, Thread[ThreadArgs]]
|
||||
var pool: TLPoolAllocator
|
||||
pool.initialize()
|
||||
|
||||
var chan = pool.borrow(ChannelSPSCSingle)
|
||||
chan[].initialize(itemSize = sizeof(int))
|
||||
|
||||
createThread(threads[0], thread_func, ThreadArgs(ID: Receiver, chan: chan))
|
||||
createThread(threads[1], thread_func, ThreadArgs(ID: Sender, chan: chan))
|
||||
|
||||
joinThread(threads[0])
|
||||
joinThread(threads[1])
|
||||
|
||||
recycle(chan)
|
||||
|
||||
echo "-----------------------------------"
|
||||
echo "Success"
|
||||
|
||||
main()
|
|
@ -0,0 +1,181 @@
|
|||
# Nim-Taskpools
|
||||
# Copyright (c) 2021 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# chase_lev_deques.nim
|
||||
# --------------------
|
||||
# This file implements a Chase-Lev deque
|
||||
# This is a single-producer multi-consumer concurrent queue
|
||||
# for work-stealing schedulers.
|
||||
#
|
||||
# Papers:
|
||||
# - Dynamic Circular Work-Stealing Deque
|
||||
#   David Chase, Yossi Lev, 2005
|
||||
# https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf
|
||||
#
|
||||
# - Correct and Efficient Work-Stealing for Weak Memory Models
|
||||
# Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013
|
||||
# https://fzn.fr/readings/ppopp13.pdf
|
||||
#
|
||||
# We straight translate the second paper which includes formal proofs of correctness,
|
||||
# and uses modern C++11 code.
|
||||
#
|
||||
# A Chase-lev dequeue implements the following push, pop, steal.
|
||||
#
|
||||
# top bottom
|
||||
# ---------------------------------
|
||||
# | | | | <- push()
|
||||
# steal() <- | Task 0 | Task 1 | Task 2 | -> pop()
|
||||
# any thread | | | | owner-only
|
||||
# ---------------------------------
|
||||
#
|
||||
# To reduce contention, stealing is done on the opposite end from push/pop
|
||||
# so that there is a race only for the very last task.
|
||||
|
||||
{.push raises: [].}
|
||||
|
||||
import
|
||||
system/ansi_c,
|
||||
std/[locks, typetraits, atomics],
|
||||
./instrumentation/[contracts, loggers]
|
||||
|
||||
type
|
||||
Buf[T] = object
|
||||
## Backend buffer of a ChaseLevDeque
|
||||
## `capacity` MUST be a power of 2
|
||||
capacity: int
|
||||
mask: int # == capacity-1 implies (i and mask) == (i mod capacity)
|
||||
rawBuffer: UncheckedArray[Atomic[T]]
|
||||
|
||||
ChaseLevDeque*[T] = object
|
||||
## This implements a lock-free, growable, work-stealing deque.
|
||||
## The owning thread enqueues and dequeues at the bottom
|
||||
## Foreign threads steal at the top.
|
||||
##
|
||||
## Default queue size is 8
|
||||
## Queue can grow to handle up to 34 359 738 368 tasks in flights
|
||||
## TODO:
|
||||
## with --gc:arc / --gc:orc, use a seq instead of a fixed max size.
|
||||
top {.align: 64.}: Atomic[int]
|
||||
bottom: Atomic[int]
|
||||
buf: Atomic[ptr Buf[T]]
|
||||
garbage: array[32, ptr Buf[T]] # up to 34 359 738 368 sized buffer
|
||||
garbageUsed: uint8
|
||||
|
||||
func isPowerOfTwo(n: int): bool {.inline.} =
  ## Returns true when `n` is non-zero and `n and (n - 1)` is zero,
  ## i.e. exactly one bit of `n` is set.
  let lowestBitCleared = n and (n - 1)
  result = lowestBitCleared == 0 and n != 0
|
||||
|
||||
proc newBuf(T: typedesc, capacity: int): ptr Buf[T] =
  ## Allocate (with `c_malloc`) a backing buffer holding `capacity` slots and
  ## zero all of its slots. `capacity` must be a power of two so that
  ## `mask` can replace the modulo when indexing.
  ##
  ## The caller owns the returned memory and releases it with `c_free`.

  # Tasks have a destructor
  # static:
  #   doAssert supportsCopyMem(T), $T & " must be a (POD) plain-old-data type: no seq, string, ref."

  preCondition: capacity.isPowerOfTwo()

  # The slots are `Atomic[T]`, not `T`: size the allocation and the zeroing
  # from the actual element type so both stay correct should `Atomic[T]`
  # ever be larger than `T`.
  let slotsBytes = sizeof(Atomic[T]) * capacity

  result = cast[ptr Buf[T]](
    # 2*sizeof(int) accounts for the `capacity` and `mask` header fields.
    c_malloc(csize_t 2*sizeof(int) + slotsBytes)
  )

  result.capacity = capacity
  result.mask = capacity - 1
  result.rawBuffer.addr.zeroMem(slotsBytes)
|
||||
|
||||
proc `[]=`[T](buf: var Buf[T], index: int, item: T) {.inline.} =
|
||||
buf.rawBuffer[index and buf.mask].store(item, moRelaxed)
|
||||
|
||||
proc `[]`[T](buf: var Buf[T], index: int): T {.inline.} =
|
||||
result = buf.rawBuffer[index and buf.mask].load(moRelaxed)
|
||||
|
||||
proc grow[T](deque: var ChaseLevDeque[T], buf: var ptr Buf[T], top, bottom: int) {.inline.} =
  ## Double the buffer size
  ## bottom is the last item index
  ##
  ## To handle race-conditions the current "top", "bottom" and "buf"
  ## have to be saved before calling this procedure.
  ## It reads and writes the "deque.buf", "deque.garbage" and "deque.garbageUsed"
  ##
  ## On return `buf` points to the new, larger buffer. The previous buffer is
  ## not freed here: it is recorded in `deque.garbage` and released by
  ## `teardown`, since `steal` may still be reading through the old pointer.

  # Read -> Copy -> Update
  var tmp = newBuf(T, buf.capacity*2)
  for i in top ..< bottom:
    tmp[][i] = buf[][i]

  # This requires 68+ billions tasks in flight (per-thread)
  ascertain: deque.garbageUsed.int < deque.garbage.len

  # Retire the old buffer and advance the slot counter; without the increment
  # every retired buffer would overwrite slot 0 and `teardown` would free none.
  deque.garbage[deque.garbageUsed] = buf
  inc deque.garbageUsed

  swap(buf, tmp)
  deque.buf.store(buf, moRelaxed)
|
||||
|
||||
# Public API
|
||||
# ---------------------------------------------------
|
||||
|
||||
proc init*[T](deque: var ChaseLevDeque[T]) =
  ## Initializes a new Chase-lev work-stealing deque:
  ## every field is reset, then an initial buffer of 8 slots is installed.
  reset(deque)
  let initialBuf = newBuf(T, 8)
  deque.buf.store(initialBuf, moRelaxed)
|
||||
|
||||
proc teardown*[T](deque: var ChaseLevDeque[T]) =
  ## Teardown a Chase-lev work-stealing deque:
  ## frees the first `garbageUsed` retired buffers, then the live buffer.
  var slot = 0
  while slot < int(deque.garbageUsed):
    c_free(deque.garbage[slot])
    inc slot
  let liveBuf = deque.buf.load(moRelaxed)
  c_free(liveBuf)
|
||||
|
||||
proc push*[T](deque: var ChaseLevDeque[T], item: T) =
  ## Enqueue an item at the bottom, growing the buffer first when it is full.
  ## The item should not be used afterwards.

  # Snapshot the indices and the buffer once to handle race conditions.
  let bottomIdx = deque.bottom.load(moRelaxed)
  let topIdx = deque.top.load(moAcquire)
  var ring = deque.buf.load(moRelaxed)

  if bottomIdx - topIdx >= ring.capacity:
    # Full queue
    deque.grow(ring, topIdx, bottomIdx)

  ring[][bottomIdx] = item
  # The release fence precedes the store that makes the new slot visible.
  fence(moRelease)
  deque.bottom.store(bottomIdx + 1, moRelaxed)
|
||||
|
||||
proc pop*[T](deque: var ChaseLevDeque[T]): T =
  ## Dequeue an item at the bottom.
  ## Returns `default(T)` when the deque is empty or when the race for the
  ## very last item is lost.

  # Snapshot the index and the buffer once to handle race conditions.
  let bottomIdx = deque.bottom.load(moRelaxed) - 1
  let ring = deque.buf.load(moRelaxed)

  # Reserve the bottom slot before looking at `top`.
  deque.bottom.store(bottomIdx, moRelaxed)
  fence(moSequentiallyConsistent)
  var topIdx = deque.top.load(moRelaxed)

  if topIdx > bottomIdx:
    # Empty queue: undo the reservation.
    deque.bottom.store(bottomIdx + 1, moRelaxed)
    return default(T)

  # Non-empty queue.
  result = ring[][bottomIdx]
  if topIdx == bottomIdx:
    # Single last element in queue: compete for it through `top`.
    if not compare_exchange(deque.top, topIdx, topIdx + 1, moSequentiallyConsistent, moRelaxed):
      # Failed race.
      result = default(T)
    # Whatever the outcome, the deque is now empty: restore `bottom`.
    deque.bottom.store(bottomIdx + 1, moRelaxed)
|
||||
|
||||
proc steal*[T](deque: var ChaseLevDeque[T]): T =
  ## Dequeue an item at the top.
  ## Returns `default(T)` when the deque is empty or when the item was
  ## claimed concurrently (the compare-exchange on `top` failed).
  var topIdx = deque.top.load(moAcquire)
  fence(moSequentiallyConsistent)
  let bottomIdx = deque.bottom.load(moAcquire)
  result = default(T)

  if topIdx >= bottomIdx:
    # Empty queue.
    return

  # Non-empty queue: read the candidate, then try to claim it.
  let ring = deque.buf.load(moConsume)
  result = ring[][topIdx]
  if not compare_exchange(deque.top, topIdx, topIdx + 1, moSequentiallyConsistent, moRelaxed):
    # Failed race.
    result = default(T)
|
|
@ -0,0 +1,82 @@
|
|||
# Nim-Taskpools
|
||||
# Copyright (c) 2021 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# event_notifier.nim
|
||||
# ------------------
|
||||
# This file implements an event notifier.
|
||||
# It allows putting idle threads to sleep or waking them up.
|
||||
|
||||
# Design
|
||||
# Currently it is a shared lock + condition variable (a.k.a. a semaphore)
|
||||
#
|
||||
# In the future an eventcount might be considered, an event count significantly
|
||||
# reduces scheduler overhead by removing lock acquisition from critical path.
|
||||
# See overview and implementations at
|
||||
# https://gist.github.com/mratsim/04a29bdd98d6295acda4d0677c4d0041
|
||||
#
|
||||
# Weave "one event-notifier per thread" further reduces overhead
|
||||
# but requires the threadpool to be message-passing based.
|
||||
# https://github.com/mratsim/weave/blob/a230cce98a8524b2680011e496ec17de3c1039f2/weave/cross_thread_com/event_notifiers.nim
|
||||
|
||||
import
|
||||
std/locks,
|
||||
./instrumentation/contracts
|
||||
|
||||
type
|
||||
EventNotifier* = object
|
||||
## This data structure allows threads to be parked when no events are pending
|
||||
## and woken up when a new event is.
|
||||
# Lock must be aligned to a cache-line to avoid false-sharing.
|
||||
lock{.align: 64.}: Lock
|
||||
cond: Cond
|
||||
parked: int
|
||||
signals: int
|
||||
|
||||
func initialize*(en: var EventNotifier) {.inline.} =
|
||||
## Initialize the event notifier
|
||||
en.lock.initLock()
|
||||
en.cond.initCond()
|
||||
en.parked = 0
|
||||
en.signals = 0
|
||||
|
||||
func `=destroy`*(en: var EventNotifier) {.inline.} =
|
||||
en.cond.deinitCond()
|
||||
en.lock.deinitLock()
|
||||
|
||||
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
|
||||
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
|
||||
|
||||
proc park*(en: var EventNotifier) {.inline.} =
  ## Block the calling thread until a signal is available, then consume it.
  ## The wait goes through the condition variable, guarded by `en.lock`.
  acquire(en.lock)
  preCondition: en.signals == 0

  inc en.parked
  while en.signals == 0: # handle spurious wakeups
    wait(en.cond, en.lock)
  dec en.parked
  dec en.signals

  postCondition: en.signals >= 0
  release(en.lock)
|
||||
|
||||
proc notify*(en: var EventNotifier) {.inline.} =
  ## Unpark a thread if any is parked.
  ## When no thread is parked, nothing is recorded and the call is a no-op.
  acquire(en.lock)

  let someoneParked = en.parked > 0
  if someoneParked:
    inc en.signals
    signal(en.cond)

  release(en.lock)
|
||||
|
||||
proc getParked*(en: var EventNotifier): int {.inline.} =
  ## Get the number of parked threads (read while holding the lock).
  acquire(en.lock)
  let parkedNow = en.parked
  release(en.lock)
  result = parkedNow
|
|
@ -0,0 +1,71 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
./channels_spsc_single,
|
||||
system/ansi_c,
|
||||
./instrumentation/contracts,
|
||||
std/os
|
||||
|
||||
{.push gcsafe.}
|
||||
|
||||
type
|
||||
Flowvar*[T] = object
|
||||
## A Flowvar is a placeholder for a future result that may be computed in parallel
|
||||
# Flowvar are optimized when containing a ptr type.
|
||||
# They take less size in memory by testing isNil
|
||||
# instead of having an extra atomic bool
|
||||
# They also use type-erasure to avoid having duplicate code
|
||||
# due to generic monomorphization.
|
||||
chan: ptr ChannelSPSCSingle
|
||||
|
||||
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
|
||||
#
|
||||
# Unfortunately we cannot prevent this easily as internally
|
||||
# we need a copy:
|
||||
# - nim-taskpools level when doing toTask(fnCall(args, fut)) and then returning fut. (Can be worked around with copyMem)
|
||||
# - in std/tasks (need upstream workaround)
|
||||
|
||||
proc newFlowVar*(T: typedesc): Flowvar[T] {.inline.} =
  ## Allocate (zeroed, with `c_calloc`) and initialize the single-item channel
  ## that will carry a result of type `T`.
  ## The memory is released by `cleanup`, which `sync` calls.

  # The item buffer does not start right after the `full` flag and the
  # `itemSize` byte: the `buffer` field is declared with an 8-byte alignment,
  # so `2 + sizeof(T)` would under-allocate and sending/receiving the item
  # would touch memory past the end of the allocation.
  # Reserve the whole channel header, then room for the item.
  let size = sizeof(ChannelSPSCSingle) + sizeof(T)
  result.chan = cast[ptr ChannelSPSCSingle](c_calloc(1, csize_t size))
  result.chan[].initialize(sizeof(T))
|
||||
|
||||
proc cleanup(fv: sink Flowvar) {.inline.} =
|
||||
if not fv.chan.isNil:
|
||||
c_free(fv.chan)
|
||||
|
||||
func isSpawned*(fv: Flowvar): bool {.inline.} =
|
||||
## Returns true if a flowvar is spawned
|
||||
## This may be useful for recursive algorithms that
|
||||
## may or may not spawn a flowvar depending on a condition.
|
||||
## This is similar to Option or Maybe types
|
||||
return not fv.chan.isNil
|
||||
|
||||
proc readyWith*[T](fv: Flowvar[T], childResult: T) {.inline.} =
|
||||
## Send the Flowvar result from the child thread processing the task
|
||||
## to its parent thread.
|
||||
let resultSent {.used.} = fv.chan[].trySend(childResult)
|
||||
postCondition: resultSent
|
||||
|
||||
template tryComplete*[T](fv: Flowvar, parentResult: var T): bool =
|
||||
fv.chan[].tryRecv(parentResult)
|
||||
|
||||
func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
|
||||
## Returns true if the result of a Flowvar is ready.
|
||||
## In that case `sync` will not block.
|
||||
## Otherwise the current thread will block to help on all the pending tasks
|
||||
## until the Flowvar is ready.
|
||||
not fv.chan[].isEmpty()
|
||||
|
||||
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
|
||||
## Blocks the current thread until the flowvar is available
|
||||
## and returned.
|
||||
## The thread is not idle and will complete pending tasks.
|
||||
mixin forceFuture
|
||||
forceFuture(fv, result)
|
||||
cleanup(fv)
|
|
@ -0,0 +1,113 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import macros, os, strutils
|
||||
|
||||
{.used.}
|
||||
|
||||
# A simple design-by-contract API
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
# Everything should be a template that doesn't produce any code
|
||||
# when WV_Asserts is not defined.
|
||||
# Those checks are controlled by a custom flag instead of
|
||||
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
|
||||
# Furthermore, we want them to be very lightweight on performance
|
||||
|
||||
# TODO auto-add documentation
|
||||
|
||||
proc inspectInfix(node: NimNode): NimNode =
|
||||
## Inspect an expression,
|
||||
## Returns the AST as string with runtime values inlined
|
||||
## from infix operators inlined.
|
||||
# TODO: pointer and custom type need a default repr
|
||||
# otherwise we can only resolve simple expressions
|
||||
proc inspect(node: NimNode): NimNode =
|
||||
case node.kind:
|
||||
of nnkInfix:
|
||||
return newCall(
|
||||
bindSym"&",
|
||||
newCall(
|
||||
bindSym"&",
|
||||
newCall(ident"$", inspect(node[1])),
|
||||
newLit(" " & $node[0] & " ")
|
||||
),
|
||||
newCall(ident"$", inspect(node[2]))
|
||||
)
|
||||
of {nnkIdent, nnkSym}:
|
||||
return node
|
||||
of nnkDotExpr:
|
||||
return quote do:
|
||||
when `node` is pointer or
|
||||
`node` is ptr or
|
||||
`node` is (proc):
|
||||
toHex(cast[ByteAddress](`node`) and 0xffff_ffff)
|
||||
else:
|
||||
$(`node`)
|
||||
of nnkPar:
|
||||
result = nnkPar.newTree()
|
||||
for sub in node:
|
||||
result.add inspect(sub)
|
||||
else:
|
||||
return node.toStrLit()
|
||||
return inspect(node)
|
||||
|
||||
macro assertContract(
|
||||
checkName: static string,
|
||||
predicate: untyped) =
|
||||
let lineinfo = lineinfoObj(predicate)
|
||||
let file = extractFilename(lineinfo.filename)
|
||||
|
||||
var strippedPredicate: NimNode
|
||||
if predicate.kind == nnkStmtList:
|
||||
assert predicate.len == 1, "Only one-liner conditions are supported"
|
||||
strippedPredicate = predicate[0]
|
||||
else:
|
||||
strippedPredicate = predicate
|
||||
|
||||
let debug = "\n Contract violated for " & checkName & " at " & file & ":" & $lineinfo.line &
|
||||
"\n " & $strippedPredicate.toStrLit &
|
||||
"\n The following values are contrary to expectations:" &
|
||||
"\n "
|
||||
let values = inspectInfix(strippedPredicate)
|
||||
let myID = quote do:
|
||||
when declared(myID):
|
||||
$myID()
|
||||
else:
|
||||
"N/A"
|
||||
|
||||
result = quote do:
|
||||
{.noSideEffect.}:
|
||||
when compileOption("assertions"):
|
||||
assert(`predicate`, `debug` & $`values` & " [Worker " & `myID` & "]\n")
|
||||
elif defined(WV_Asserts):
|
||||
if unlikely(not(`predicate`)):
|
||||
raise newException(AssertionError, `debug` & $`values` & '\n')
|
||||
|
||||
# A way to get the caller function would be nice.
|
||||
|
||||
template preCondition*(require: untyped) =
|
||||
## Optional runtime check at the start of a function
|
||||
assertContract("pre-condition", require)
|
||||
|
||||
template postCondition*(ensure: untyped) =
|
||||
## Optional runtime check before returning from a function
|
||||
assertContract("post-condition", ensure)
|
||||
|
||||
template ascertain*(check: untyped) =
|
||||
## Optional runtime check in the middle of processing
|
||||
assertContract("transient condition", check)
|
||||
|
||||
# Sanity checks
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
when isMainModule:
|
||||
proc assertGreater(x, y: int) =
|
||||
postcondition(x > y)
|
||||
|
||||
# We should get a nicely formatted exception
|
||||
assertGreater(10, 12)
|
|
@ -0,0 +1,22 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import system/ansi_c
|
||||
|
||||
{.used.}
|
||||
|
||||
template log*(args: varargs[untyped]): untyped =
|
||||
c_printf(args)
|
||||
flushFile(stdout)
|
||||
|
||||
template debugTermination*(body: untyped): untyped =
|
||||
when defined(TP_DebugTermination) or defined(TP_Debug):
|
||||
{.noSideEffect, gcsafe.}: body
|
||||
|
||||
template debug*(body: untyped): untyped =
|
||||
when defined(TP_Debug):
|
||||
{.noSideEffect, gcsafe.}: body
|
|
@ -0,0 +1,52 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# Thread primitives
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
type
|
||||
Pthread {.importc: "pthread_t", header: "<sys/types.h>".} = distinct culong
|
||||
CpuSet {.byref, importc: "cpu_set_t", header: "<sched.h>".} = object
|
||||
|
||||
proc pthread_self(): Pthread {.header: "<pthread.h>".}
|
||||
|
||||
proc pthread_setaffinity_np(
|
||||
thread: Pthread,
|
||||
cpuset_size: int,
|
||||
cpuset: CpuSet
|
||||
) {.header: "<pthread.h>".}
|
||||
## Limit specified `thread` to run only on the processors
|
||||
## represented in `cpuset`
|
||||
|
||||
# Note CpuSet is always passed by (hidden) pointer
|
||||
|
||||
proc cpu_zero(cpuset: var CpuSet) {.importc: "CPU_ZERO", header: "<sched.h>".}
|
||||
## Clears the set so that it contains no CPU
|
||||
proc cpu_set(cpu: cint, cpuset: var CpuSet) {.importc: "CPU_SET", header: "<sched.h>".}
|
||||
## Add CPU to set
|
||||
|
||||
# Affinity
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
# Nim doesn't allow the main thread to set its own affinity
|
||||
|
||||
proc set_thread_affinity(t: Pthread, cpu: int32) {.inline.}=
|
||||
when defined(osx) or defined(android):
|
||||
{.warning: "To improve performance we should pin threads to cores.\n" &
|
||||
"This is not possible with MacOS or Android.".}
|
||||
# Note: on Android it's even more complex due to the Big.Little architecture
|
||||
# with cores with different performance profiles to save on battery
|
||||
else:
|
||||
var cpuset {.noinit.}: CpuSet
|
||||
|
||||
cpu_zero(cpuset)
|
||||
cpu_set(cpu, cpuset)
|
||||
pthread_setaffinity_np(t, sizeof(CpuSet), cpuset)
|
||||
|
||||
proc pinToCpu*(cpu: int32) {.inline.} =
|
||||
## Set the affinity of the main thread (the calling thread)
|
||||
set_thread_affinity(pthread_self(), cpu)
|
|
@ -0,0 +1,18 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import winlean
|
||||
|
||||
when not compileOption("threads"):
|
||||
{.error: "This requires --threads:on compilation flag".}
|
||||
|
||||
proc setThreadAffinityMask(hThread: Handle, dwThreadAffinityMask: uint) {.
  importc: "SetThreadAffinityMask", stdcall, header: "<windows.h>".}
  ## Win32 `SetThreadAffinityMask`; `hThread` must be a thread *handle*.

proc currentThreadPseudoHandle(): Handle {.
  importc: "GetCurrentThread", stdcall, dynlib: "kernel32".}
  ## Win32 `GetCurrentThread`: pseudo-handle designating the calling thread.

proc pinToCpu*(cpu: int32) {.inline.} =
  ## Set the affinity of the calling thread to the single CPU `cpu`.
  # `getThreadID()` yields a thread identifier, which is not a handle and
  # cannot be passed as `hThread`; use the current-thread pseudo-handle.
  setThreadAffinityMask(currentThreadPseudoHandle(), uint(1 shl cpu))
|
|
@ -0,0 +1,53 @@
|
|||
# Synchronization Barriers
|
||||
|
||||
OSX does not implement pthread_barrier as it's an optional part
|
||||
of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch.
|
||||
|
||||
So we need to roll our own with a POSIX compatible API.
|
||||
|
||||
## Glibc barriers, design bug and implementation
|
||||
|
||||
> Note: due to GPL licensing, do not lift the code.
|
||||
> Not that we can as it is heavily dependent on futexes
|
||||
> which are not available on OSX
|
||||
|
||||
We need to make sure that we don't hit the same bug
|
||||
as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065
|
||||
which seems to be an issue in some of the barrier implementations
|
||||
in the wild.
|
||||
|
||||
The design of Glibc barriers is here:
|
||||
https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD
|
||||
|
||||
And implementation:
|
||||
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD
|
||||
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD
|
||||
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD
|
||||
|
||||
## Synchronization barrier techniques
|
||||
|
||||
This article goes over the techniques of
|
||||
"pool barrier" and "ticket barrier"
|
||||
https://locklessinc.com/articles/barriers/
|
||||
to reach 2x to 20x the speed of pthreads barrier
|
||||
|
||||
This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf
|
||||
goes over
|
||||
- centralized barrier with sense reversal
|
||||
- combining tree barrier
|
||||
- dissemination barrier
|
||||
- tournament barrier
|
||||
- scalable tree barrier
|
||||
More courses:
|
||||
- http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf
|
||||
|
||||
It however requires lightweight mutexes like Linux futexes
|
||||
that OSX lacks.
|
||||
|
||||
This post goes over lightweight mutexes like Benaphores (from BeOS)
|
||||
https://preshing.com/20120226/roll-your-own-lightweight-mutex/
|
||||
|
||||
This gives a few barrier implementations
|
||||
http://gallium.inria.fr/~maranget/MPRI/02.pdf
|
||||
and refers to the Cubicle paper for formally verifying synchronization barriers
|
||||
http://cubicle.lri.fr/papers/jfla2014.pdf (in French)
|
|
@ -0,0 +1,69 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
when defined(windows):
  import ./barriers_windows
  when compileOption("assertions"):
    import os

  type SyncBarrier* = SynchronizationBarrier

  proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.inline.} =
    ## Initialize a synchronization barrier that will block ``threadCount`` threads
    ## before release.
    ## With assertions enabled, a failed initialization raises an OSError.
    let err {.used.} = InitializeSynchronizationBarrier(syncBarrier, threadCount, -1)
    when compileOption("assertions"):
      if err != 1:
        assert err == 0
        raiseOSError(osLastError())

  proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
    ## Blocks thread at a synchronization barrier.
    ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
    ## and false for the others.
    result = bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE)

  proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
    ## Deletes a synchronization barrier.
    ## This assumes no race between waiting at a barrier and deleting it,
    ## and reuse of the barrier requires initialization.
    DeleteSynchronizationBarrier(syncBarrier.addr)

else:
  import ./barriers_posix
  when compileOption("assertions"):
    import os

  type SyncBarrier* = PthreadBarrier

  proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.inline.} =
    ## Initialize a synchronization barrier that will block ``threadCount`` threads
    ## before release.
    ## With assertions enabled, a non-zero error code raises an OSError.
    let err {.used.} = pthread_barrier_init(syncBarrier, nil, threadCount)
    when compileOption("assertions"):
      if err != 0:
        raiseOSError(OSErrorCode(err))

  proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
    ## Blocks thread at a synchronization barrier.
    ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
    ## and false for the others.
    ## With assertions enabled, an error code raises an OSError.
    let err {.used.} = pthread_barrier_wait(syncBarrier)
    when compileOption("assertions"):
      # pthread functions report failures as positive error numbers, so a
      # `< 0` test never fires: anything that is neither the serial-thread
      # marker nor 0 is an error.
      if err != PTHREAD_BARRIER_SERIAL_THREAD and err != 0:
        raiseOSError(OSErrorCode(err))
    result = if err == PTHREAD_BARRIER_SERIAL_THREAD: true
             else: false

  proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
    ## Deletes a synchronization barrier.
    ## This assumes no race between waiting at a barrier and deleting it,
    ## and reuse of the barrier requires initialization.
    ## With assertions enabled, a non-zero error code raises an OSError.
    let err {.used.} = pthread_barrier_destroy(syncBarrier)
    when compileOption("assertions"):
      # Success is 0; failures are positive error numbers.
      if err != 0:
        raiseOSError(OSErrorCode(err))
|
|
@ -0,0 +1,88 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# OSX doesn't implement pthread_barrier_t
|
||||
# It's an optional part of the POSIX standard
|
||||
#
|
||||
# This is a manual implementation of a sense reversing barrier
|
||||
|
||||
import locks
|
||||
|
||||
type
  # Non-negative 32-bit counter used for the barrier bookkeeping.
  Natural32 = range[0'i32..high(int32)]

  Errno* = cint

  PthreadAttr* = object
    ## Placeholder: the shim below accepts an attribute pointer but never reads it.

  PthreadBarrier* = object
    ## Sense-reversing barrier
    ## (The Art of Multiprocessor Programming, Maurice Herlihy & Nir Shavit).
    ## All mutable state except `count` is guarded by `lock`.

    lock: Lock                       # A spinlock on an Atomic would be an alternative
    cond {.guard: lock.}: Cond       # Waiters sleep here until the sense flips
    sense {.guard: lock.}: bool      # Current phase; int32 might avoid zero-extension cost in registers
    left {.guard: lock.}: Natural32  # Threads still missing before the barrier opens
    count: Natural32                 # Threads required in total to open the barrier

const
  PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1)
|
||||
|
||||
proc pthread_cond_broadcast(cond: var Cond): Errno {.header:"<pthread.h>".}
  ## Binding to the C `pthread_cond_broadcast`.
  ## Nim's `locks` module only signals a single waiter,
  ## whereas the barrier has to unblock all of them.
|
||||
|
||||
proc broadcast(cond: var Cond) {.inline.} =
  ## Wake every thread waiting on `cond`; the returned status is ignored.
  discard cond.pthread_cond_broadcast()
|
||||
|
||||
func pthread_barrier_init*(
       barrier: var PthreadBarrier,
       attr: ptr PthreadAttr,
       count: range[0'i32..high(int32)]
     ): Errno =
  ## Initialise `barrier` so that it opens once `count` threads are waiting.
  ## `attr` is accepted for signature compatibility and is not read.
  ## The result is left at its default value, 0.
  initLock(barrier.lock)
  barrier.count = count
  {.locks: [barrier.lock].}:
    initCond(barrier.cond)
    barrier.left = count
    # `barrier.sense` is deliberately left untouched: waiters only compare
    # it against the value they observed on arrival.
|
||||
|
||||
proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno =
  ## Wait on `barrier`.
  ## The thread that completes the group gets PTHREAD_BARRIER_SERIAL_THREAD,
  ## every other thread gets 0.
  acquire(barrier.lock)
  {.locks: [barrier.lock].}:
    # Phase observed on arrival; leaving is allowed once it has flipped.
    let arrivalSense = barrier.sense
    dec barrier.left

    if barrier.left != 0:
      # Not everybody is here yet: sleep until the phase flips.
      # Looping protects against spurious wakeups.
      while barrier.sense == arrivalSense:
        barrier.cond.wait(barrier.lock)
      result = Errno(0)
    else:
      # Last arrival: re-arm the counter, flip the phase and release everyone.
      barrier.left = barrier.count
      barrier.sense = not arrivalSense
      broadcast(barrier.cond)
      result = PTHREAD_BARRIER_SERIAL_THREAD
  release(barrier.lock)
|
||||
|
||||
proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno =
  ## Release the condition variable and the lock owned by `barrier`.
  ## The result is left at its default value, 0.
  {.locks: [barrier.lock].}:
    deinitCond(barrier.cond)
  deinitLock(barrier.lock)
|
||||
|
||||
# TODO: tests
|
|
@ -0,0 +1,51 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# Abstractions over POSIX barriers (non-)implementations
|
||||
|
||||
when not compileOption("threads"):
|
||||
{.error: "This requires --threads:on compilation flag".}
|
||||
|
||||
# Types
|
||||
# -------------------------------------------------------
|
||||
|
||||
when not defined(osx):
  # Platforms with a native barrier: bind the C types and constant directly.
  type
    PthreadAttr* {.byref, importc: "pthread_attr_t", header: "<sys/types.h>".} = object
    PthreadBarrier* {.byref, importc: "pthread_barrier_t", header: "<sys/types.h>".} = object

    Errno* = cint

  var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno
else:
  # macOS: re-export the hand-written replacement types.
  import ./barriers_macos
  export PthreadAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD
|
||||
|
||||
# Pthread
|
||||
# -------------------------------------------------------
|
||||
when not defined(osx):
  proc pthread_barrier_init*(
         barrier: PthreadBarrier,
         attr: PthreadAttr or ptr PthreadAttr,
         count: range[0'i32..high(int32)]
       ): Errno {.header: "<pthread.h>".}
    ## Initialise `barrier` using the attributes in `attr`.
    ## The barrier opens once `count` waiters have arrived.

  proc pthread_barrier_destroy*(
         barrier: sink PthreadBarrier
       ): Errno {.header: "<pthread.h>".}
    ## Destroy a `barrier` that was previously initialised dynamically.

  proc pthread_barrier_wait*(
         barrier: var PthreadBarrier
       ): Errno {.header: "<pthread.h>".}
    ## Wait on `barrier`.
    ## One arbitrary thread receives PTHREAD_BARRIER_SERIAL_THREAD,
    ## the others receive 0; an errno value is returned on error.
else:
  # macOS: the procs come from the shim module imported for the types.
  export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy
|
|
@ -0,0 +1,31 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import winlean
|
||||
|
||||
# Technically in <synchapi.h> but MSVC complains with
|
||||
# @m..@s..@sweave@sscheduler.nim.cpp
|
||||
# C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture
|
||||
|
||||
type
  SynchronizationBarrier* {.importc: "SYNCHRONIZATION_BARRIER", header: "<windows.h>".} = object
    ## Binding to the Win32 `SYNCHRONIZATION_BARRIER` structure.

var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "<windows.h>".}: DWORD
  ## Flag that skips expensive checks on barrier entry when the barrier is never deleted.

proc EnterSynchronizationBarrier*(
       lpBarrier: var SynchronizationBarrier,
       dwFlags: DWORD
     ): WINBOOL {.importc, stdcall, header: "<windows.h>".}

proc DeleteSynchronizationBarrier*(
       lpBarrier: ptr SynchronizationBarrier
     ) {.importc, stdcall, header: "<windows.h>".}

proc InitializeSynchronizationBarrier*(
       lpBarrier: var SynchronizationBarrier,
       lTotalThreads: LONG,
       lSpinCount: LONG
     ): WINBOOL {.importc, stdcall, header: "<windows.h>".}
|
||||
|
||||
when isMainModule:
  import os

  # Smoke test: initialising a two-thread barrier must report success (1).
  var barrier {.noinit.}: SynchronizationBarrier
  let status = InitializeSynchronizationBarrier(barrier, 2, -1)
  if status != 1:
    assert status == 0
    raiseOSError(osLastError())
|
|
@ -0,0 +1,12 @@
|
|||
# Versions
|
||||
|
||||
## std/tasks
|
||||
- https://github.com/nim-lang/Nim/blob/3619a5a2aa1c7387ec7df01b195bc683943654ff/lib/std/tasks.nim
|
||||
|
||||
We don't support aborting if there is a closure as this requires [#17501](https://github.com/nim-lang/Nim/pull/17501/files)
|
||||
|
||||
## std/isolation
|
||||
- https://github.com/nim-lang/Nim/blob/603af22b7ca46ac566f8c7c15402028f3f976a4e/lib/std/isolation.nim
|
||||
|
||||
## std/effecttraits
|
||||
- https://github.com/nim-lang/Nim/blob/603af22b7ca46ac566f8c7c15402028f3f976a4e/lib/std/effecttraits.nim
|
|
@ -0,0 +1,54 @@
|
|||
#
|
||||
#
|
||||
# Nim's Runtime Library
|
||||
# (c) Copyright 2020 Nim contributors
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
#
|
||||
|
||||
## This module provides access to the inferred .raises effects
|
||||
## for Nim's macro system.
|
||||
## **Since**: Version 1.4.
|
||||
##
|
||||
## One can test for the existence of this standard module
|
||||
## via `defined(nimHasEffectTraitsModule)`.
|
||||
|
||||
import macros
|
||||
|
||||
# Placeholder bodies: the string literals point at compiler/vmops.nim for the
# actual implementations.
proc getRaisesListImpl(n: NimNode): NimNode =
  discard "see compiler/vmops.nim"

proc getTagsListImpl(n: NimNode): NimNode =
  discard "see compiler/vmops.nim"

proc isGcSafeImpl(n: NimNode): bool =
  discard "see compiler/vmops.nim"

proc hasNoSideEffectsImpl(n: NimNode): bool =
  discard "see compiler/vmops.nim"
|
||||
|
||||
proc getRaisesList*(fn: NimNode): NimNode =
  ## Returns the `.raises` list of the routine `fn`.
  ## `fn` must be a resolved symbol (`nnkSym`), so the calling macro has to
  ## take `typed` rather than `untyped` arguments.
  fn.expectKind nnkSym
  getRaisesListImpl(fn)
|
||||
|
||||
proc getTagsList*(fn: NimNode): NimNode =
  ## Returns the `.tags` list of the routine `fn`.
  ## `fn` must be a resolved symbol (`nnkSym`), so the calling macro has to
  ## take `typed` rather than `untyped` arguments.
  fn.expectKind nnkSym
  getTagsListImpl(fn)
|
||||
|
||||
proc isGcSafe*(fn: NimNode): bool =
  ## True when the routine `fn` is `gcsafe`.
  ## `fn` must be a resolved symbol (`nnkSym`), so the calling macro has to
  ## take `typed` rather than `untyped` arguments.
  fn.expectKind nnkSym
  isGcSafeImpl(fn)
|
||||
|
||||
proc hasNoSideEffects*(fn: NimNode): bool =
  ## True when the routine `fn` has `noSideEffect`.
  ## `fn` must be a resolved symbol (`nnkSym`), so the calling macro has to
  ## take `typed` rather than `untyped` arguments.
  fn.expectKind nnkSym
  hasNoSideEffectsImpl(fn)
|
|
@ -0,0 +1,50 @@
|
|||
#
|
||||
#
|
||||
# Nim's Runtime Library
|
||||
# (c) Copyright 2020 Nim contributors
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
#
|
||||
|
||||
## This module implements the `Isolated[T]` type for
|
||||
## safe construction of isolated subgraphs that can be
|
||||
## passed efficiently to different channels and threads.
|
||||
##
|
||||
## .. warning:: This module is experimental and its interface may change.
|
||||
##
|
||||
|
||||
type
  Isolated*[T] = object
    ## Wrapper around a value of type `T` that may be moved but not copied.
    value: T
|
||||
|
||||
# Copying is rejected at compile time.
proc `=copy`*[T](dest: var Isolated[T]; src: Isolated[T]) {.error.}

proc `=sink`*[T](dest: var Isolated[T]; src: Isolated[T]) {.inline.} =
  ## Moving forwards to the sink operation of the wrapped value.
  `=sink`(dest.value, src.value)

proc `=destroy`*[T](dest: var Isolated[T]) {.inline.} =
  ## Destruction forwards to the destroy operation of the wrapped value.
  `=destroy`(dest.value)
|
||||
|
||||
# XXX: removed the {.magic: "Isolate".}
|
||||
func isolate*[T](value: sink T): Isolated[T] =
  ## Wraps the expression `value` into an isolated subgraph.
  ## Isolation is checked at compile time.
  ##
  ## See https://github.com/nim-lang/RFCs/issues/244
  ## for more details.
  result = Isolated[T](value: value)
|
||||
|
||||
func unsafeIsolate*[T](value: sink T): Isolated[T] =
  ## Wraps the expression `value` into an isolated subgraph.
  ##
  ## .. warning:: The proc doesn't check whether `value` is isolated.
  ##
  result = Isolated[T](value: value)
|
||||
|
||||
func extract*[T](src: var Isolated[T]): T =
  ## Moves the wrapped value out of `src` and returns it.
  move(src.value)
|
|
@ -0,0 +1,284 @@
|
|||
#
|
||||
#
|
||||
# Nim's Runtime Library
|
||||
# (c) Copyright 2021 Nim contributors
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
#
|
||||
|
||||
## This module provides basic primitives for creating parallel programs.
|
||||
## A `Task` should be only owned by a single Thread, it cannot be shared by threads.
|
||||
|
||||
import std/[macros, typetraits]
|
||||
import system/ansi_c
|
||||
|
||||
import ./isolation
|
||||
export isolation
|
||||
|
||||
when compileOption("threads"):
|
||||
from ./effecttraits import isGcSafe
|
||||
|
||||
|
||||
#
|
||||
# proc hello(a: int, b: string) =
|
||||
# echo $a & b
|
||||
#
|
||||
# let literal = "Nim"
|
||||
# let t = toTask(hello(521, literal))
|
||||
#
|
||||
#
|
||||
# is roughly converted to
|
||||
#
|
||||
# type
|
||||
# ScratchObj_369098780 = object
|
||||
# a: int
|
||||
# b: string
|
||||
#
|
||||
# let scratch_369098762 = cast[ptr ScratchObj_369098780](c_calloc(csize_t 1,
|
||||
# csize_t sizeof(ScratchObj_369098780)))
|
||||
# if scratch_369098762.isNil:
|
||||
# raise newException(OutOfMemDefect, "Could not allocate memory")
|
||||
# block:
|
||||
# var isolate_369098776 = isolate(521)
|
||||
# scratch_369098762.a = extract(isolate_369098776)
|
||||
# var isolate_369098778 = isolate(literal)
|
||||
# scratch_369098762.b = extract(isolate_369098778)
|
||||
# proc hello_369098781(args`gensym3: pointer) {.nimcall.} =
|
||||
# let objTemp_369098775 = cast[ptr ScratchObj_369098780](args`gensym3)
|
||||
# let :tmp_369098777 = objTemp_369098775.a
|
||||
# let :tmp_369098779 = objTemp_369098775.b
|
||||
# hello(a = :tmp_369098777, b = :tmp_369098779)
|
||||
#
|
||||
# proc destroyScratch_369098782(args`gensym3: pointer) {.nimcall.} =
|
||||
# let obj_369098783 = cast[ptr ScratchObj_369098780](args`gensym3)
|
||||
# =destroy(obj_369098783[])
|
||||
# let t = Task(callback: hello_369098781, args: scratch_369098762, destroy: destroyScratch_369098782)
|
||||
#
|
||||
|
||||
|
||||
type
  Task* = object
    ## A `Task` bundles a callback with the arguments it will be invoked on.
    callback: proc (args: pointer) {.nimcall, gcsafe.} # entry point, receives `args`
    args: pointer                                      # argument buffer, released with `c_free` on destruction
    destroy: proc (args: pointer) {.nimcall.}          # optional cleanup run on `args` before it is freed
|
||||
|
||||
|
||||
# Tasks cannot be copied.
proc `=copy`*(x: var Task, y: Task) {.error.}

proc `=destroy`*(t: var Task) {.inline.} =
  ## Releases the argument buffer owned by a `Task`, running the task's
  ## `destroy` callback on it first when one is set.
  if t.args == nil:
    return
  if t.destroy != nil:
    t.destroy(t.args)
  c_free(t.args)
|
||||
|
||||
proc invoke*(task: Task) {.inline, gcsafe.} =
  ## Runs the callback of `task` on its stored arguments.
  let entry = task.callback
  assert entry != nil
  entry(task.args)
|
||||
|
||||
template checkIsolate(scratchAssignList: seq[NimNode], procParam, scratchDotExpr: NimNode) =
  # Appends two statements to `scratchAssignList`, of the shape
  #
  #   var isoTemp = isolate(<procParam>)
  #   <scratchDotExpr> = extract(isoTemp)
  #
  # `isolate` and `extract` are bound here with bindSym.
  # XXX: Fix sym bindings -- the previous form looked them up by identifier:
  # scratchAssignList.add newVarStmt(isolatedTemp, newCall(newidentNode("isolate"), procParam))
  # scratchAssignList.add newAssignment(scratchDotExpr,
  #     newcall(newIdentNode("extract"), isolatedTemp))
  let isoSym = genSym(nskTemp, "isoTemp")
  let isolateCall = newCall(bindSym("isolate"), procParam)
  let extractCall = newcall(bindSym("extract"), isoSym)

  scratchAssignList.add newVarStmt(isoSym, isolateCall)
  scratchAssignList.add newAssignment(scratchDotExpr, extractCall)
|
||||
|
||||
template addAllNode(assignParam: NimNode, procParam: NimNode) =
  # Registers parameter `i` of the wrapped call as a scratch-object field.
  # Relies on `scratchIdent`, `formalParams`, `i`, `scratchAssignList`,
  # `callNode`, `tempAssignList`, `objTemp` and `scratchRecList` being
  # visible where the template is expanded.
  let paramName = formalParams[i][0]

  # scratch.<param> = extract(isolate(<argument>))
  let scratchDotExpr = newDotExpr(scratchIdent, paramName)
  checkIsolate(scratchAssignList, procParam, scratchDotExpr)

  # Inside the callback: let <tmp> = objTemp.<param>; then pass <param> = <tmp>.
  let paramTemp = genSym(kind = nskTemp, ident = paramName.strVal)
  callNode.add nnkExprEqExpr.newTree(paramName, paramTemp)
  tempAssignList.add newLetStmt(paramTemp, newDotExpr(objTemp, paramName))

  # Field declaration in the scratch object type.
  scratchRecList.add newIdentDefs(newIdentNode(paramName.strVal), assignParam)
|
||||
|
||||
macro toTask*(e: typed{nkCall | nkInfix | nkPrefix | nkPostfix | nkCommand | nkCallStrLit}): Task =
  ## Converts the call and its arguments to `Task`.
  runnableExamples("--gc:orc"):
    proc hello(a: int) = echo a

    let b = toTask hello(13)
    assert b is Task

  # Only calls without a return value can be turned into a task.
  doAssert getTypeInst(e).typeKind == ntyVoid

  # requires 1.6
  # when compileOption("threads"):
  #   if not isGcSafe(e[0]):
  #     error("'toTask' takes a GC safe call expression")

  # TODO
  # https://github.com/nim-lang/Nim/pull/17501/files
  #
  # if hasClosure(e[0]):
  #   error("closure call is not allowed")

  if e.len > 1:
    # The call has arguments: they are stored in a heap-allocated scratch object.
    let scratchIdent = genSym(kind = nskTemp, ident = "scratch")
    let calleeType = e[0].getTypeInst

    when defined(nimTasksDebug):
      echo calleeType.treeRepr
      echo e.treeRepr
    let formalParams = calleeType[0]

    var
      scratchRecList = newNimNode(nnkRecList)   # fields of the scratch object
      scratchAssignList: seq[NimNode]           # scratch.x = extract(isolate(arg))
      tempAssignList: seq[NimNode]              # let tmp = objTemp.x (inside the callback)
      callNode: seq[NimNode]                    # named arguments of the final call

    let
      objTemp = genSym(nskTemp, ident = "objTemp")

    for i in 1 ..< formalParams.len:
      var paramType = formalParams[i][1]

      if paramType.kind == nnkBracketExpr and paramType[0].eqIdent("sink"):
        paramType = paramType[0]

      if paramType.typeKind in {ntyExpr, ntyStmt}:
        error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter")

      case paramType.kind
      of nnkVarTy:
        error("'toTask'ed function cannot have a 'var' parameter")
      of nnkBracketExpr:
        let headKind = paramType[0].typeKind
        if headKind == ntyTypeDesc:
          callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
        elif headKind in {ntyVarargs, ntyOpenArray}:
          if paramType[1].typeKind in {ntyExpr, ntyStmt}:
            error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter")
          # varargs/openArray arguments are stored as a seq built with `@`.
          let
            seqType = nnkBracketExpr.newTree(newIdentNode("seq"), paramType[1])
            seqCallNode = newcall("@", e[i])
          addAllNode(seqType, seqCallNode)
        else:
          addAllNode(paramType, e[i])
      of nnkSym, nnkPtrTy:
        addAllNode(paramType, e[i])
      of nnkBracket, nnkObjConstr, nnkCharLit..nnkNilLit:
        # Static parameters and literals are passed straight through
        # instead of going via the scratch object.
        callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
      else:
        error("not supported type kinds")

    # type ScratchObj = object <fields>
    let scratchObjType = genSym(kind = nskType, ident = "ScratchObj")
    let scratchObjectTy = nnkObjectTy.newTree(
      newEmptyNode(),
      newEmptyNode(),
      scratchRecList
    )
    let scratchTypeSection = nnkTypeSection.newTree(
      nnkTypeDef.newTree(scratchObjType, newEmptyNode(), scratchObjectTy)
    )

    # let scratch = cast[ptr ScratchObj](c_calloc(...))
    let scratchAlloc = quote do:
      cast[ptr `scratchObjType`](c_calloc(csize_t 1, csize_t sizeof(`scratchObjType`)))
    let scratchLet = newLetStmt(scratchIdent, scratchAlloc)

    let scratchNilCheck = quote do:
      if `scratchIdent`.isNil:
        # Renamed in 1.4
        # raise newException(OutOfMemDefect, "Could not allocate memory")
        raise newException(OutOfMemError, "Could not allocate memory")

    var setupStmts = newStmtList()
    setupStmts.add(scratchTypeSection)
    setupStmts.add(scratchLet)
    setupStmts.add(scratchNilCheck)
    setupStmts.add(nnkBlockStmt.newTree(newEmptyNode(), newStmtList(scratchAssignList)))

    # Body of the generated callback: unpack the fields, then perform the call.
    var callbackBody = newStmtList()
    callbackBody.add tempAssignList
    callbackBody.add newCall(e[0], callNode)

    let callbackName = genSym(nskProc, e[0].strVal)
    let destroyerName = genSym(nskProc, "destroyScratch")
    let destroyArg = genSym(ident = "obj")
    let destroyCall = quote("@") do:
        `=destroy`(@destroyArg[])

    result = quote do:
      `setupStmts`

      proc `callbackName`(args: pointer) {.gcsafe, nimcall.} =
        let `objTemp` = cast[ptr `scratchObjType`](args)
        `callbackBody`

      proc `destroyerName`(args: pointer) {.nimcall.} =
        let `destroyArg` = cast[ptr `scratchObjType`](args)
        `destroyCall`

      Task(callback: `callbackName`, args: `scratchIdent`, destroy: `destroyerName`)
  else:
    # No arguments: the callback just performs the call, nothing is allocated.
    let callbackName = genSym(nskProc, e[0].strVal)
    let wrappedCall = newCall(e[0])

    result = quote do:
      proc `callbackName`(args: pointer) {.gcsafe, nimcall.} =
        `wrappedCall`

      Task(callback: `callbackName`, args: nil)

  when defined(nimTasksDebug):
    echo result.repr
|
||||
|
||||
runnableExamples("--gc:orc"):
  block:
    # A task wrapping a call with a plain value argument.
    var total = 0
    proc hello(a: int) = inc total, a

    let task = toTask hello(13)
    task.invoke()
    assert total == 13
    # Invoking the same task again runs the call again.
    task.invoke()
    assert total == 26

  block:
    type
      Runnable = ref object
        data: int

    var seen: int
    proc hello(a: Runnable) {.nimcall.} =
      a.data += 2
      seen = a.data


    when false:
      # the parameters of call must be isolated.
      let x = Runnable(data: 12)
      let rejected = toTask hello(x) # error ----> expression cannot be isolated: x
      rejected.invoke()

    let task = toTask(hello(Runnable(data: 12)))
    task.invoke()
    assert seen == 14
    task.invoke()
    assert seen == 16
|
|
@ -0,0 +1,151 @@
|
|||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/random,
|
||||
system/ansi_c,
|
||||
./instrumentation/contracts
|
||||
|
||||
const TP_MaxWorkers = 255
type Setuint = uint8 # At most 255 threads are supported (0xFF is reserved to signify absence in the set)

const Empty = high(Setuint)

type
  SparseSet* = object
    ## Efficient storage for a set of integers in the range [0 .. Capacity)
    ## Supports:
    ## - O(1) inclusion, exclusion and contains
    ## - O(1) random pick
    ## - O(1) length
    ## - O(length) iteration
    ##
    ## Space: Capacity * sizeof(words)
    ##
    ## By contrast a bitset requires:
    ## - random picking: multiple random "contains" + a fallback to uncompressing the set
    ## - O(Capacity/sizeof(words)) length (via popcounts)
    ## - O(capacity) iteration
    indices: ptr UncheckedArray[Setuint]    # indices[n] is the slot of n in `values`, or Empty
    values: ptr UncheckedArray[Setuint]     # the first `len` entries are the members of the set
    rawBuffer: ptr UncheckedArray[Setuint]  # single allocation backing `indices` and `values`
    len*: Setuint
    capacity*: Setuint
|
||||
|
||||
func allocate*(s: var SparseSet, capacity: SomeInteger) {.inline.} =
  ## Allocates storage for a set over the range [0 .. capacity).
  ## The set starts out empty: `len` is 0 and no integer is reported as
  ## contained. Call `refill` to include the whole range.
  ##
  ## `indices` and `values` share one buffer of `2*capacity` entries;
  ## release it with `delete`.
  preCondition: capacity <= TP_MaxWorkers

  s.capacity = Setuint capacity
  s.len = 0
  s.rawBuffer = cast[ptr UncheckedArray[Setuint]](c_calloc(csize_t 2*capacity, csize_t sizeof(Setuint)))
  s.indices = s.rawBuffer
  s.values = cast[ptr UncheckedArray[Setuint]](s.rawBuffer[capacity].addr)

  # calloc zero-fills the buffer, but absence is encoded as `Empty` (0xFF),
  # not 0: without this pass `contains` would report every integer as
  # present and `incl` would be a no-op on a freshly allocated set.
  if not s.rawBuffer.isNil:
    for i in 0 ..< s.capacity.int:
      s.indices[i] = Empty
|
||||
|
||||
func delete*(s: var SparseSet) {.inline.} =
  ## Releases the storage of the set and resets it to an unallocated,
  ## empty state. Calling `delete` again afterwards is harmless, and
  ## `allocate` must be called before the set is used again.
  c_free(s.rawBuffer)
  # Clear the owning pointer as well, otherwise it would dangle and a
  # second `delete` would free the same buffer twice.
  s.rawBuffer = nil
  s.indices = nil
  s.values = nil
  s.len = 0
  s.capacity = 0
|
||||
|
||||
func refill*(s: var SparseSet) {.inline.} =
  ## Resets the set so that it contains every integer
  ## of the range [0 .. Capacity)
  preCondition: not s.indices.isNil
  preCondition: not s.values.isNil
  preCondition: not s.rawBuffer.isNil
  preCondition: s.capacity != 0

  s.len = s.capacity

  # Identity mapping: the integer k is stored in slot k.
  var k = Setuint(0)
  while k < s.len:
    s.values[k] = k
    s.indices[k] = k
    inc k
|
||||
|
||||
func isEmpty*(s: SparseSet): bool {.inline.} =
  ## True when the set has no members.
  result = s.len == 0
|
||||
|
||||
func contains*(s: SparseSet, n: SomeInteger): bool {.inline.} =
  ## True when `n` is a member of the set, i.e. its index entry is not `Empty`.
  ## `n` must differ from the reserved `Empty` value.
  assert n.int != Empty.int
  result = s.indices[n] != Empty
|
||||
|
||||
func incl*(s: var SparseSet, n: SomeInteger) {.inline.} =
  ## Adds `n` to the set; does nothing when `n` is already a member.
  ## `n` must lie in the range [0 .. capacity).
  preCondition: n < Empty
  # `indices` and `values` share one buffer of 2*capacity entries, so an
  # index at or past `capacity` would overwrite `values` or memory beyond it.
  preCondition: n.int < s.capacity.int

  if n in s: return

  preCondition: s.len < s.capacity

  s.indices[n] = s.len
  s.values[s.len] = Setuint(n) # explicit narrowing: `n` may be any integer type
  s.len += 1
|
||||
|
||||
func peek*(s: SparseSet): int32 {.inline.} =
  ## Returns the member stored in the last occupied slot of the set.
  ## Note: once an item has been deleted this is not necessarily
  ## the most recently inserted one.
  preCondition: s.len.int > 0
  let lastSlot = s.len - 1
  result = int32 s.values[lastSlot]
|
||||
|
||||
func excl*(s: var SparseSet, n: SomeInteger) {.inline.} =
  ## Removes `n` from the set; does nothing when `n` is not a member.
  if n notin s: return

  # Constant-time deletion: the slot of the deleted integer is taken over
  # by the integer currently stored in the last occupied slot.

  let delIdx = s.indices[n]

  s.len -= 1
  let lastVal = s.values[s.len]

  s.indices[lastVal] = delIdx # The last value now lives in the freed slot
  # Store the value itself. Reading `s.values[lastVal]` would use a *value*
  # as a slot index and only gives the right answer while the set still has
  # the identity layout produced by `refill`.
  s.values[delIdx] = lastVal

  # Erase the item (done last so that removing the last value itself works)
  s.indices[n] = Empty
|
||||
|
||||
func randomPick*(s: SparseSet, rng: var Rand): int {.inline.} =
  ## Randomly picks a member of the set using `rng`.
  ## The value is NOT removed from the set.
  ## The set must not be empty.
  # With `len == 0` the unsigned `len-1` wraps around to 255 and the pick
  # would read a slot that holds no member (or lies outside the buffer).
  preCondition: s.len.int > 0

  let pickIdx = rng.rand(s.len-1)
  result = s.values[pickIdx].int
|
||||
|
||||
func `$`*(s: SparseSet): string =
  ## Renders the members of the set, in slot order.
  let lastSlot = s.len.int - 1
  result = $toOpenArray(s.values, 0, lastSlot)
|
||||
|
||||
# Sanity checks
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
when isMainModule:

  const Size = 10
  const Picked = 5

  # Start from a full set over [0 .. Size).
  var sset: SparseSet
  sset.allocate(Size)
  sset.refill()
  echo sset

  var rng = initRand(123)
  var chosen: seq[int]

  # Draw `Picked` members at random, removing each one after it is drawn.
  for _ in 0 ..< Picked:
    let p = sset.randomPick(rng)
    chosen.add p
    sset.excl p
    echo "---"
    echo "picked: ", p
    echo "S indices: ", toOpenArray(sset.indices, 0, sset.capacity.int - 1)

  echo "---"
  echo "picked: ", chosen
  echo "S: ", sset
  echo "S indices: ", toOpenArray(sset.indices, 0, sset.capacity.int - 1)

  # Every integer is either still in the set or among the drawn ones.
  for x in 0 ..< Size:
    if x in chosen:
      echo x, " in picked -> notin S"
      doAssert x notin sset
    else:
      echo x, " notin picked -> in S"
      doAssert x in sset
|
|
@ -0,0 +1,530 @@
|
|||
# Nim-Taskpools
|
||||
# Copyright (c) 2021 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# Taskpools
|
||||
#
|
||||
# This file implements a taskpool
|
||||
#
|
||||
# Implementation:
|
||||
#
|
||||
# It is a simple shared memory based work-stealing threadpool.
|
||||
# The primary focus is:
|
||||
# - Delegate compute intensive tasks to the threadpool.
|
||||
# - Simple to audit by staying close to foundational papers
|
||||
# and using simple datastructures otherwise.
|
||||
# - Low energy consumption:
|
||||
# threads should be put to sleep ASAP
|
||||
# instead of polling/spinning (energy vs latency tradeoff)
|
||||
# - Decent performance:
|
||||
# Work-stealing has optimal asymptotic parallel speedup.
|
||||
# Work-stealing has significantly reduced contention
|
||||
# when many tasks are created,
|
||||
# for example by divide-and-conquer algorithms, compared to a global task queue
|
||||
#
|
||||
# Not a priority:
|
||||
# - Handling trillions of very short tasks (less than 100µs).
|
||||
# - Advanced task dependencies or events API.
|
||||
# - Unbalanced parallel-for loops.
|
||||
# - Handling services that should run for the lifetime of the program.
|
||||
#
|
||||
# Doing IO on a compute threadpool should be avoided
|
||||
# In case a thread is blocked for IO, other threads can steal pending tasks in that thread.
|
||||
# If all threads are pending for IO, the threadpool will not make any progress and be soft-locked.
|
||||
|
||||
{.push raises: [].}
|
||||
|
||||
import
|
||||
system/ansi_c,
|
||||
std/[random, cpuinfo, atomics, macros],
|
||||
./channels_spsc_single,
|
||||
./chase_lev_deques,
|
||||
./event_notifiers,
|
||||
./primitives/barriers,
|
||||
./instrumentation/[contracts, loggers],
|
||||
./sparsesets,
|
||||
./flowvars,
|
||||
./ast_utils
|
||||
|
||||
export
|
||||
# flowvars
|
||||
Flowvar, isSpawned, isReady, sync
|
||||
|
||||
when defined(windows):
|
||||
import ./primitives/affinity_windows
|
||||
else:
|
||||
import ./primitives/affinity_posix
|
||||
|
||||
when (NimMajor,NimMinor,NimPatch) >= (1,6,0):
|
||||
import std/tasks
|
||||
else:
|
||||
import ./shims_pre_1_6/tasks
|
||||
|
||||
type
  WorkerID = int32

  TaskNode = ptr object
    # Node of a linked list of tasks
    parent: TaskNode
    task: Task

  Signal = object
    terminate {.align: 64.}: Atomic[bool]

  WorkerContext = object
    ## Per-thread state of a worker

    # Params
    id: WorkerID
    taskpool: Taskpool

    # Tasks
    taskDeque: ptr ChaseLevDeque[TaskNode] # deque owned by this worker
    currentTask: TaskNode

    # Synchronization
    eventNotifier: ptr EventNotifier # notifier shared through the taskpool
    signal: ptr Signal # signal owned by this worker

    # Thefts
    rng: Rand # RNG state used to select victims
    numThreads: int
    otherDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
    victims: SparseSet

  Taskpool* = ptr object
    barrier: SyncBarrier
      ## Barrier used during initialization and teardown
    eventNotifier: EventNotifier
      ## Puts threads to sleep

    numThreads{.align: 64.}: int
    workerDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
      ## Direct access for task stealing
    workers: ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]
    workerSignals: ptr UncheckedArray[Signal]
      ## Access to the per-worker termination signals
|
||||
|
||||
# Thread-local config
|
||||
# ---------------------------------------------
|
||||
|
||||
var workerContext {.threadvar.}: WorkerContext
  ## Worker context; each thread has its own instance
|
||||
|
||||
proc setupWorker() =
  ## Fills in the thread-local context of a worker.
  ## The `id` and `taskpool` fields must already be set.
  template ctx: untyped = workerContext

  preCondition: not ctx.taskpool.isNil()
  preCondition: 0 <= ctx.id and ctx.id < ctx.taskpool.numThreads
  preCondition: not ctx.taskpool.workerDeques.isNil()
  preCondition: not ctx.taskpool.workerSignals.isNil()

  # Tasks: this worker's own deque starts out initialised and idle
  ctx.currentTask = nil
  ctx.taskDeque = addr ctx.taskpool.workerDeques[ctx.id]
  init(ctx.taskDeque[])

  # Synchronization: shared notifier, own termination flag cleared
  ctx.eventNotifier = addr ctx.taskpool.eventNotifier
  ctx.signal = addr ctx.taskpool.workerSignals[ctx.id]
  ctx.signal.terminate.store(false, moRelaxed)

  # Thefts: per-worker RNG seed, view on all deques, victim set
  ctx.rng = initRand(0xEFFACED + ctx.id)
  ctx.numThreads = ctx.taskpool.numThreads
  ctx.otherDeques = ctx.taskpool.workerDeques
  ctx.victims.allocate(ctx.taskpool.numThreads)
|
||||
|
||||
proc teardownWorker() =
  ## Releases what the thread-local context of a worker holds:
  ## its task deque is torn down and its victim set is deleted.
  teardown(workerContext.taskDeque[])
  delete(workerContext.victims)
|
||||
|
||||
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].}
|
||||
|
||||
proc workerEntryFn(params: tuple[taskpool: Taskpool, id: WorkerID])
       {.raises: [Exception].} =
  ## Entry point of the worker threads: once the threadpool starts, each
  ## worker runs this until it receives a termination signal.
  # Thread-local variables are assumed to start at their binary zero value
  preCondition: workerContext == default(WorkerContext)

  template ctx: untyped = workerContext
  let pool = params.taskpool

  # If the following crashes, you need --tlsEmulation:off
  ctx.taskpool = pool
  ctx.id = params.id

  setupWorker()

  # Matched by 1 barrier in Taskpool.new() for the root thread
  discard pool.barrier.wait()

  {.gcsafe.}: # Not GC-safe when multi-threaded due to thread-local variables
    eventLoop(ctx)

  debugTermination:
    log(">>> Worker %2d shutting down <<<\n", ctx.id)

  # Matched by 1 barrier in taskpool.shutdown() for the root thread
  discard pool.barrier.wait()

  teardownWorker()
|
||||
|
||||
# Tasks
|
||||
# ---------------------------------------------
|
||||
|
||||
proc new(T: type TaskNode, parent: TaskNode, task: sink Task): T =
  ## Allocate a zeroed task node with `c_calloc` and store `parent`
  ## and `task` in it. The node is manually managed: it is released
  ## with `c_free` (see `runTask`).
  type NodeObj = typeof(default(T)[])
  result = cast[TaskNode](c_calloc(1, csize_t sizeof(NodeObj)))
  result.parent = parent
  result.task = task
|
||||
|
||||
proc runTask(tn: var TaskNode) {.raises:[Exception], inline.} =
  ## Run a task and consume the taskNode.
  ##
  ## After this call the node's memory has been released and `tn`
  ## must not be dereferenced again.
  tn.task.invoke()
  # The node lives in `c_calloc`-ed memory (see `TaskNode.new`), so no
  # destructor runs when it is `c_free`-d. Destroy the embedded task
  # explicitly, otherwise whatever the task owns (presumably its
  # captured-argument buffer - confirm against the `Task` type in use)
  # leaks on every executed task.
  {.gcsafe.}: # `=destroy` on Task may not be tagged gcsafe
    `=destroy`(tn.task)
  tn.c_free()
|
||||
|
||||
proc schedule(ctx: WorkerContext, tn: sink TaskNode) {.inline.} =
  ## Push `tn` on this worker's own deque, then notify the taskpool's
  ## event notifier that a task is available.
  debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
  push(ctx.taskDeque[], tn)
  notify(ctx.taskpool.eventNotifier)
|
||||
|
||||
# Scheduler
|
||||
# ---------------------------------------------
|
||||
|
||||
proc trySteal(ctx: var WorkerContext): TaskNode =
  ## Try to steal a task from another worker.
  ##
  ## Victims are picked at random from the victim set (refilled on each
  ## call, with this worker excluded). A victim whose deque yields
  ## nothing is dropped from the set. Returns the first stolen task,
  ## or nil once every candidate has been tried without success.
  ctx.victims.refill()
  ctx.victims.excl(ctx.id)

  while not ctx.victims.isEmpty():
    let victim = ctx.victims.randomPick(ctx.rng)

    result = ctx.otherDeques[victim].steal()
    if not result.isNil:
      return

    # Nothing to take from this one, don't try it again this round
    ctx.victims.excl(victim)
|
||||
|
||||
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} =
  ## Main loop of a worker thread, repeated until the worker's
  ## `terminate` flag is observed set:
  ## drain the local deque, then try one steal, and park on the
  ## event notifier when there was nothing to steal.
  while not ctx.signal.terminate.load(moRelaxed):
    # 1. Drain the local deque
    debug: log("Worker %2d: eventLoop 1 - searching task from local deque\n", ctx.id)
    while true:
      var localTask = ctx.taskDeque[].pop()
      if localTask.isNil:
        break
      debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, localTask, localTask.parent, ctx.currentTask)
      localTask.runTask()

    # 2. Local deque is empty, look at the other workers
    debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
    var loot = ctx.trySteal()
    if loot.isNil:
      # 2.b Nothing to steal: park until a new task enters the taskpool
      debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
      ctx.eventNotifier[].park()
      debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
    else:
      # 2.a Run the stolen task
      debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, loot, loot.parent, ctx.currentTask)
      loot.runTask()
|
||||
|
||||
# Tasking
|
||||
# ---------------------------------------------
|
||||
|
||||
# The root task is currently represented by a default-initialized Task.
const RootTask = default(Task) # TODO: sentinel value different from null task

template isRootTask(task: Task): bool =
  ## True when `task` compares equal to `RootTask`.
  (task == RootTask)
|
||||
|
||||
proc forceFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} =
  ## Eagerly complete an awaited FlowVar.
  ##
  ## Returns once the flowvar's channel has delivered its value into
  ## `parentResult`. While waiting, the calling worker keeps executing
  ## tasks instead of blocking.

  template futureDelivered(): untyped =
    # Non-blocking receive; on success the value is now in `parentResult`
    fv.chan[].tryRecv(parentResult)

  if futureDelivered():
    return

  ## 1. Process all the children of the current tasks.
  ##    This ensures that we can give control back ASAP.
  debug: log("Worker %2d: sync 1 - searching task from local deque\n", workerContext.id)
  while true:
    var child = workerContext.taskDeque[].pop()
    if child.isNil:
      break

    if child.parent != workerContext.currentTask:
      # Not a direct child of the current task: put it back and stop draining
      debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", workerContext.id, child, child.parent, workerContext.currentTask)
      schedule(workerContext, child)
      break

    debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", workerContext.id, child, child.parent, workerContext.currentTask)
    child.runTask()
    if futureDelivered():
      debug: log("Worker %2d: sync 1 - future ready, exiting\n", workerContext.id)
      return

  ## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
  ##    So the task is bottlenecked by dependencies in other threads,
  ##    hence we abandon our enqueued work and steal in the others' queues
  ##    in hope it advances our awaited task. This prioritizes latency over throughput.
  debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", workerContext.id, workerContext.currentTask)
  while not futureDelivered():
    var loot = trySteal(workerContext)

    if loot.isNil:
      # We don't park as there is no notif for task completion.
      # (Running tasks from our own deque here instead would increase
      # throughput but may impact latency on the awaited task.)
      cpuRelax()
    else:
      # We stole some task, we hope we advance our awaited task
      debug: log("Worker %2d: sync 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", workerContext.id, loot, loot.parent, workerContext.currentTask)
      loot.runTask()
|
||||
|
||||
proc syncAll*(pool: Taskpool) {.raises: [Exception].} =
  ## Blocks until all pending tasks are completed.
  ## This MUST only be called from
  ## the root scope that created the taskpool.
  ##
  ## The caller keeps running local and stolen tasks; it returns when
  ## it has nothing left to run and all the other threads are parked
  ## (or right after draining its deque when the pool has one thread).
  template ctx: untyped = workerContext

  debugTermination:
    log(">>> Worker %2d enters barrier <<<\n", ctx.id)

  preCondition: ctx.id == 0
  preCondition: ctx.currentTask.task.isRootTask()

  while true:
    # 1. Empty local tasks
    debug: log("Worker %2d: syncAll 1 - searching task from local deque\n", ctx.id)
    while true:
      var localTask = ctx.taskDeque[].pop()
      if localTask.isNil:
        break
      debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, localTask, localTask.parent, ctx.currentTask)
      localTask.runTask()

    # Single-threaded pool: there is nobody else to help or wait for
    if ctx.numThreads == 1:
      break

    # 2. Help other threads
    debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
    var loot = ctx.trySteal()

    if not loot.isNil:
      # 2.1 We stole some task
      debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, loot, loot.parent, ctx.currentTask)
      loot.runTask()
    elif pool.eventNotifier.getParked() == pool.numThreads - 1:
      # 2.2.1 Nothing to steal and all threads besides the current are parked
      debugTermination:
        log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
      break
    else:
      # 2.2.2 We don't park as there is no notif for task completion
      cpuRelax()

  debugTermination:
    log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
|
||||
|
||||
# Runtime
|
||||
# ---------------------------------------------
|
||||
|
||||
proc new*(T: type Taskpool, numThreads = countProcessors()): T {.raises: [Exception].} =
  ## Initialize a threadpool that manages `numThreads` threads.
  ## Default to the number of logical processors available.
  ##
  ## The calling thread becomes worker 0 (the root worker) and
  ## `numThreads - 1` additional threads are created.
  ## A `numThreads` lower than 1 is treated as 1: `countProcessors()`
  ## may return 0 when the processor count cannot be detected, and
  ## worker 0 always needs its own deque and signal slot.
  ##
  ## Returns once all the created threads have reached the startup barrier.

  # Worker 0 is the calling thread, so at least one slot must exist.
  let numThreads = max(1, numThreads)

  var tp = cast[T](c_calloc(1, csize_t sizeof(default(Taskpool)[])))

  tp.barrier.init(numThreads.int32)
  tp.eventNotifier.initialize()
  tp.numThreads = numThreads
  tp.workerDeques = cast[ptr UncheckedArray[ChaseLevDeque[TaskNode]]](c_calloc(csize_t numThreads, csize_t sizeof ChaseLevDeque[TaskNode]))
  tp.workers = cast[ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]](c_calloc(csize_t numThreads, csize_t sizeof Thread[(Taskpool, WorkerID)]))
  tp.workerSignals = cast[ptr UncheckedArray[Signal]](c_calloc(csize_t numThreads, csize_t sizeof Signal))

  # Setup master thread
  workerContext.id = 0
  workerContext.taskpool = tp
  when not(defined(cpp) and defined(vcc)):
    # TODO: Nim casts between Windows Handles but that requires reinterpret cast for C++
    pinToCpu(0)

  # Start worker threads (slot 0 of `tp.workers` stays unused:
  # worker 0 is the calling thread)
  for i in 1 ..< numThreads:
    createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
    # TODO: we might want to take into account Hyper-Threading (HT)
    #       and allow spawning tasks and pinning to cores that are not HT-siblings.
    #       This is important for memory-bound workloads (like copy, addition, ...)
    #       where both sibling cores will compete for L1 and L2 cache, effectively
    #       halving the memory bandwidth or worse, flushing what the other put in cache.
    #       Note that while 2x siblings is common, Xeon Phi has 4x Hyper-Threading.
    when not(defined(cpp) and defined(vcc)):
      # TODO: Nim casts between Windows Handles but that requires reinterpret cast for C++
      pinToCpu(tp.workers[i], i)

  # Root worker
  setupWorker()

  # Root task, this is a sentinel task that is never called.
  workerContext.currentTask = TaskNode.new(
    parent = nil,
    task = default(Task) # TODO RootTask, somehow this uses `=copy`
  )

  # Wait for the child threads (1 matching barrier in workerEntryFn)
  discard tp.barrier.wait()
  return tp
|
||||
|
||||
proc cleanup(tp: var TaskPool) {.raises: [OSError].} =
  ## Release every resource owned by the taskpool, including the
  ## taskpool object itself. Joins the worker threads first, so they
  ## must be on their way out when this is called.
  preCondition: workerContext.currentTask.task.isRootTask()

  # Slot 0 is the root thread itself, only 1 ..< numThreads were created
  for workerIdx in 1 ..< tp.numThreads:
    joinThread(tp.workers[workerIdx])

  # Per-worker arrays
  c_free(tp.workerSignals)
  c_free(tp.workers)
  c_free(tp.workerDeques)

  # Synchronization primitives
  `=destroy`(tp.eventNotifier)
  delete(tp.barrier)

  # Finally the pool object
  c_free(tp)
|
||||
|
||||
proc shutdown*(tp: var TaskPool) {.raises:[Exception].} =
  ## Wait until all tasks are processed and then shutdown the taskpool.
  ##
  ## Sequence: `syncAll`, set every worker's terminate flag, wake the
  ## parked threads, meet the workers on the barrier, then release the
  ## root worker's context, the pool and the root sentinel task.
  preCondition: workerContext.currentTask.task.isRootTask()
  syncAll(tp)

  # Signal termination to all threads
  for workerIdx in 0 ..< tp.numThreads:
    store(tp.workerSignals[workerIdx].terminate, true, moRelaxed)

  # One notification per thread that was parked at the time of the read
  let sleepers = tp.eventNotifier.getParked()
  for _ in 0 ..< sleepers:
    notify(tp.eventNotifier)

  # 1 matching barrier in worker_entry_fn
  discard wait(tp.barrier)

  teardownWorker()
  cleanup(tp)

  # Dealloc dummy task
  c_free(workerContext.currentTask)
|
||||
|
||||
# Task parallelism
|
||||
# ---------------------------------------------
|
||||
{.pop.} # raises:[]
|
||||
|
||||
macro spawn*(tp: TaskPool, fnCall: typed): untyped =
  ## Spawns the input function call asynchronously, potentially on another thread of execution.
  ##
  ## If the function calls returns a result, spawn will wrap it in a Flowvar.
  ## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
  ## You can use `isReady` to check if result is available and if subsequent
  ## `sync` returns immediately.
  ##
  ## Tasks are processed approximately in Last-In-First-Out (LIFO) order
  ##
  ## The generated code is a block that packages the call in a task,
  ## wraps it in a task node whose parent is the current task and
  ## schedules it on the calling worker. When the callee has a return
  ## type the block evaluates to the flowvar.
  result = newStmtList()

  let fn = fnCall[0]
  let fnName = $fn

  # Get the return type if any
  let retType = fnCall[0].getImpl[3][0]
  let needFuture = retType.kind != nnkEmpty

  # Package in a task
  let taskNode = ident("taskNode")
  let task = ident("task")
  if not needFuture:
    result.add quote do:
      let `task` = toTask(`fnCall`)
      let `taskNode` = TaskNode.new(workerContext.currentTask, `task`)
      schedule(workerContext, `taskNode`)

  else:
    # tasks have no return value.
    # 1. We create a channel/flowvar to transmit the return value to awaiter/sync
    # 2. We create a wrapper async_fn without return value that send the return value in the channel
    # 3. We package that wrapper function in a task

    # 1. Create the channel
    let fut = ident("fut")
    let futTy = nnkBracketExpr.newTree(
      bindSym"FlowVar",
      retType
    )
    result.add quote do:
      let `fut` = newFlowVar(type `retType`)

    # 2. Create a wrapper function that sends result to the channel
    # TODO, upstream "getImpl" doesn't return the generic params
    let genericParams = fn.getImpl()[2].replaceSymsByIdents()
    let formalParams = fn.getImpl()[3].replaceSymsByIdents()

    var asyncParams = nnkFormalParams.newTree(
      newEmptyNode()
    )
    var fnCallIdents = nnkCall.newTree(
      fnCall[0]
    )
    for i in 1 ..< formalParams.len:
      let paramDefs = formalParams[i].replaceSymsByIdents()
      asyncParams.add paramDefs
      # The last two children of an IdentDefs are the type and the
      # default value; everything before them is a parameter name.
      for j in 0 ..< paramDefs.len - 2:
        # Handle "a, b: int"
        fnCallIdents.add paramDefs[j]

    let futFnParam = ident("fut")
    asyncParams.add newIdentDefs(futFnParam, futTy)

    let asyncBody = quote do:
      # XXX: can't test that when the RootTask is default(Task) instead of a sentinel value
      # preCondition: not isRootTask(workerContext.currentTask.task)

      let res = `fnCallIdents`
      readyWith(`futFnParam`, res)

    let asyncFn = ident("taskpool_" & fnName)
    result.add nnkProcDef.newTree(
      asyncFn,
      newEmptyNode(),
      genericParams,
      asyncParams,
      nnkPragma.newTree(ident("nimcall")),
      newEmptyNode(),
      asyncBody
    )

    # 3. Package the wrapper call (original arguments + flowvar) in a task
    var asyncCall = newCall(asyncFn)
    for i in 1 ..< fnCall.len:
      asyncCall.add fnCall[i].replaceSymsByIdents()
    asyncCall.add fut

    result.add quote do:
      let `task` = toTask(`asyncCall`)
      let `taskNode` = TaskNode.new(workerContext.currentTask, `task`)
      schedule(workerContext, `taskNode`)

      # Return the future / flowvar
      `fut`

  # Wrap in a block for namespacing
  result = nnkBlockStmt.newTree(newEmptyNode(), result)
  # Debugging aid, prints the generated code at compile time:
  # echo result.toStrLit()
|
Loading…
Reference in New Issue