Add a threadpool (#213)

* Implement a threadpool

* int and SomeUnsignedInt ...

* Type conversion for windows SynchronizationBarrier

* Use the latest MacOS 11, Big Sur API (jan 2021) for MacOS futexes, Github action offers MacOS 12 and can test them

* bench need posix timer not available on windows and darwin futex

* Windows: nimble exec empty line is an error, Mac: use defined(osx) instead of defined(macos)

* file rename

* okay, that's the last one hopefully

* deactivate stealHalf for now
This commit is contained in:
Mamy Ratsimbazafy 2023-01-24 02:32:28 +01:00 committed by GitHub
parent 188f3e710c
commit 2931913b67
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
48 changed files with 4647 additions and 106 deletions

View File

@ -43,7 +43,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
# ----------------------------------------------------------
("tests/math/t_primitives.nim", false),
("tests/math/t_primitives_extended_precision.nim", false),
# Big ints
# ----------------------------------------------------------
("tests/math/t_io_bigints.nim", false),
@ -53,7 +53,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_bigints_mod_vs_gmp.nim", true),
("tests/math/t_bigints_mul_vs_gmp.nim", true),
("tests/math/t_bigints_mul_high_words_vs_gmp.nim", true),
# Field
# ----------------------------------------------------------
("tests/math/t_io_fields", false),
@ -64,11 +64,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_finite_fields_powinv.nim", false),
("tests/math/t_finite_fields_vs_gmp.nim", true),
# ("tests/math/t_fp_cubic_root.nim", false),
# Double-precision finite fields
# ----------------------------------------------------------
("tests/math/t_finite_fields_double_precision.nim", false),
# Towers of extension fields
# ----------------------------------------------------------
# ("tests/math/t_fp2.nim", false),
@ -90,7 +90,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_fp6_frobenius.nim", false),
("tests/math/t_fp12_frobenius.nim", false),
# Elliptic curve arithmetic
# Elliptic curve arithmetic
# ----------------------------------------------------------
("tests/math/t_ec_conversion.nim", false),
@ -111,7 +111,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_ec_twedwards_prj_add_double", false),
("tests/math/t_ec_twedwards_prj_mul_sanity", false),
("tests/math/t_ec_twedwards_prj_mul_distri", false),
# Elliptic curve arithmetic G2
# ----------------------------------------------------------
@ -172,7 +172,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/math/t_ec_sage_bls12_381.nim", false),
("tests/math/t_ec_sage_pallas.nim", false),
("tests/math/t_ec_sage_vesta.nim", false),
# Edge cases highlighted by past bugs
# ----------------------------------------------------------
("tests/math/t_ec_shortw_prj_edge_cases.nim", false),
@ -233,6 +233,18 @@ const testDescNvidia: seq[string] = @[
"tests/gpu/t_nvidia_fp.nim",
]
const testDescThreadpool: seq[string] = @[
"constantine/platforms/threadpool/examples/e01_simple_tasks.nim",
"constantine/platforms/threadpool/examples/e02_parallel_pi.nim",
# "constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim", # Need timing not implemented on Windows
"constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim",
"constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim",
"constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim",
# "constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim",
"constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim",
# "constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim", # Need timing not implemented on Windows
]
const benchDesc = [
"bench_fp",
"bench_fp_double_precision",
@ -301,7 +313,7 @@ proc clearParallelBuild() =
if fileExists(buildParallel):
rmFile(buildParallel)
template setupCommand(): untyped {.dirty.} =
template setupCommand(): untyped {.dirty.} =
var lang = "c"
if existsEnv"TEST_LANG":
lang = getEnv"TEST_LANG"
@ -367,7 +379,7 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
if not dirExists "build":
mkDir "build"
echo "Found " & $testDesc.len & " tests to run."
for td in testDesc:
if not(td.useGMP and not requireGMP):
var flags = ""
@ -379,17 +391,25 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
flags &= " -d:debugConstantine"
if td.path notin skipSanitizers:
flags &= sanitizers
cmdFile.testBatch(flags, td.path)
proc addTestSetNvidia(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescNvidia.len & " tests to run."
for path in testDescNvidia:
cmdFile.testBatch(flags = "", path)
proc addTestSetThreadpool(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescThreadpool.len & " tests to run."
for path in testDescThreadpool:
cmdFile.testBatch(flags = "--threads:on --linetrace:on", path)
proc addBenchSet(cmdFile: var string, useAsm = true) =
if not dirExists "build":
mkDir "build"
@ -491,13 +511,13 @@ task test_bindings, "Test C bindings":
# Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in an POSIX compatible shell
exec "gcc -Ibindings/generated -Lbindings/generated -o build/testsuite/t_libctt_bls12_381_dl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -lconstantine_bls12_381"
exec "./build/testsuite/t_libctt_bls12_381_dl.exe"
echo "--> Testing statically linked library"
when not defined(windows):
# Beware MacOS annoying linker with regards to static libraries
# The following standard way cannot be used on MacOS
# exec "gcc -Ibindings/generated -Lbindings/generated -o build/t_libctt_bls12_381_sl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
exec "gcc -Ibindings/generated -o build/testsuite/t_libctt_bls12_381_sl tests/bindings/t_libctt_bls12_381.c bindings/generated/libconstantine_bls12_381.a -lgmp"
exec "./build/testsuite/t_libctt_bls12_381_sl"
else:
@ -509,32 +529,40 @@ task test, "Run all tests":
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_no_asm, "Run all tests (no assembly)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_no_gmp, "Run tests that don't require GMP":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel, "Run all tests in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
@ -547,6 +575,13 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
clearParallelBuild()
@ -558,6 +593,13 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
clearParallelBuild()
@ -569,6 +611,13 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
clearParallelBuild()
@ -580,11 +629,26 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)"
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_threadpool, "Run all tests for the builtin threadpool":
var cmdFile: string
cmdFile.addTestSetThreadpool()
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
task test_nvidia, "Run all tests for Nvidia GPUs":
var cmdFile: string
cmdFile.addTestSetNvidia()
for cmd in cmdFile.splitLines():
exec cmd
if cmd != "": # Windows doesn't like empty commands
exec cmd
# Finite field 𝔽p
# ------------------------------------------

View File

@ -356,7 +356,7 @@ func div10*(a: var Limbs): SecretWord =
## Divide `a` by 10 in-place and return the remainder
result = Zero
const clz = WordBitWidth - 1 - log2_vartime(10)
const clz = WordBitWidth - 1 - log2_vartime(10'u32)
const norm10 = SecretWord(10) shl clz
for i in countdown(a.len-1, 0):

View File

@ -61,7 +61,7 @@ func invModBitwidth(a: BaseType): BaseType =
# which grows in O(log(log(a)))
debug: doAssert (a and 1) == 1, "a must be odd"
const k = log2_vartime(a.sizeof() * 8)
const k = log2_vartime(a.sizeof().uint32 * 8)
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
result *= 2 - a * result # x' = x(2 - ax)

View File

@ -282,7 +282,7 @@ func invModBitwidth*[T: SomeUnsignedInt](a: T): T =
# which grows in O(log(log(a)))
checkOdd(a)
let k = log2_vartime(T.sizeof() * 8)
let k = log2_vartime(T.sizeof().uint32 * 8)
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
result *= 2 - a * result # x' = x(2 - ax)

View File

@ -36,7 +36,7 @@ debug:
result[0] = '0'
result[1] = 'x'
var a = a
for j in countdown(result.len-1, 2):
for j in countdown(result.len-1, 0):
result[j] = hexChars.secretLookup(a and SecretWord 0xF)
a = a shr 4

View File

@ -291,8 +291,8 @@ func sum_batch_vartime*[F; G: static Subgroup](
const maxStride = maxChunkSize div sizeof(ECP_ShortW_Aff[F, G])
let n = min(maxStride, points.len)
let accumulators = alloca(ECP_ShortW_Aff[F, G], n)
let lambdas = alloca(tuple[num, den: F], n)
let accumulators = allocStackArray(ECP_ShortW_Aff[F, G], n)
let lambdas = allocStackArray(tuple[num, den: F], n)
for i in countup(0, points.len-1, maxStride):
let n = min(maxStride, points.len - i)

View File

@ -6,9 +6,23 @@ This folder holds:
to have the compiler enforce proper usage
- extended precision multiplication and division primitives
- assembly or builtin int128 primitives
- intrinsics
- an assembler
- Runtime CPU features detection
- SIMD intrinsics
- assemblers for x86 and LLVM IR
- a code generator for Nvidia GPU from LLVM IR
- runtime CPU features detection
- a threadpool
## Runtimes
Constantine strongly avoids runtime dependencies so that it can be used even where garbage collection and dynamic memory allocation
are not allowed. That also avoids secrets remaining in heap memory.
At runtime, Constantine may:
- detect the CPU features at the start of the application (in Nim) or after calling `ctt_myprotocol_init_NimMain()` for the C (or any other language) bindings.
It also offers the following opt-in features, which use dynamic allocation:
- a threadpool, only for explicitly tagged parallel primitives.
- use LLVM and Cuda, and configure code to run computation on GPUs.
## Security

View File

@ -6,6 +6,9 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises:[].} # No exceptions for crypto
{.push checks:off.} # No int->size_t exceptions
# ############################################################
#
# Allocators
@ -20,13 +23,107 @@
#
# stack allocation is strongly preferred where necessary.
# Bindings
# ----------------------------------------------------------------------------------
# We wrap them with int instead of size_t / csize_t
when defined(windows):
proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
proc alloca(size: int): pointer {.header: "<alloca.h>".}
template alloca*(T: typedesc): ptr T =
proc malloc(size: int): pointer {.sideeffect, header: "<stdlib.h>".}
proc free(p: pointer) {.sideeffect, header: "<stdlib.h>".}
when defined(windows):
proc aligned_alloc_windows(size, alignment: int): pointer {.sideeffect,importc:"_aligned_malloc", header:"<malloc.h>".}
# Beware of the arg order!
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
aligned_alloc_windows(size, alignment)
proc aligned_free(p: pointer){.sideeffect,importc:"_aligned_free", header:"<malloc.h>".}
elif defined(osx):
proc posix_memalign(mem: var pointer, alignment, size: int){.sideeffect,importc, header:"<stdlib.h>".}
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
posix_memalign(result, alignment, size)
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
else:
proc aligned_alloc(alignment, size: int): pointer {.sideeffect,importc, header:"<stdlib.h>".}
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
# Helpers
# ----------------------------------------------------------------------------------
proc isPowerOfTwo(n: int): bool {.inline.} =
(n and (n - 1)) == 0 and (n != 0)
func roundNextMultipleOf(x: int, n: static int): int {.inline.} =
## Round the input to the next multiple of "n"
when n.isPowerOfTwo():
# n is a power of 2. (If compiler cannot prove that x>0 it does not make the optim)
result = (x + n - 1) and not(n - 1)
else:
result = ((x + n - 1) div n) * n
# Stack allocation
# ----------------------------------------------------------------------------------
template allocStack*(T: typedesc): ptr T =
cast[ptr T](alloca(sizeof(T)))
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
template allocStackUnchecked*(T: typedesc, size: int): ptr T =
## Stack allocation for types containing a variable-sized UncheckedArray field
cast[ptr T](alloca(size))
template allocStackArray*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
# Heap allocation
# ----------------------------------------------------------------------------------
proc allocHeap*(T: typedesc): ptr T {.inline.} =
cast[type result](malloc(sizeof(T)))
proc allocHeapUnchecked*(T: typedesc, size: int): ptr T {.inline.} =
## Heap allocation for types containing a variable-sized UncheckedArray field
cast[type result](malloc(size))
proc allocHeapArray*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
cast[type result](malloc(len*sizeof(T)))
proc freeHeap*(p: pointer) {.inline.} =
free(p)
proc allocHeapAligned*(T: typedesc, alignment: static Natural): ptr T {.inline.} =
# aligned_alloc requires allocating in multiple of the alignment.
const
size = sizeof(T)
requiredMem = size.roundNextMultipleOf(alignment)
cast[ptr T](aligned_alloc(alignment, requiredMem))
proc allocHeapUncheckedAligned*(T: typedesc, size: int, alignment: static Natural): ptr T {.inline.} =
## Aligned heap allocation for types containing a variable-sized UncheckedArray field
## or an importc type with missing size information
# aligned_alloc requires allocating in multiple of the alignment.
let requiredMem = size.roundNextMultipleOf(alignment)
cast[ptr T](aligned_alloc(alignment, requiredMem))
proc allocHeapArrayAligned*(T: typedesc, len: int, alignment: static Natural): ptr UncheckedArray[T] {.inline.} =
# aligned_alloc requires allocating in multiple of the alignment.
let
size = sizeof(T) * len
requiredMem = size.roundNextMultipleOf(alignment)
cast[ptr UncheckedArray[T]](aligned_alloc(alignment, requiredMem))
proc allocHeapAlignedPtr*(T: typedesc[ptr], alignment: static Natural): T {.inline.} =
allocHeapAligned(typeof(default(T)[]), alignment)
proc allocHeapUncheckedAlignedPtr*(T: typedesc[ptr], size: int, alignment: static Natural): T {.inline.} =
## Aligned heap allocation for types containing a variable-sized UncheckedArray field
## or an importc type with missing size information
allocHeapUncheckedAligned(typeof(default(T)[]), size, alignment)
proc freeHeapAligned*(p: pointer) {.inline.} =
aligned_free(p)

View File

@ -32,7 +32,10 @@ import ./compilers/bitops
#
# See: https://www.chessprogramming.org/BitScan
# https://www.chessprogramming.org/General_Setwise_Operations
# https://www.chessprogramming.org/De_Bruijn_Sequence_Generator
# and https://graphics.stanford.edu/%7Eseander/bithacks.html
# and Hacker's Delight 2nd Edition, Henry S Warren, Jr.
# and https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator
# for compendiums of bit manipulation
func clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} =
@ -43,77 +46,106 @@ func clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} =
## Returns ``v``, with the bit at position ``bit`` set to 0
v.clearMask(1.T shl bit)
func log2impl_vartime(x: uint32): uint32 =
func log2_impl_vartime(n: uint32): uint32 =
## Find the log base 2 of a 32-bit or less integer.
## using De Bruijn multiplication
## Works at compile-time.
## ⚠️ not constant-time, table accesses are not uniform.
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
var v = x
v = v or v shr 1 # first round down to one less than a power of 2
v = v or v shr 2
v = v or v shr 4
v = v or v shr 8
v = v or v shr 16
lookup[(v * 0x07C4ACDD'u32) shr 27]
const lookup: array[32, uint8] = [
uint8 0, 9, 1, 10, 13, 21, 2, 29,
11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7,
19, 27, 23, 6, 26, 5, 4, 31]
# Isolate MSB
var n = n
n = n or n shr 1 # first round down to one less than a power of 2
n = n or n shr 2
n = n or n shr 4
n = n or n shr 8
n = n or n shr 16
uint32 lookup[(n * 0x07C4ACDD'u32) shr 27]
func log2impl_vartime(x: uint64): uint64 {.inline.} =
func log2_impl_vartime(n: uint64): uint64 {.inline.} =
## Find the log base 2 of a 32-bit or less integer.
## using De Bruijn multiplication
## Works at compile-time.
## ⚠️ not constant-time, table accesses are not uniform.
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
var v = x
v = v or v shr 1 # first round down to one less than a power of 2
v = v or v shr 2
v = v or v shr 4
v = v or v shr 8
v = v or v shr 16
v = v or v shr 32
lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
# https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers
const lookup: array[64, uint8] = [
uint8 0, 58, 1, 59, 47, 53, 2, 60,
39, 48, 27, 54, 33, 42, 3, 61,
51, 37, 40, 49, 18, 28, 20, 55,
30, 34, 11, 43, 14, 22, 4, 62,
57, 46, 52, 38, 26, 32, 41, 50,
36, 17, 19, 29, 10, 13, 21, 56,
45, 25, 31, 35, 16, 9, 12, 44,
24, 15, 8, 23, 7, 6, 5, 63]
# Isolate MSB
var n = n
n = n or n shr 1 # first round down to one less than a power of 2
n = n or n shr 2
n = n or n shr 4
n = n or n shr 8
n = n or n shr 16
n = n or n shr 32
uint64 lookup[(n * 0x03F6EAF2CD271461'u64) shr 58]
func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} =
## Find the log base 2 of an integer
##
## ⚠ With GCC and Clang compilers on x86, if n is zero, result is undefined.
when nimvm:
when sizeof(T) == sizeof(uint64):
T(log2impl_vartime(uint64(n)))
when sizeof(T) == 8:
T(log2_impl_vartime(uint64(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2impl_vartime(uint32(n)))
T(log2_impl_vartime(uint32(n)))
else:
when sizeof(T) == sizeof(uint64):
T(log2_c_compiler_vartime(uint64(n)))
log2_c_compiler_vartime(n)
func ctz_impl_vartime(n: uint32): uint32 =
## Find the number of trailing zero bits
## Requires n != 0
# https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator
const lookup: array[32, uint8] = [
uint8 0, 1, 16, 2, 29, 17, 3, 22,
30, 20, 18, 11, 13, 4, 7, 23,
31, 15, 28, 21, 19, 10, 12, 6,
14, 27, 9, 5, 26, 8, 25, 24]
let isolateLSB = n xor (n-1)
uint32 lookup[(isolateLSB * 0x6EB14F9'u32) shr 27]
func ctz_impl_vartime(n: uint64): uint64 =
## Find the number of trailing zero bits
## Requires n != 0
# https://www.chessprogramming.org/BitScan#Bitscan_forward
const lookup: array[64, uint8] = [
uint8 0, 47, 1, 56, 48, 27, 2, 60,
57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44,
38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53,
34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24,
13, 18, 8, 12, 7, 6, 5, 63]
let isolateLSB = n xor (n-1)
uint64 lookup[(isolateLSB * 0x03f79d71b4cb0a89'u64) shr 58]
func countTrailingZeroBits*[T: SomeUnsignedInt](n: T): T {.inline.} =
## Count the number of trailing zero bits of an integer
when nimvm:
if n == 0:
T(sizeof(n) * 8)
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2_c_compiler_vartime(uint32(n)))
func hammingWeight*(x: uint32): uint {.inline.} =
## Counts the set bits in integer.
# https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
var v = x
v = v - ((v shr 1) and 0x55555555)
v = (v and 0x33333333) + ((v shr 2) and 0x33333333)
uint(((v + (v shr 4) and 0xF0F0F0F) * 0x1010101) shr 24)
func hammingWeight*(x: uint64): uint {.inline.} =
## Counts the set bits in integer.
# https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
var v = x
v = v - ((v shr 1'u64) and 0x5555555555555555'u64)
v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64)
v = (v + (v shr 4'u64) and 0x0F0F0F0F0F0F0F0F'u64)
uint((v * 0x0101010101010101'u64) shr 56'u64)
func countLeadingZeros_vartime*[T: SomeUnsignedInt](x: T): T {.inline.} =
(8*sizeof(T)) - 1 - log2_vartime(x)
when sizeof(T) == 8:
T(ctz_impl_vartime(uint64(n)))
else:
T(ctz_impl_vartime(uint32(n)))
else:
ctz_c_compiler_vartime(n)
func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
## Returns true if n is a power of 2
@ -121,7 +153,7 @@ func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
## for compile-time or explicit vartime proc only.
(n and (n - 1)) == 0
func nextPowerOf2_vartime*(n: uint64): uint64 {.inline.} =
func nextPowerOfTwo_vartime*(n: uint32): uint32 {.inline.} =
## Returns x if x is a power of 2
## or the next biggest power of 2
1'u64 shl (log2_vartime(n-1) + 1)
1'u32 shl (log2_vartime(n-1) + 1)

View File

@ -15,54 +15,101 @@ when GCC_Compatible:
func builtin_clzll(n: uint64): cint {.importc: "__builtin_clzll", nodecl.}
## Count the number of leading zeros
## undefined if n is zero
func builtin_ctz(n: uint32): cint {.importc: "__builtin_ctz", nodecl.}
## Count the number of trailing zeros
## undefined if n is zero
func builtin_ctzll(n: uint64): cint {.importc: "__builtin_ctzll", nodecl.}
## Count the number of trailing zeros
## undefined if n is zero
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](31 - cast[cuint](builtin_clz(n.uint32)))
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
if n == 0:
0
else:
when sizeof(n) == 8:
cint(64) - builtin_clzll(n)
else:
cint(31) - builtin_clz(n.uint32)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zeros
## in the bit representation of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](63 - cast[cuint](builtin_clzll(n)))
if n == 0:
sizeof(n) * 8
else:
when sizeof(n) == 8:
builtin_ctzll(n)
else:
builtin_ctz(n.uint32)
elif defined(icc):
func bitScanReverse(r: var uint32, n: uint32): uint8 {.importc: "_BitScanReverse", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanReverse64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanReverse64", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
template bitscan(fnc: untyped; v: untyped): int {.inline.} =
template bitscan(fnc: untyped; v: untyped, default: static int): int {.inline.} =
var index: uint32
if fnc(index.addr, v) == 0:
return 0
return default
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, c.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
when sizeof(n) == 8:
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, c.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
when sizeof(n) == 8:
bitscan(bitScanForward64, n, default = 0)
else:
bitscan(bitScanForward, c.uint32, default = 0)
elif defined(vcc):
func bitScanReverse(p: ptr uint32, b: uint32): uint8 {.importc: "_BitScanReverse", header: "<intrin.h>".}
## Returns 0 if n s no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanReverse64(p: ptr uint32, b: uint64): uint8 {.importc: "_BitScanReverse64", header: "<intrin.h>".}
## Returns 0 if n s no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
## from MSB to LSB
func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "<intrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "<intrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
## from LSB to MSB
template bitscan(fnc: untyped; v: untyped): int =
var index: uint32
@ -70,18 +117,25 @@ elif defined(vcc):
return 0
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, c.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
when sizeof(n) == 8:
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, c.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
when sizeof(n) == 8:
bitscan(bitScanForward64, n, default = sizeof(n) * 8)
else:
bitscan(bitScanForward, c.uint32, default = sizeof(n) * 8)
else:
{. error: "Unsupported compiler".}

View File

@ -0,0 +1,38 @@
# Constantine Threadpool
## API
The API spec follows https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api
## Overview
This implements a lightweight, energy-efficient, easily auditable multithreaded threadpool.
This threadpool's desirable properties are:
- Ease of auditing and maintenance.
- Resource-efficient. Threads spindown to save power, low memory use.
- Decent performance and scalability. The CPU should spend its time processing user workloads
  and not dealing with threadpool contention, latencies and overheads.
Compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs:
- Constantine's threadpool only provides spawn/sync (task parallelism).\
There is no (extremely) optimized parallel for (data parallelism)\
or precise in/out dependencies (events / dataflow parallelism).
- Constantine's threadpool has been significantly optimized to provide
overhead lower than Weave's default (and as low as Weave "lazy" + "alloca" allocation scheme)
- Constantine's threadpool provides the same adaptive scheduling strategy as Weave
  with an additional enhancement (leapfrogging)
Compared to [nim-taskpools](https://github.com/status-im), here are the tradeoffs:
- Constantine does not use std/tasks:
- No external dependencies at runtime (apart from compilers, OS and drivers)
- We can replace Task with an intrusive linked list
- Furthermore we can embed tasks in their future
- Hence allocation/scheduler overhead is 3x less than nim-taskpools as we fuse the following allocations:
- Task
- The linked list of tasks
- The future (Flowvar) result channel
- Contention improvement, Constantine is entirely lock-free while Nim-taskpools need a lock+condition variable for putting threads to sleep
- Powersaving improvement, threads sleep when waiting for a task and no work is available.
- Scheduling improvement, Constantine's threadpool incorporates Weave's adaptive scheduling policy with an additional enhancement (leapfrogging)

View File

@ -0,0 +1,11 @@
# BPC (Bouncing Producer-Consumer)
From [tasking-2.0](https://github.com/aprell/tasking-2.0) description
> **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far
> as I know, first described by [Dinan et al][1]. There are two types of
> tasks, producer and consumer tasks. Each producer task creates another
> producer task followed by *n* consumer tasks, until a certain depth *d* is
> reached. Consumer tasks run for *t* microseconds. The smaller the values of
> *n* and *t*, the harder it becomes to exploit the available parallelism. A
> solid contender for the most antagonistic microbenchmark.

View File

@ -0,0 +1,157 @@
import
# STD lib
system/ansi_c, std/[os, strutils, cpuinfo, strformat, math],
# Library
../../threadpool,
# bench
../wtime, ../resources
var
Depth: int32 # For example 10000
NumTasksPerDepth: int32 # For example 9
# The total number of tasks in the BPC benchmark is
# (NumTasksPerDepth + 1) * Depth
NumTasksTotal: int32
TaskGranularity: int32 # in microseconds
PollInterval: float64 # in microseconds
tp: Threadpool
var global_poll_elapsed {.threadvar.}: float64
template dummy_cpt(): untyped =
  # Dummy computation to burn CPU time without touching shared memory.
  # Calculate fib(30) iteratively; the result is intentionally unused.
  var
    fib = 0
    f2 = 0
    f1 = 1
  for i in 2 .. 30:
    fib = f1 + f2
    f2 = f1
    f1 = fib
proc bpc_consume(usec: int32) =
  ## Consumer task: spin on dummy work for `usec` microseconds.
  ## Time spent in (currently disabled) polling is excluded from the
  ## measured task duration via `pollElapsed`.
  var pollElapsed = 0'f64

  let start = wtime_usec()
  let stop = usec.float64
  global_poll_elapsed = PollInterval

  while true:
    var elapsed = wtime_usec() - start
    elapsed -= pollElapsed  # don't count polling overhead as task work
    if elapsed >= stop:
      break

    dummy_cpt()

    # Disabled: periodic cooperative load balancing (Weave-style).
    # if elapsed >= global_poll_elapsed:
    #   let pollStart = wtime_usec()
    #   loadBalance(Weave)
    #   pollElapsed += wtime_usec() - pollStart
    #   global_poll_elapsed += PollInterval
proc bpc_consume_nopoll(usec: int32) =
  ## Consumer task without any polling: busy-work for `usec` microseconds.
  let t0 = wtime_usec()
  let deadline = usec.float64
  while wtime_usec() - t0 < deadline:
    dummy_cpt()
proc bpc_produce(n, d: int32) {.gcsafe.} =
  ## Producer task: while depth `d` > 0, spawn one child producer at
  ## depth d-1 plus `n` consumer tasks. At depth 0 the recursion stops
  ## and no consumers are spawned.
  if d > 0:
    # Create producer task
    tp.spawn bpc_produce(n, d-1)
  else:
    return

  # Followed by n consumer tasks
  for i in 0 ..< n:
    tp.spawn bpc_consume(TaskGranularity)
proc main() =
  ## Parse CLI arguments, run the BPC benchmark on Constantine's
  ## threadpool, and report wall time plus memory statistics.
  Depth = 10000
  NumTasksPerDepth = 999
  TaskGranularity = 1

  if paramCount() == 0:
    # No arguments: print usage, then fall through with the defaults above.
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth: {Depth}> " &
      &"<# of tasks per depth: {NumTasksPerDepth}> " &
      &"[task granularity (us): {TaskGranularity}] " &
      &"[polling interval (us): task granularity]"
    echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"

  if paramCount() >= 1:
    Depth = paramStr(1).parseInt.int32
  if paramCount() >= 2:
    NumTasksPerDepth = paramStr(2). parseInt.int32
  if paramCount() >= 3:
    TaskGranularity = paramStr(3). parseInt.int32
  if paramCount() == 4:
    PollInterval = paramStr(4).parseInt.float64
  else:
    # Default: poll as often as the task granularity.
    PollInterval = TaskGranularity.float64
  if paramCount() > 4:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth: {Depth}> " &
      &"<# of tasks per depth: {NumTasksPerDepth}> " &
      &"[task granularity (us): {TaskGranularity}] " &
      &"[polling interval (us): task granularity]"
    quit 1

  NumTasksTotal = (NumTasksPerDepth + 1) * Depth

  var nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  tp = Threadpool.new(numThreads = nthreads)

  # measure overhead during tasking
  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  let start = wtime_msec()
  bpc_produce(NumTasksPerDepth, Depth)
  tp.syncAll()  # wait for every task spawned (transitively) by the producers
  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss  # RSS growth during the benchmark
  flt = ru.ru_minflt - flt  # minor page faults during the benchmark

  tp.shutdown()

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: BPC (Bouncing Producer-Consumer)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", round(stop - start, 3)
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"
  echo "# of tasks: ", NumTasksTotal
  echo "# of tasks/depth: ", NumTasksPerDepth
  echo "Depth: ", Depth
  echo "Task granularity (us): ", TaskGranularity
  echo "Polling / manual load balancing interval (us): ", PollInterval
  echo "--------------------------------------------------------------------------"
  quit 0
main()

View File

@ -0,0 +1,86 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime
var tp: Threadpool
proc dfs(depth, breadth: int): uint32 {.gcsafe.} =
  ## Depth-first task tree: each node spawns `breadth` children until
  ## `depth` reaches 0, then sums the leaves (result is breadth^depth).
  if depth == 0:
    return 1

  # We could use alloca to avoid heap allocation here
  var pending = newSeq[Flowvar[uint32]](breadth)
  for k in 0 ..< breadth:
    pending[k] = tp.spawn dfs(depth - 1, breadth)

  for k in 0 ..< breadth:
    result += sync(pending[k])
proc test(depth, breadth: int): uint32 =
  ## Entry point: spawn the root dfs task and wait for the whole tree.
  result = sync tp.spawn dfs(depth, breadth)
proc main() =
  ## Parse CLI arguments, run the dfs benchmark and report wall time.
  var
    depth = 8
    breadth = 8
    answer: uint32
    nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
    echo &"Running with default config depth = {depth} and breadth = {breadth}"
  if paramCount() >= 1:
    depth = paramStr(1).parseInt()
  if paramCount() == 2:
    breadth = paramStr(2).parseInt()
  if paramCount() > 2:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
    echo &"Up to 2 parameters are valid. Received {paramCount()}"
    quit 1

  # Staccato benches runtime init and exit as well
  when not defined(windows):
    let start = wtime_usec()

  # Bug fix: honor CTT_NUM_THREADS — `nthreads` was previously computed
  # and reported, but the pool was created with the default thread count.
  tp = Threadpool.new(numThreads = nthreads)
  answer = test(depth, breadth)
  tp.shutdown()

  when not defined(windows):
    let stop = wtime_usec()

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: dfs"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(us) ", stop - start
  echo "Output: ", answer
  echo "--------------------------------------------------------------------------"

  quit 0
main()

View File

@ -0,0 +1,77 @@
# Fibonacci benchmarks
⚠️ Disclaimer:
Please don't use parallel fibonacci in production!
Use the fast doubling method with memoization instead.
Fibonacci benchmark has 3 draws:
1. It's very simple to implement
2. It's unbalanced, and efficiency requires distributing work to avoid idle cores.
3. It's a very effective scheduler overhead benchmark, because the basic task is very trivial and the task spawning grows at 2^n scale.
Want to know the difference between low and high overhead?
Run the following C code (taken from [Oracle OpenMP example](https://docs.oracle.com/cd/E19205-01/820-7883/girtd/index.html))
```C
#include <stdio.h>
#include <omp.h>
int fib(int n)
{
int i, j;
if (n<2)
return n;
else
{
#pragma omp task shared(i) firstprivate(n)
{
i=fib(n-1);
}
j=fib(n-2);
#pragma omp taskwait
return i+j;
}
}
int main()
{
int n = 40;
#pragma omp parallel shared(n)
{
#pragma omp single
printf ("fib(%d) = %d\n", n, fib(n));
}
}
```
First compile with Clang and run it
```
clang -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
time a.out
```
It should be fairly quick
Then compile with GCC and run it
```
gcc -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
time a.out
```
Notice how some cores get idle as time goes on?
Don't forget to kill the benchmark, you'll be there all day.
What's happening?
GCC's OpenMP implementation uses a single queue for all tasks.
That queue gets constantly hammered by all threads and becomes a contention point.
Furthermore, it seems like there is no load balancing or that due to the contention/lock
threads are descheduled.
However Clang implementation uses a work-stealing scheduler with one queue per thread.
The only contention happens when a thread run out of work and has to look for more work,
in the queue of other threads. And which thread to check is chosen at random so
the potential contention is distributed among all threads instead of a single structure.

View File

@ -0,0 +1,35 @@
import
# STD lib
std/[os, strutils, threadpool, strformat],
# bench
../wtime
# Using Nim's standard threadpool
# Compile with "nim c --threads:on -d:release -d:danger --outdir:build benchmarks/fibonacci/stdnim_fib.nim"
#
# Note: it breaks at fib 16.
proc parfib(n: uint64): uint64 =
  ## Parallel Fibonacci on Nim's standard threadpool.
  ## Note: compare n < 2 and return n (fib(0)=0, fib(1)=1),
  ## not n <= 2 -> return 1.
  if n < 2:
    return n
  let upper = spawn parfib(n - 1)  # offload the bigger half
  let lower = parfib(n - 2)        # compute the smaller half inline
  result = ^upper + lower
proc main() =
  ## Parse the requested Fibonacci index, time the parallel computation
  ## and print the result with elapsed wall time.
  if paramCount() != 1:
    echo "Usage: fib <n-th fibonacci number requested>"
    quit 0

  let n = paramStr(1).parseUInt.uint64

  let start = wtime_msec()
  let f = parfib(n)
  let stop = wtime_msec()

  echo "Result: ", f
  echo &"Elapsed wall time: {stop-start:.2} ms"
main()

View File

@ -0,0 +1,79 @@
import
# STD lib
std/[os, strutils, cpuinfo, strformat, math],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
var tp: Threadpool
proc fib(n: int): int =
  ## Naive parallel Fibonacci: spawn fib(n-1) on the pool and
  ## compute fib(n-2) inline, then join.
  # int64 on x86-64
  if n < 2:
    return n
  let pending = tp.spawn fib(n-1)
  let inline = fib(n-2)
  result = sync(pending) + inline
proc main() =
  ## Parse CLI arguments, run parallel fib on Constantine's threadpool
  ## and report wall time plus memory statistics (POSIX only).
  var n = 40
  var nthreads: int

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n-th fibonacci number requested:{n}> "
    echo &"Running with default n = {n}"
  elif paramCount() == 1:
    n = paramStr(1).parseInt
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n-th fibonacci number requested:{n}>"
    quit 1

  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  # Bug fix: honor CTT_NUM_THREADS — the pool was previously created
  # with Threadpool.new() defaults while `nthreads` was only reported.
  tp = Threadpool.new(numThreads = nthreads)

  # measure overhead during tasking
  when not defined(windows):
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt
    let start = wtime_msec()

  let f = fib(n)

  when not defined(windows):
    let stop = wtime_msec()

  tp.shutdown()

  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss  # RSS growth during the benchmark
    flt = ru.ru_minflt - flt  # minor page faults during the benchmark

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: Fibonacci"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(ms) ", round(stop - start, 3)
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"
  echo "n requested: ", n
  echo "result: ", f
main()

View File

@ -0,0 +1,300 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# From fibril
#
# Original license
#
# /*
# * Heat diffusion (Jacobi-type iteration)
# *
# * Volker Strumpen, Boston August 1996
# *
# * Copyright (c) 1996 Massachusetts Institute of Technology
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
std/threadpool,
# bench
../wtime, ../resources
# This deadlocks :/
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var seq[seq[float64]]" to other threads
# nor "var" for that matter
type
Matrix[T] = object
buffer: ptr UncheckedArray[T]
m, n: int
Row[T] = object
buffer: ptr UncheckedArray[T]
len: int
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
result.m = m
result.n = n
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col] = value
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a non-owning view over row `rowIdx` of `mat`.
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # Bug fix: a row has `n` elements (the column count), not `m`.
  # With `mat.m` the bounds asserts on Row accesses were wrong for
  # non-square matrices (here nx = 4096, ny = 1024).
  result.len = mat.n
template `[]`[T](row: Row[T], idx: Natural): T =
assert idx < row.len
row.buffer[idx]
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
assert idx < row.len
row.buffer[idx] = value
func delete[T](mat: sink Matrix[T]) =
c_free(mat.buffer)
# And an auto converter for int32 -> float64 so we don't have to convert
# all i, j indices manually
converter i32toF64(x: int32): float64 {.inline.} =
float64(x)
# -------------------------------------------------------
template f(x, y: SomeFloat): SomeFloat =
sin(x) * sin(y)
template randa[T: SomeFloat](x, t: T): T =
T(0.0)
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(x)
template randc[T: SomeFloat](y, t: T): T =
T(0.0)
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(y)
template solu(x, y, t: SomeFloat): SomeFloat =
exp(-2 * t) * sin(x) * sin(y)
const n = 4096'i32
var
nx, ny, nt: int32
xu, xo, yu, yo, tu, to: float64
dx, dy, dt: float64
dtdxsq, dtdysq: float64
odd: Matrix[float64]
even: Matrix[float64]
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
# TODO to allow awaiting `heat` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let h = spawn heat(m, il, im)
heat(m, im, iu)
discard ^h
return true
# ------------------------
let i = il
let row = m.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, 0)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, 0)
else:
row[0] = randa(xu + i*dx, 0)
for j in 1 ..< ny - 1:
row[j] = f(xu + i*dx, yu + j*dy)
row[ny - 1] = randb(xu + i*dx, 0)
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
# TODO to allow awaiting `diffuse` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let d = spawn diffuse(output, input, il, im, t)
diffuse(output, input, im, iu, t)
discard ^d
return true
# ------------------------
let i = il
let row = output.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, t)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, t)
else:
row[0] = randa(xu + i*dx, t)
for j in 1 ..< ny - 1:
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
row[ny - 1] = randb(xu + i*dx, t)
proc initTest() =
nx = n
ny = 1024
nt = 100
xu = 0.0
xo = 1.570796326794896558
yu = 0.0
yo = 1.570796326794896558
tu = 0.0
to = 0.0000001
dx = (xo - xu) / float64(nx - 1)
dy = (yo - yu) / float64(ny - 1)
dt = (to - tu) / float64(nt)
dtdxsq = dt / (dx * dx)
dtdysq = dt / (dy * dy)
even = newMatrix[float64](nx, ny)
odd = newMatrix[float64](nx, ny)
proc prep() =
heat(even, 0, nx)
proc test() =
var t = tu
for _ in countup(1, nt.int, 2):
# nt included
t += dt
diffuse(odd, even, 0, nx, t)
t += dt
diffuse(even, odd, 0, nx, t)
if nt mod 2 != 0:
t += dt
diffuse(odd, even, 0, nx, t)
proc verify() =
var
mat: Matrix[float64]
mae: float64
mre: float64
me: float64
mat = if nt mod 2 != 0: odd else: even
for a in 0 ..< nx:
for b in 0 ..< ny:
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
if tmp > 1e-3:
echo "nx: ", nx, " - ny: ", ny
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
quit 1
me += tmp
if tmp > mae: mae = tmp
if mat[a, b] != 0.0: tmp /= mat[a, b]
if tmp > mre: mre = tmp
me /= nx * ny
if mae > 1e-12:
echo &"Local maximal absolute error {mae:1.3e}"
quit 1
if mre > 1e-12:
echo &"Local maximal relative error {mre:1.3e}"
quit 1
if me > 1e-12:
echo &"Global mean absolute error {me:1.3e}"
quit 1
echo "Verification successful"
proc main() =
var nthreads: int
nthreads = countProcessors()
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
initTest()
prep()
let start = wtime_usec()
test()
let stop = wtime_usec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
sync()
verify()
delete(even)
delete(odd)
echo "Scheduler: Nim threadpool (standard lib)"
echo "Benchmark: heat"
echo "Threads: ", nthreads
echo "Time(us) ", stop - start
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
quit 0
main()

View File

@ -0,0 +1,314 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# From fibril
#
# Original license
#
# /*
# * Heat diffusion (Jacobi-type iteration)
# *
# * Volker Strumpen, Boston August 1996
# *
# * Copyright (c) 1996 Massachusetts Institute of Technology
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var seq[seq[float64]]" to other threads
# nor "var" for that matter
type
Matrix[T] = object
buffer: ptr UncheckedArray[T]
m, n: int
Row[T] = object
buffer: ptr UncheckedArray[T]
len: int
var tp: Threadpool
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
result.m = m
result.n = n
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
assert row < mat.m
assert col < mat.n
mat.buffer[row * mat.n + col] = value
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a non-owning view over row `rowIdx` of `mat`.
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # Bug fix: a row has `n` elements (the column count), not `m`.
  # With `mat.m` the bounds asserts on Row accesses were wrong for
  # non-square matrices (here nx = 4096, ny = 1024).
  result.len = mat.n
template `[]`[T](row: Row[T], idx: Natural): T =
assert idx < row.len
row.buffer[idx]
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
assert idx < row.len
row.buffer[idx] = value
func delete[T](mat: sink Matrix[T]) =
c_free(mat.buffer)
# And an auto converter for int32 -> float64 so we don't have to convert
# all i, j indices manually
converter i32toF64(x: int32): float64 {.inline.} =
float64(x)
# -------------------------------------------------------
template f(x, y: SomeFloat): SomeFloat =
sin(x) * sin(y)
template randa[T: SomeFloat](x, t: T): T =
T(0.0)
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(x)
template randc[T: SomeFloat](y, t: T): T =
T(0.0)
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
# proc instead of template to avoid Nim constant folding bug:
# https://github.com/nim-lang/Nim/issues/12783
exp(-2 * t) * sin(y)
template solu(x, y, t: SomeFloat): SomeFloat =
exp(-2 * t) * sin(x) * sin(y)
const n = 4096'i32
var
nx, ny, nt: int32
xu, xo, yu, yo, tu, to: float64
dx, dy, dt: float64
dtdxsq, dtdysq: float64
odd: Matrix[float64]
even: Matrix[float64]
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable, gcsafe.}=
# TODO to allow awaiting `heat` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let h = tp.spawn heat(m, il, im)
heat(m, im, iu)
discard sync(h)
return true
# ------------------------
let i = il
let row = m.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, 0)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, 0)
else:
row[0] = randa(xu + i*dx, 0)
for j in 1 ..< ny - 1:
row[j] = f(xu + i*dx, yu + j*dy)
row[ny - 1] = randb(xu + i*dx, 0)
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable, gcsafe.} =
# TODO to allow awaiting `diffuse` we return a dummy bool
# The parallel spawns are updating the same matrix cells otherwise
if iu - il > 1:
let im = (il + iu) div 2
let d = tp.spawn diffuse(output, input, il, im, t)
diffuse(output, input, im, iu, t)
discard sync(d)
return true
# ------------------------
let i = il
let row = output.getRow(i)
if i == 0:
for j in 0 ..< ny:
row[j] = randc(yu + j*dy, t)
elif i == nx - 1:
for j in 0 ..< ny:
row[j] = randd(yu + j*dy, t)
else:
row[0] = randa(xu + i*dx, t)
for j in 1 ..< ny - 1:
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
row[ny - 1] = randb(xu + i*dx, t)
proc initTest() =
nx = n
ny = 1024
nt = 100
xu = 0.0
xo = 1.570796326794896558
yu = 0.0
yo = 1.570796326794896558
tu = 0.0
to = 0.0000001
dx = (xo - xu) / float64(nx - 1)
dy = (yo - yu) / float64(ny - 1)
dt = (to - tu) / float64(nt)
dtdxsq = dt / (dx * dx)
dtdysq = dt / (dy * dy)
even = newMatrix[float64](nx, ny)
odd = newMatrix[float64](nx, ny)
proc prep() =
heat(even, 0, nx)
proc test() =
var t = tu
for _ in countup(1, nt.int, 2):
# nt included
t += dt
diffuse(odd, even, 0, nx, t)
t += dt
diffuse(even, odd, 0, nx, t)
if nt mod 2 != 0:
t += dt
diffuse(odd, even, 0, nx, t)
proc verify() =
var
mat: Matrix[float64]
mae: float64
mre: float64
me: float64
mat = if nt mod 2 != 0: odd else: even
for a in 0 ..< nx:
for b in 0 ..< ny:
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
if tmp > 1e-3:
echo "nx: ", nx, " - ny: ", ny
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
quit 1
me += tmp
if tmp > mae: mae = tmp
if mat[a, b] != 0.0: tmp /= mat[a, b]
if tmp > mre: mre = tmp
me /= nx * ny
if mae > 1e-12:
echo &"Local maximal absolute error {mae:1.3e}"
quit 1
if mre > 1e-12:
echo &"Local maximal relative error {mre:1.3e}"
quit 1
if me > 1e-12:
echo &"Global mean absolute error {me:1.3e}"
quit 1
echo "Heat: Verification successful"
proc main() =
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
initTest()
# Fibril initializes before benching
tp = Threadpool.new(numThreads = nthreads)
prep()
when not defined(windows):
let start = wtime_usec()
test()
when not defined(windows):
let stop = wtime_usec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
tp.shutdown()
verify()
delete(even)
delete(odd)
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: heat"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(us) ", stop - start
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
quit 0
main()

View File

@ -0,0 +1,12 @@
# Cache-Oblivious Matrix Multiplication
From Staccato and Cilk
https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk
See the paper ``Cache-Oblivious Algorithms'', by
Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
Sridhar Ramachandran, FOCS 1999, for an explanation of
why this algorithm is good for caches.
Note that the benchmarks output incorrect matrix traces
according to the check ...

View File

@ -0,0 +1,214 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Rectangular matrix multiplication.
#
# Adapted from Cilk 5.4.3 example
#
# https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk;
# See the paper ``Cache-Oblivious Algorithms'', by
# Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
# Sridhar Ramachandran, FOCS 1999, for an explanation of
# why this algorithm is good for caches.
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Library
../../threadpool,
# bench
../wtime, ../resources
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var" to other threads
type
Matrix[T: SomeFloat] = object
buffer: ptr UncheckedArray[T]
ld: int
var tp: Threadpool
func newMatrixNxN[T](n: int): Matrix[T] {.inline.} =
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t n*n*sizeof(T)))
result.ld = n
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  ## Row-major element read with bounds assertions.
  # Bug fix: the assertion messages referenced an undefined `i`;
  # use the actual `row` / `col` indices instead.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  ## Row-major element write with bounds assertions.
  # Bug fix: the assertion messages referenced an undefined `i`;
  # use the actual `row` / `col` indices instead.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col] = value
func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}=
  ## Returns a new view offset by the row and column stride
  result.buffer = cast[ptr UncheckedArray[T]](
    addr mat.buffer[row*mat.ld + col]
  )
  # Bug fix: propagate the leading dimension. It was left at its zero
  # default, so indexing through a strided view computed wrong offsets
  # (row * 0 + col) and tripped the `< mat.ld` assertions.
  result.ld = mat.ld
func delete[T](mat: sink Matrix[T]) =
c_free(mat.buffer)
# -------------------------------------------------------
proc xorshiftRand(): uint32 =
  ## Xorshift32 PRNG; the state persists across calls via the
  ## {.global.} variable, so the sequence is deterministic per process.
  var x {.global.} = uint32(2463534242)
  x = x xor (x shr 13)
  x = x xor (x shl 17)
  x = x xor (x shr 5)
  return x
func zero[T](A: Matrix[T]) =
  ## Zero the whole n*n buffer.
  # zeroing is not timed
  zeroMem(A.buffer, A.ld * A.ld * sizeof(T))

proc fill[T](A: Matrix[T]) =
  ## Fill the matrix with pseudo-random values in [0, ld).
  for i in 0 ..< A.ld:
    for j in 0 ..< A.ld:
      A[i, j] = T(xorshiftRand() mod A.ld.uint32)
func maxError(A, B: Matrix): float64 =
  ## Largest element-wise relative error between A and B
  ## (relative to A's entries).
  assert A.ld == B.ld
  for r in 0 ..< A.ld:
    for c in 0 ..< A.ld:
      let relErr = abs((A[r, c] - B[r, c]) / A[r, c])
      if relErr > result:
        result = relErr
func check[T](A, B, C: Matrix[T], n: int): bool =
  ## Sanity check: trace(A*B) should match trace(C) within tolerance.
  var
    traceC = 0.T
    traceAB = 0.T
  for r in 0 ..< n:
    for c in 0 ..< n:
      traceAB += A[r, c] * B[c, r]
    traceC += C[r, r]
  # Note, all benchmarks return false ¯\_(ツ)_/¯
  return abs(traceAB - traceC) < 1e-3
proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool =
  ## Cache-oblivious recursive matrix multiplication: C (+)= A*B where
  ## A is m*n, B is n*p, C is m*p. The largest dimension is halved and
  ## both halves run in parallel, down to a base-case threshold.
  # The original bench passes around a ``ld`` parameter (leading dimension?),
  # we store it in the matrices
  # We return a dummy bool to allow waiting on the matmul

  # Threshold
  if (m + n + p) <= 64:
    if add:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] += c
    else:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] = c
    return

  var h0, h1: FlowVar[bool]
  ## Each half of the computation

  # matrix is larger than threshold
  if m >= n and n >= p:
    let m1 = m shr 1 # divide by 2
    h0 = tp.spawn matmul(A, B, C, m1, n, p, add)
    h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add)
  elif n >= m and n >= p:
    let n1 = n shr 1 # divide by 2
    h0 = tp.spawn matmul(A, B, C, m, n1, p, add)
    # Splitting the shared dimension: the second half must accumulate
    # into the same C region, hence add = true.
    h1 = tp.spawn matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true)
  else:
    let p1 = p shr 1
    h0 = tp.spawn matmul(A, B, C, m, n, p1, add)
    h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add)

  discard sync(h0)
  discard sync(h1)
proc main() =
  ## Parse CLI arguments, run the cache-oblivious matmul benchmark and
  ## report timing, memory statistics and the trace-based sanity check.
  echo "Warning the benchmark seems to not be correct."
  var
    n = 3000
    nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Running with default config n = {n}"
  elif paramCount() == 1:
    n = paramStr(1).parseInt()
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Up to 1 parameter is valid. Received {paramCount()}"
    quit 1

  var A = newMatrixNxN[float32](n)
  var B = newMatrixNxN[float32](n)
  var C = newMatrixNxN[float32](n)

  fill(A)
  fill(B)
  zero(C)  # C must start zeroed: the root call uses add = false

  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  # Staccato benches runtime init and exit as well
  let start = wtime_msec()
  tp = Threadpool.new(numThreads = nthreads)
  discard sync tp.spawn matmul(A, B, C, n, n, n, add = false)
  tp.shutdown()
  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss  # RSS growth during the benchmark
  flt = ru.ru_minflt - flt  # minor page faults during the benchmark

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: Matrix Multiplication (cache oblivious)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", stop - start
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "Input: ", n
  echo "Error: ", check(A, B, C, n)
  echo "--------------------------------------------------------------------------"

  delete A
  delete B
  delete C
  quit 0

View File

@ -0,0 +1,181 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
#
# Original code licenses
# ------------------------------------------------------------------------------------------------
#
# /**********************************************************************************************/
# /* This program is part of the Barcelona OpenMP Tasks Suite */
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
# /* */
# /* This program is free software; you can redistribute it and/or modify */
# /* it under the terms of the GNU General Public License as published by */
# /* the Free Software Foundation; either version 2 of the License, or */
# /* (at your option) any later version. */
# /* */
# /* This program is distributed in the hope that it will be useful, */
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
# /* GNU General Public License for more details. */
# /* */
# /* You should have received a copy of the GNU General Public License */
# /* along with this program; if not, write to the Free Software */
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
# /**********************************************************************************************/
#
# /*
# * Original code from the Cilk project (by Keith Randall)
# *
# * Copyright (c) 2000 Massachusetts Institute of Technology
# * Copyright (c) 2000 Matteo Frigo
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils],
std/threadpool,
# bench
../wtime
# This deadlocks :/
# Nim helpers
# -------------------------------------------------
# Stack / heap allocation helpers.
# Fixed: `csize` is a deprecated alias; use `int` like the threadpool
# variant of this benchmark does (see the sibling nqueens file).
when defined(windows):
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: int): pointer {.header: "<alloca.h>".}

template alloca*(T: typedesc): ptr T =
  ## Stack-allocate storage for one `T` in the caller's frame.
  cast[ptr T](alloca(sizeof(T)))

template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocate storage for `len` elements of `T` in the caller's frame.
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))

proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocate `len` elements of `T` (uninitialized, C malloc).
  cast[type result](c_malloc(csize_t len*sizeof(T)))

proc tp_free*[T: ptr](p: T) {.inline.} =
  ## Release memory obtained from `tp_alloc`.
  c_free(p)
# We assume that Nim zeroMem vs C memset
# and Nim copyMem vs C memcpy have no difference
# Nim does have extra checks to handle GC-ed types
# but they should be eliminated by the Nim compiler.
# -------------------------------------------------
type CharArray = ptr UncheckedArray[char]

# First complete solution found by the serial solver (nil until then).
var example_solution: ptr UncheckedArray[char]

func isValid(n: int32, a: CharArray): bool =
  ## `a` holds `n` queen row-positions, one per column.
  ## Returns true when no pair of queens shares a row or a diagonal.
  result = true
  for col in 0'i32 ..< n:
    let row = cast[int32](a[col])
    for other in col+1 ..< n:
      let otherRow = cast[int32](a[other])
      let dist = other - col
      if otherRow == row or otherRow == row - dist or otherRow == row + dist:
        return false
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
# Serial nqueens
# Counts all complete placements reachable from the partial board a[0..<j].
if n == j:
# Good solution count it
# Keep the first full solution for display; never freed in the visible code.
if example_solution.isNil:
example_solution = tp_alloc(char, n)
copyMem(example_solution, a, n * sizeof(char))
return 1
# Try each possible position for queen `j`
for i in 0 ..< n:
a[j] = cast[char](i)
if isValid(j+1, a):
result += nqueens_ser(n, j+1, a)
proc nqueens_par(n, j: int32, a: CharArray): int32 =
if n == j:
# Good solution, count it
return 1
# One pending Flowvar per candidate column; zeroMem marks unused slots as nil.
var localCounts = alloca(Flowvar[int32], n)
zeroMem(localCounts, n * sizeof(Flowvar[int32]))
# Try each position for queen `j`
for i in 0 ..< n:
# Copy the partial board onto this frame's stack so each child owns its input.
# NOTE(review): `b` is stack memory read by a spawned task — safe only because
# this frame blocks on `^` below before returning; confirm.
var b = alloca(char, j+1)
copyMem(b, a, j * sizeof(char))
b[j] = cast[char](i)
if isValid(j+1, b):
localCounts[i] = spawn nqueens_par(n, j+1, b)
# Reduction: block on each spawned child (std/threadpool `^`) and sum counts.
for i in 0 ..< n:
if not localCounts[i].isNil():
result += ^localCounts[i]
# Known number of solutions for the N-queens problem on an NxN board, N in 1..16.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  10,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Checks `res` against the known solution count for an n×n board.
  ## Prints a diagnostic if the result is unverifiable or wrong.
  # Fixed: only the upper bound was checked; n <= 0 indexed solutions[n-1]
  # out of bounds (IndexDefect).
  if n notin 1'i32 .. solutions.len.int32:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return
  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
# CLI driver: runs the parallel solver, verifies the count, prints timing.
proc main() =
if paramCount() != 1:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <n: number of queens on a nxn board>"
quit 0
let n = paramStr(1).parseInt.int32
if n notin 1 .. solutions.len:
echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
quit 1
# The root board lives on this stack frame; children copy it (see nqueens_par).
let start = wtime_msec()
let count = nqueens_par(n, 0, alloca(char, n))
let stop = wtime_msec()
verifyQueens(n, count)
if not example_solution.isNil:
stdout.write("Example solution: ")
for i in 0 ..< n:
c_printf("%2d ", example_solution[i])
stdout.write('\n')
echo &"Elapsed wall time: {stop-start:2.4f} ms"
main()

View File

@ -0,0 +1,231 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
#
# Original code licenses
# ------------------------------------------------------------------------------------------------
#
# /**********************************************************************************************/
# /* This program is part of the Barcelona OpenMP Tasks Suite */
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
# /* */
# /* This program is free software; you can redistribute it and/or modify */
# /* it under the terms of the GNU General Public License as published by */
# /* the Free Software Foundation; either version 2 of the License, or */
# /* (at your option) any later version. */
# /* */
# /* This program is distributed in the hope that it will be useful, */
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
# /* GNU General Public License for more details. */
# /* */
# /* You should have received a copy of the GNU General Public License */
# /* along with this program; if not, write to the Free Software */
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
# /**********************************************************************************************/
#
# /*
# * Original code from the Cilk project (by Keith Randall)
# *
# * Copyright (c) 2000 Massachusetts Institute of Technology
# * Copyright (c) 2000 Matteo Frigo
# */
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
# library
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Nim helpers
# -------------------------------------------------
# Stack / heap allocation helpers.
when defined(windows):
proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
proc alloca(size: int): pointer {.header: "<alloca.h>".}
# Stack-allocate storage for one `T` in the caller's frame.
template alloca*(T: typedesc): ptr T =
cast[ptr T](alloca(sizeof(T)))
# Stack-allocate storage for `len` elements of `T` in the caller's frame.
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
# Heap-allocate `len` uninitialized elements of `T`.
# -d:TP_useNimAlloc switches between Nim's shared allocator and C malloc.
proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
when defined(TP_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
# Release memory obtained from `tp_alloc` (must match the allocator chosen above).
proc tp_free*[T: ptr](p: T) {.inline.} =
when defined(TP_useNimAlloc):
freeShared(p)
else:
c_free(p)
# We assume that Nim zeroMem vs C memset
# and Nim copyMem vs C memcpy have no difference
# Nim does have extra checks to handle GC-ed types
# but they should be eliminated by the Nim compiler.
# -------------------------------------------------
# Flat array of queen positions, one char (row index) per column.
type CharArray = ptr UncheckedArray[char]
# Global threadpool handle, created in main().
var tp: Threadpool
# First complete solution found by nqueens_ser (nil until then).
var example_solution: ptr UncheckedArray[char]
func isValid(n: int32, a: CharArray): bool =
  ## `a` holds `n` queen positions (one row index per column).
  ## Returns true when no two queens attack each other.
  result = true
  for col in 0'i32 ..< n:
    let row = cast[int32](a[col])
    for rhs in col+1 ..< n:
      let rhsRow = cast[int32](a[rhs])
      let gap = rhs - col
      # Same row, or same anti-/main-diagonal.
      if rhsRow == row or rhsRow == row - gap or rhsRow == row + gap:
        return false
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
# Serial nqueens
# Counts all complete placements reachable from the partial board a[0..<j].
if n == j:
# Good solution count it
# Keep the first full solution for display; never freed in the visible code.
if example_solution.isNil:
example_solution = tp_alloc(char, n)
copyMem(example_solution, a, n * sizeof(char))
return 1
# Try each possible position for queen `j`
for i in 0 ..< n:
a[j] = cast[char](i)
if isValid(j+1, a):
result += nqueens_ser(n, j+1, a)
# Parallel variant: one spawned task per valid candidate column.
proc nqueens_par(n, j: int32, a: CharArray): int32 {.gcsafe.} =
if n == j:
# Good solution, count it
return 1
# One pending Flowvar per candidate column; zeroMem marks unused slots unspawned.
var localCounts = alloca(Flowvar[int32], n)
zeroMem(localCounts, n * sizeof(Flowvar[int32]))
# Try each position for queen `j`
for i in 0 ..< n:
# Copy the partial board onto this frame's stack so each child owns its input.
# NOTE(review): stack memory is read by spawned tasks — safe only because
# this frame blocks on sync() below before returning; confirm.
var b = alloca(char, j+1)
copyMem(b, a, j * sizeof(char))
b[j] = cast[char](i)
if isValid(j+1, b):
localCounts[i] = tp.spawn nqueens_par(n, j+1, b)
# Reduction: sync each spawned child and sum its count.
for i in 0 ..< n:
if localCounts[i].isSpawned():
result += sync(localCounts[i])
# Known number of solutions for the N-queens problem on an NxN board, N in 1..16.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  10,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Checks `res` against the known solution count for an n×n board.
  ## Prints a diagnostic if the result is unverifiable or wrong.
  # Fixed: only the upper bound was checked; n <= 0 indexed solutions[n-1]
  # out of bounds (IndexDefect).
  if n notin 1'i32 .. solutions.len.int32:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return
  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
proc main() =
  ## Benchmark driver: parses N from argv and thread count from
  ## CTT_NUM_THREADS (or core count), solves N-queens on the threadpool,
  ## verifies the result and reports timing + resource usage.
  var
    n = 11'i32    # board size / number of queens (default 11)
    nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()
  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <N:{n}>"
    echo &"Running with default config N = {n}\n"
  if paramCount() >= 1:
    n = paramStr(1).parseInt.int32
  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1
  when not defined(windows):
    # getrusage / wtime are POSIX-only in this bench.
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt
  tp = Threadpool.new(numThreads = nthreads)
  when not defined(windows):
    let start = wtime_msec()
  let count = nqueens_par(n, 0, alloca(char, n))
  when not defined(windows):
    let stop = wtime_msec()
  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt
  tp.shutdown()
  verifyQueens(n, count)
  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')
  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: N-queens"
  echo "Threads: ", nthreads
  when not defined(windows):
    # Fixed label: wtime_msec() returns milliseconds, label previously said "us".
    echo "Time(ms) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "Problem size: ", n,"x",n, " board with ",n, " queens"
  echo "Solutions found: ", count
  echo "--------------------------------------------------------------------------"
  quit 0
main()

View File

@ -0,0 +1,24 @@
type
  # Opaque POSIX `struct timeval`; fields are never accessed from Nim,
  # it only exists so that `Rusage` has the right layout.
  Timeval {.importc: "timeval", header: "<sys/time.h>", bycopy.} = object
  Rusage* {.importc: "struct rusage", header: "<sys/resource.h>", bycopy.} = object
    ## Subset of the POSIX resource-usage structure.
    ## NOTE(review): on 64-bit Linux these fields are C `long`; int32 can
    ## truncate very large values — consider `clong`. Kept as-is to preserve
    ## the public field types.
    ru_utime {.importc.}: Timeval
    ru_stime {.importc.}: Timeval
    ru_maxrss* {.importc.}: int32 # Maximum resident set size
    # ... (intervening struct rusage fields omitted; importc resolves offsets via the C header)
    ru_minflt* {.importc.}: int32 # page reclaims (soft page faults)

  RusageWho* {.size: sizeof(cint).} = enum
    RusageChildren = -1
    RusageSelf = 0
    RusageThread = 1

when defined(debug):
  # Sanity-check that the enum values match the C constants.
  # Fixed: header strings were missing the closing '>' and the asserts used
  # assignment `=` instead of comparison `==` (this block did not compile).
  var H_RUSAGE_SELF{.importc, header:"<sys/resource.h>".}: cint
  var H_RUSAGE_CHILDREN{.importc, header:"<sys/resource.h>".}: cint
  var H_RUSAGE_THREAD{.importc, header:"<sys/resource.h>".}: cint
  assert H_RUSAGE_SELF == ord(RusageSelf)
  assert H_RUSAGE_CHILDREN == ord(RusageChildren)
  assert H_RUSAGE_THREAD == ord(RusageThread)

proc getrusage*(who: RusageWho, usage: var Rusage) {.importc, header: "<sys/resource.h>".}
  ## Fills `usage` with resource statistics for `who` (POSIX getrusage(2)).

View File

@ -0,0 +1,7 @@
# Simple single-producer multiple-consumers benchmarks

## SPC — A Simple Producer-Consumer benchmark

A single worker produces n tasks, each running for t microseconds.
This benchmark allows us to test how many concurrent consumers a single
producer can sustain.

View File

@ -0,0 +1,146 @@
import
# STD lib
system/ansi_c, std/[os, strutils, cpuinfo, strformat, math],
# Library
../../threadpool,
# bench
../wtime, ../resources
# Benchmark configuration, set once in main() before any task is spawned.
var NumTasksTotal: int32
var TaskGranularity: int32 # microsecond
var PollInterval: float64 # microsecond
var tp: Threadpool
# Per-thread next load-balancing deadline (only used by the commented-out polling).
var global_poll_elapsed {.threadvar.}: float64
# Busy-work unit: iterative fib(30); the result is deliberately discarded.
template dummy_cpt(): untyped =
# Dummy computation
# Calculate fib(30) iteratively
var
fib = 0
f2 = 0
f1 = 1
for i in 2 .. 30:
fib = f1 + f2
f2 = f1
f1 = fib
# Task body: burns CPU for `usec` microseconds (wall-clock measured).
proc spc_consume(usec: int32) =
var pollElapsed = 0'f64
let start = wtime_usec()
let stop = usec.float64
global_poll_elapsed = PollInterval
while true:
# Time spent polling (if re-enabled below) is excluded from the budget.
var elapsed = wtime_usec() - start
elapsed = elapsed - pollElapsed
if elapsed >= stop:
break
dummy_cpt()
# if elapsed >= global_poll_elapsed:
# let pollStart = wtime_usec()
# loadBalance(Weave)
# pollElapsed += wtime_usec() - pollStart
# global_poll_elapsed += PollInterval
# c_printf("Elapsed: %.2lfus\n", elapsed)
# Same busy-wait without any polling bookkeeping; used by the sequential baseline.
proc spc_consume_nopoll(usec: int32) =
let start = wtime_usec()
let stop = usec.float64
while true:
var elapsed = wtime_usec() - start
if elapsed >= stop:
break
dummy_cpt()
# c_printf("Elapsed: %.2lfus\n", elapsed)
# Single producer: spawns `n` consumer tasks on the global threadpool.
proc spc_produce(n: int32) =
for i in 0 ..< n:
tp.spawn spc_consume(TaskGranularity)
# Sequential baseline: runs the same workload inline.
proc spc_produce_seq(n: int32) =
for i in 0 ..< n:
spc_consume_nopoll(TaskGranularity)
# Benchmark driver: parses CLI config, produces NumTasksTotal tasks and
# waits for completion, reporting timing and resource usage.
proc main() =
NumTasksTotal = 1000000
TaskGranularity = 10
PollInterval = 10
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
&"<task granularity (us): {TaskGranularity}> " &
&"[polling interval (us): task granularity]"
echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
if paramCount() >= 1:
NumTasksTotal = paramStr(1).parseInt.int32
if paramCount() >= 2:
TaskGranularity = paramStr(2). parseInt.int32
# Polling interval defaults to the task granularity when not given explicitly.
if paramCount() == 3:
PollInterval = paramStr(3).parseInt.float64
else:
PollInterval = TaskGranularity.float64
if paramCount() > 3:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
&"<task granularity (us): {TaskGranularity}> " &
&"[polling interval (us): task granularity]"
quit 1
# CTT_NUM_THREADS env var overrides the detected core count.
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
tp = Threadpool.new(numThreads = nthreads)
# measure overhead during tasking
# Snapshot RSS / soft page-fault counters so the deltas isolate the run itself.
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
let start = wtime_msec()
# spc_produce_seq(NumTasksTotal)
spc_produce(NumTasksTotal)
# Block until every spawned task has completed.
tp.syncAll()
let stop = wtime_msec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
tp.shutdown()
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: SPC (Single task Producer - multi Consumer)"
echo "Threads: ", nthreads
echo "Time(ms) ", round(stop - start, 3)
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
echo "# of tasks: ", NumTasksTotal
echo "Task granularity (us): ", TaskGranularity
echo "Polling / manual load balancing interval (us): ", PollInterval
echo "--------------------------------------------------------------------------"
quit 0
main()

View File

@ -0,0 +1,53 @@
/* Wall-clock timing helpers for the benchmarks (gettimeofday-based). */
#ifndef WTIME_H
#define WTIME_H
#include <sys/time.h>
#include <time.h>
// Number of seconds since the Epoch
static inline double Wtime_sec(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec / 1e6;
}
// Number of milliseconds since the Epoch
static inline double Wtime_msec(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1e3 + tv.tv_usec / 1e3;
}
// Number of microseconds since the Epoch
static inline double Wtime_usec(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1e6 + tv.tv_usec;
}
// Read time stamp counter on x86
// NOTE(review): x86-only inline asm; will not build on other architectures.
static inline unsigned long long readtsc(void)
{
unsigned int lo, hi;
// RDTSC copies contents of 64-bit TSC into EDX:EAX
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return (unsigned long long)hi << 32 | lo;
}
/* __LINE__ token-pasting so several WTIME() blocks can coexist in one scope. */
#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)
// Convenience macro for time measurement
/* Runs the following statement/block once and prints its elapsed wall time. */
#define WTIME(unit) \
double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
int WTIME_unique_var(_i_) = 0; \
for (; WTIME_unique_var(_i_) == 0 || \
(printf("Elapsed wall time: %.2lf "#unit"\n", \
Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
WTIME_unique_var(_i_)++)
#endif // WTIME_H

View File

@ -0,0 +1,10 @@
import std/[os, strutils]

# Directory holding this source file; the C timing helpers live next to it.
# (Consistent `cSourcesPath` spelling — the previous `csourcesPath` relied on
# Nim's partial case-insensitivity.)
const
  cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0]
  cHeader = cSourcesPath / "wtime.h"

{.passC: "-I" & cSourcesPath.}

proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.}
  ## Microseconds since the Unix epoch (wall clock).
proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.}
  ## Milliseconds since the Unix epoch (wall clock).

View File

@ -0,0 +1,186 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/atomics,
../primitives/futexes
# We implement 2 datastructures to put threads to sleep:
# 1. An event notifier to put an awaiting thread to sleep when the task they require is worked on by another thread
# 2. An eventcount to put an idle thread to sleep
{.push raises:[], checks:off.}
# ############################################################
#
# Event Notifier
#
# ############################################################
# Formal verification at: https://github.com/mratsim/weave/blob/7682784/formal_verification/event_notifiers.tla#L76-L109
type
EventNotifier* = object
## Multi Producers, Single Consumer event notification
## This can be seen as a wait-free condition variable for producers
## that avoids them spending time in expensive kernel land due to mutexes.
# ---- Consumer specific ----
ticket{.align: 64.}: uint8 # A ticket for the consumer to sleep in a phase
# ---- Contention ---- no real need for padding as cache line should be reloaded in case of contention anyway
futex: Futex # A Futex (atomic int32 that can put thread to sleep)
phase: Atomic[uint8] # A binary timestamp, toggles between 0 and 1 (but there is no atomic "not")
signaled: Atomic[bool] # Signaling condition
# Resets all state; must be called before first use.
func initialize*(en: var EventNotifier) {.inline.} =
en.futex.initialize()
en.ticket = 0
en.phase.store(0, moRelaxed)
en.signaled.store(false, moRelaxed)
func `=destroy`*(en: var EventNotifier) {.inline.} =
en.futex.teardown()
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
func prepareToPark*(en: var EventNotifier) {.inline.} =
## The consumer intends to sleep soon.
## This must be called before the formal notification
## via a channel.
# Capture the current phase; a producer toggling it between here and park()
# means a notification happened and park() must not block.
if not en.signaled.load(moRelaxed):
en.ticket = en.phase.load(moRelaxed)
proc park*(en: var EventNotifier) {.inline.} =
## Wait until we are signaled of an event
## Thread is parked and does not consume CPU resources
## This may wakeup spuriously.
if not en.signaled.load(moRelaxed):
if en.ticket == en.phase.load(moRelaxed):
en.futex.wait(0)
# Consume the notification and rearm the futex for the next cycle.
en.signaled.store(false, moRelaxed)
en.futex.initialize()
proc notify*(en: var EventNotifier) {.inline.} =
## Signal a thread that it can be unparked
if en.signaled.load(moRelaxed):
# Another producer is signaling
return
en.signaled.store(true, moRelease)
discard en.phase.fetchXor(1, moRelaxed)
en.futex.store(1, moRelease)
en.futex.wake()
# ############################################################
#
# Eventcount
#
# ############################################################
type
Eventcount* = object
## The lock-free equivalent of a condition variable.
##
## Usage, if a thread needs to be parked until a condition is true
## and signaled by another thread:
## ```Nim
## if condition:
## return
##
## while true:
## ticket = ec.sleepy()
## if condition:
## ec.cancelSleep()
## break
## else:
## ec.sleep()
## ```
state: Atomic[uint32]
# State is actually the equivalent of a bitfield
# type State = object
# waiters {.bitsize: 16.}: uint16
# when sizeof(pointer) == 4:
# epoch {.bitsize: 16.}: uint16
# else:
# epoch {.bitsize: 48.}: uint48
#
# of size, the native integer size
# and so can be used for atomic operations on 32-bit or 64-bit platforms.
# but there is no native fetchAdd for bitfield
futex: Futex
# Technically we could use the futex as the state.
# When you wait on a Futex, it waits only if the value of the futex
# matches with a reference value.
# But our reference value will be the epoch of notifications
# and it is non-trivial to zero-out the waiters bits.
# - One way could be to split a 64-bit number in 2
# and cast the epoch part to Futex but that would only work on 64-bit CPU.
# - Another more hacky way would be to pad with a zero-out uint16 before and after the Futex
# and depending on big or little endian provide a shifted address as Futex.
ParkingTicket* = object
epoch: uint32
const # bitfield
# Low 16 bits are waiters, up to 2¹⁶ = 65536 threads are supported
# High 16 or 48 bits are epochs.
# We can deal with the ABA problem o:
# - up to 65536 wake requests on 32-bit
# - up to 281 474 976 710 656 wake requests on 64-bit
# Epoch rolling over to 0 are not a problem, they won't change the low 16 bits
kEpochShift = 16
kAddEpoch = 1 shl kEpochShift
kWaiterMask = kAddEpoch - 1
kEpochMask {.used.} = not kWaiterMask
kAddWaiter = 1
kSubWaiter = 1
func initialize*(ec: var EventCount) {.inline.} =
ec.state.store(0, moRelaxed)
ec.futex.initialize()
func `=destroy`*(ec: var EventCount) {.inline.} =
ec.futex.teardown()
proc sleepy*(ec: var Eventcount): ParkingTicket {.inline.} =
## To be called before checking if the condition to not sleep is met.
## Returns a ticket to be used when committing to sleep
# Registers as a waiter (low bits) and snapshots the epoch (high bits).
let prevState = ec.state.fetchAdd(kAddWaiter, moAcquireRelease)
result.epoch = prevState shr kEpochShift
proc sleep*(ec: var Eventcount, ticket: ParkingTicket) {.inline.} =
## Put a thread to sleep until notified.
## If the ticket becomes invalid (a notfication has been received)
## by the time sleep is called, the thread won't enter sleep
# NOTE(review): the futex value is initialized to 0 and never set to the
# epoch here; wait(ticket.epoch) presumably compares against the futex's
# internal value — confirm the Futex.wait contract.
while ec.state.load(moAcquire) shr kEpochShift == ticket.epoch:
ec.futex.wait(ticket.epoch) # We don't use the futex internal value
# Deregister as a waiter.
let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed)
proc cancelSleep*(ec: var Eventcount) {.inline.} =
## Cancel a sleep that was scheduled.
let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed)
proc wake*(ec: var EventCount) {.inline.} =
## Wake a thread if at least 1 is parked
# Bump the epoch first so late sleepers see their ticket invalidated.
let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease)
if (prev and kWaiterMask) != 0:
ec.futex.wake()
proc wakeAll*(ec: var EventCount) {.inline.} =
## Wake all threads if at least 1 is parked
let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease)
if (prev and kWaiterMask) != 0:
ec.futex.wakeAll()
proc getNumWaiters*(ec: var EventCount): uint32 {.inline.} =
## Get the number of parked threads
ec.state.load(moRelaxed) and kWaiterMask
{.pop.} # {.push raises:[], checks:off.}

View File

@ -0,0 +1,286 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# This file implements a single-producer multi-consumer
# task queue for work-stealing schedulers.
#
# Papers:
# - Dynamic Circular Work-Stealing Deque
# David Chase, Yossi Lev, 1993
# https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf
#
# - Non-Blocking Steal-Half Work Queues
# Danny Hendler, Nir Shavit, 2002
# https://www.cs.bgu.ac.il/~hendlerd/papers/p280-hendler.pdf
#
# - Correct and Efficient Work-Stealing for Weak Memory Models
# Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013
# https://fzn.fr/readings/ppopp13.pdf
#
# The task queue implements the following push, pop, steal, stealHalf
#
# front back
# ---------------------------------
# steal() <- | | | | <- push()
# stealHalf() <- | Task 0 | Task 1 | Task 2 | -> pop()
# any thread | | | | owner-only
# ---------------------------------
#
# To reduce contention, stealing is done on the opposite end from push/pop
# so that there is a race only for the very last task(s).
{.push raises: [], checks: off.} # No exceptions in a multithreading datastructure
import
std/atomics,
../instrumentation,
../../allocs,
./tasks_flowvars
type
Buf = object
## Backend buffer of a Taskqueue
## `capacity` MUST be a power of 2
prevRetired: ptr Buf # intrusive linked list. Used for garbage collection
capacity: int
rawBuffer: UncheckedArray[Atomic[ptr Task]] # ring buffer, indexed modulo capacity
Taskqueue* = object
## This implements a lock-free, growable, work-stealing task queue.
## The owning thread enqueues and dequeues at the back
## Foreign threads steal at the front.
##
## There is no memory reclamation scheme for simplicity
# front on its own cache line to avoid false sharing with the producer fields.
front {.align: 64.}: Atomic[int] # Consumers - steal/stealHalf
back: Atomic[int] # Producer - push/pop
buf: Atomic[ptr Buf]
garbage: ptr Buf # retired buffers, freed when the owner sees an empty queue
proc peek*(tq: var Taskqueue): int =
  ## Best-effort count of items currently pending in the queue.
  ##
  ## In this SPMC setting:
  ## - called by the producer, the true number may be lower, since
  ##   consumers remove items concurrently;
  ## - called by a consumer, the estimate is undefined, as both the
  ##   producer and other consumers mutate the queue concurrently.
  ##
  ## A producer that reads 0 can trust the Chase-Lev deque is empty.
  ##
  ## Non-locking: two relaxed atomic loads.
  let pending = tq.back.load(moRelaxed) - tq.front.load(moRelaxed)
  result = max(0, pending)
func isPowerOfTwo(n: int): bool {.used, inline.} =
  ## True iff `n` is non-zero and has a single bit set.
  (n != 0) and ((n and (n - 1)) == 0)
# Allocates a ring buffer for `capacity` task pointers (capacity must be 2^k).
proc newBuf(capacity: int): ptr Buf =
# Tasks have a destructor
# static:
# doAssert supportsCopyMem(T), $T & " must be a (POD) plain-old-data type: no seq, string, ref."
preCondition: capacity.isPowerOfTwo()
# NOTE(review): header sizing uses 1 pointer + 2 ints although Buf declares
# one pointer + one int before rawBuffer — presumably alignment slack; confirm.
result = allocHeapUnchecked(Buf, 1*sizeof(pointer) + 2*sizeof(int) + sizeof(pointer)*capacity)
result.prevRetired = nil
result.capacity = capacity
result.rawBuffer.addr.zeroMem(sizeof(pointer)*capacity)
# Ring-buffer store; index wraps via bitmask (capacity is a power of 2).
proc `[]=`(buf: var Buf, index: int, item: ptr Task) {.inline.} =
buf.rawBuffer[index and (buf.capacity-1)].store(item, moRelaxed)
# Ring-buffer load; index wraps via bitmask.
proc `[]`(buf: var Buf, index: int): ptr Task {.inline.} =
result = buf.rawBuffer[index and (buf.capacity-1)].load(moRelaxed)
proc grow(tq: var Taskqueue, buf: var ptr Buf, newCapacity, front, back: int) {.inline.} =
## Double the buffer size
## back is the last item index
##
## To handle race-conditions the current "front", "back" and "buf"
## have to be saved before calling this procedure.
## It reads and writes the "tq.buf" and "tq.garbage"
# Read -> Copy -> Update
var tmp = newBuf(newCapacity)
for i in front ..< back:
tmp[][i] = buf[][i]
# Retire the old buffer instead of freeing: thieves may still hold a pointer.
buf.prevRetired = tq.garbage
tq.garbage = buf
# publish globally
tq.buf.store(tmp, moRelaxed)
# publish locally
swap(buf, tmp)
# Frees all retired buffers. Only safe when no thief can still reference them
# (callers check for an empty queue first).
proc garbageCollect(tq: var Taskqueue) {.inline.} =
var node = tq.garbage
while node != nil:
let tmp = node.prevRetired
freeHeap(node)
node = tmp
tq.garbage = nil
# Public API
# ---------------------------------------------------
# Zero-initializes the queue and installs its first backing buffer.
proc init*(tq: var Taskqueue, initialCapacity: int) =
zeroMem(tq.addr, tq.sizeof())
tq.buf.store(newBuf(initialCapacity), moRelaxed)
# Releases retired buffers and the live buffer. Not thread-safe.
proc teardown*(tq: var Taskqueue) =
tq.garbageCollect()
freeHeap(tq.buf.load(moRelaxed))
proc push*(tq: var Taskqueue, item: ptr Task) =
## Enqueue an item at the back
## As the task queue takes ownership of it. The item must not be used afterwards.
## This is intended for the producer only.
let # Handle race conditions
b = tq.back.load(moRelaxed)
f = tq.front.load(moAcquire)
var buf = tq.buf.load(moRelaxed)
if b-f > buf.capacity - 1:
# Full queue
tq.grow(buf, buf.capacity*2, f, b)
if not tq.garbage.isNil and f == b:
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
buf[][b] = item
# Release fence: the task write above must be visible before back is bumped.
fence(moRelease)
tq.back.store(b+1, moRelaxed)
proc pop*(tq: var Taskqueue): ptr Task =
## Dequeue an item at the back. Takes ownership of the item
## This is intended for the producer only.
# Reserve the slot first, then check against front (Chase-Lev pop protocol).
let # Handle race conditions
b = tq.back.load(moRelaxed) - 1
buf = tq.buf.load(moRelaxed)
tq.back.store(b, moRelaxed)
fence(moSequentiallyConsistent)
var f = tq.front.load(moRelaxed)
if f <= b:
# Non-empty queue.
result = buf[][b]
if f == b:
# Single last element in queue.
# Race against thieves: the CAS decides who gets the final task.
if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed):
# Failed race.
result = nil
tq.back.store(b+1, moRelaxed)
if not tq.garbage.isNil:
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
else:
# Empty queue.
result = nil
tq.back.store(b+1, moRelaxed)
if not tq.garbage.isNil:
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
proc steal*(thiefID: uint32, tq: var Taskqueue): ptr Task =
## Dequeue an item at the front. Takes ownership of the item
## This is intended for consumers.
var f = tq.front.load(moAcquire)
fence(moSequentiallyConsistent)
let b = tq.back.load(moAcquire)
result = nil
if f < b:
# Non-empty queue.
let a = tq.buf.load(moConsume)
result = a[][f]
# The CAS commits the theft; losing it means another thief (or pop) won.
if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed):
# Failed race.
return nil
result.thiefID.store(thiefID, moRelease)
proc stealHalfImpl(dst: var Buf, dstBack: int, src: var Taskqueue): int =
## Theft part of stealHalf:
## - updates the victim metadata if successful
## - uncommitted updates to the thief tq whether successful or not
## Returns -1 if dst buffer is too small
## Assumes `dst` buffer is empty (i.e. not ahead-of-time thefts)
while true:
# Try as long as there are something to steal, we are idling anyway.
var f = src.front.load(moAcquire)
fence(moSequentiallyConsistent)
let b = src.back.load(moAcquire)
var n = b-f
n = n - (n shr 1) # Division by 2 rounded up, so if only one task is left, we still get it.
if n <= 0:
return 0
if n > dst.capacity:
return -1
# Non-empty queue.
let sBuf = src.buf.load(moConsume)
# Copy speculatively; the CAS below validates that [f, f+n) was still ours.
for i in 0 ..< n: # Copy LIFO or FIFO?
dst[dstBack+i] = sBuf[][f+i]
if compareExchange(src.front, f, f+n, moSequentiallyConsistent, moRelaxed):
return n
proc stealHalf*(thiefID: uint32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
## Dequeue up to half of the items in the `src` tq, fom the front.
## Return the last of those, or nil if none found
while true:
# Prepare for batch steal
let
bDst = dst.back.load(moRelaxed)
fDst = dst.front.load(moAcquire)
var dBuf = dst.buf.load(moAcquire)
let sBuf = src.buf.load(moAcquire)
if dBuf.capacity < sBuf.capacity:
# We could grow to sBuf/2 since we steal half, but we want to minimize
# churn if we are actually in the process of stealing and the buffers grows.
dst.grow(dBuf, sBuf.capacity, fDst, bDst)
# Steal
let n = dBuf[].stealHalfImpl(bDst, src)
if n == 0:
return nil
if n == -1:
# Oops, victim buffer grew bigger than ours, restart the whole process
continue
# Update metadata
for i in 0 ..< n:
dBuf[][bDst+i].thiefID.store(thiefID, moRelease)
# Commit/publish theft; `last` (the item at the back of the stolen range)
# is handed to the caller for immediate processing.
let last = dBuf[][bDst+n-1]
fence(moSequentiallyConsistent)
if n == 1:
return last
# We have more than one item, so some must go in our queue
# they are already here but inaccessible for other thieves
dst.back.store(bDst+n-1, moRelease) # We assume that queue was empty and so dst.front didn't change
return last

View File

@ -0,0 +1,127 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/atomics,
../instrumentation,
../../allocs,
./backoff
# Tasks have an efficient design so that a single heap allocation
# is required per `spawn`.
# This greatly reduce overhead and potential memory fragmentation for long-running applications.
#
# This is done by tasks:
# - being an intrusive linked lists
# - integrating the channel to send results
#
# Flowvar is the public type created when spawning a task.
# and can be synced to receive the task result.
# Flowvars are also called future interchangeably.
# (The name future is already used for IO scheduling)
type
  Task* = object
    ## A unit of work, allocated once on the heap per `spawn`.
    ## The trailing `data` buffer stores the (type-erased) arguments,
    ## and — for tasks with a future — a self-reference and the result slot.

    # Intrusive metadata
    # ------------------
    parent*: ptr Task        # When a task is awaiting, a thread can quickly prioritize the direct child of a task
    thiefID*: Atomic[uint32] # ID of the worker that stole and run the task. For leapfrogging.

    # Result sync
    # ------------------
    hasFuture*: bool         # Ownership: if a task has a future, the future deallocates it. Otherwise the worker thread does.
    completed*: Atomic[bool] # Set with release semantics once the result has been written (see `readyWith`).
    waiter*: Atomic[ptr EventNotifier] # Presumably used to wake a thread blocked on this task (see ./backoff) — TODO confirm.

    # Execution
    # ------------------
    fn*: proc (param: pointer) {.nimcall, gcsafe.} # Type-erased task body; receives `data` as `param`.
    # destroy*: proc (param: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
    data*{.align:sizeof(int).}: UncheckedArray[byte] # Inline payload: | ptr Task | result | arg₀ | ... | for futures.

  Flowvar*[T] = object
    ## Public handle to the eventual result of a spawned task (a future).
    task: ptr Task

const SentinelThief* = 0xFACADE'u32
  ## `thiefID` value meaning "not stolen yet".
proc new*(
       T: typedesc[Task],
       parent: ptr Task,
       fn: proc (param: pointer) {.nimcall, gcsafe.}): ptr Task {.inline.} =
  ## Allocate a parameterless task in a single heap allocation.
  ## `thiefID` starts at SentinelThief (not stolen), `completed` at false.
  const size = sizeof(T)

  result = allocHeapUnchecked(T, size)
  result.parent = parent
  result.thiefID.store(SentinelThief, moRelaxed)
  result.completed.store(false, moRelaxed)
  result.waiter.store(nil, moRelaxed)
  result.fn = fn
proc new*(
       T: typedesc[Task],
       parent: ptr Task,
       fn: proc (param: pointer) {.nimcall, gcsafe.},
       params: auto): ptr Task {.inline.} =
  ## Allocate a task together with inline storage for `params`
  ## in a single heap allocation; `params` is copied into the
  ## trailing `data` buffer.
  const size = sizeof(T) + # size without Unchecked
               sizeof(params)

  result = allocHeapUnchecked(T, size)
  result.parent = parent
  result.thiefID.store(SentinelThief, moRelaxed)
  result.completed.store(false, moRelaxed)
  result.waiter.store(nil, moRelaxed)
  result.fn = fn
  # Copy the argument tuple into the inline payload.
  cast[ptr[type params]](result.data)[] = params
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} =
  ## Create the future associated with `task`.
  ## The future takes over responsibility for deallocating the task
  ## (see `Task.hasFuture`).
  result.task = task
  result.task.hasFuture = true

  # Tasks with a future store a self-reference at the start of `data`
  # so that `readyWith` can be called from within the constructed
  # `proc async_fn(param: pointer) {.nimcall.}`,
  # which can only access `data`.
  cast[ptr ptr Task](task.data.addr)[] = task
proc cleanup*(fv: var Flowvar) {.inline.} =
  ## Free the task owned by this future and reset the handle,
  ## so `isSpawned` returns false afterwards.
  fv.task.freeHeap()
  fv.task = nil
func isSpawned*(fv: Flowvar): bool {.inline.} =
  ## True when this Flowvar is backed by a spawned task.
  ## Recursive algorithms may or may not spawn depending on a condition;
  ## an un-spawned Flowvar then behaves like an empty Option/Maybe.
  fv.task != nil
func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
  ## Returns true if the result of a Flowvar is ready.
  ## In that case `sync` will not block.
  ## Otherwise the current thread will block to help on all the pending tasks
  ## until the Flowvar is ready.
  # Acquire pairs with the release store in `readyWith`.
  fv.task.completed.load(moAcquire)
func readyWith*[T](task: ptr Task, childResult: T) {.inline.} =
  ## Send the Flowvar result from the child thread processing the task
  ## to its parent thread.
  precondition: not task.completed.load(moAcquire)
  # `data` layout is | ptr Task | result | args... |; write the result slot.
  cast[ptr (ptr Task, T)](task.data.addr)[1] = childResult
  # Release store: the result write above must be visible to any thread
  # that observes `completed == true` (acquire load in `isReady`/sync).
  task.completed.store(true, moRelease)
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
  ## Blocks the current thread until the flowvar is available
  ## and returned.
  ## The thread is not idle and will complete pending tasks.
  # `completeFuture` is provided by the threadpool module importing this file.
  mixin completeFuture
  completeFuture(fv, result)
  cleanup(fv)

View File

@ -0,0 +1,166 @@
# Partitioners
For data parallelism (parallel for loops) there are 2 main scheduling strategies:
- static scheduling, when work is regular (for example adding 2 matrices).
In that case, splitting the loop
into same-sized chunk provides perfect speedup, with no synchronization overhead.
(Assuming threads have the same performance and no parasite load)
- dynamic scheduling, when work is irregular (for example zero-ing buffers of different length).
Partitioners help implementing static scheduling.
Static scheduling
=================
Usually static scheduling is problematic because the threshold below which running in parallel is not worthwhile
is hardware-, data-layout- and function-dependent, see https://github.com/zy97140/omp-benchmark-for-pytorch
| CPU Model | Sockets | Cores/Socket | Frequency |
|------------------------------------|---------|--------------|-----------|
| Intel(R) Xeon(R) CPU E5-2699 v4 | 2 | 22 | 2.20GHz |
| Intel(R) Xeon(R) Platinum 8180 CPU | 2 | 28 | 2.50GHz |
| Intel(R) Core(TM) i7-5960X CPU | 1 | 8 | 3.00GHz |
| contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU |
|---------------|---------------------------|------------------------|--------------|
| copy | 80k | 20k | 8k |
| add | 80k | 20k | 8k |
| div | 50k | 10k | 2k |
| exp | 1k | 1k | 1k |
| sin | 1k | 1k | 1k |
| sum | 1k | 1k | 1k |
| prod | 1k | 1k | 1k |
| non-contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU |
|-------------------|---------------------------|------------------------|--------------|
| copy | 20k | 8k | 2k |
| add | 20k | 8k | 2k |
| div | 10k | 8k | 1k |
| exp | 1k | 1k | 1k |
| sin | 2k | 2k | 1k |
| sum | 1k | 1k | 1k |
| prod | 1k | 1k | 1k |
Static partitioner
==================
```Nim
iterator balancedChunks*(start, stopEx, numChunks: int): tuple[chunkID, start, stopEx: int] =
## Balanced chunking algorithm for a range [start, stopEx)
## This splits a range into min(stopEx-start, numChunks) balanced regions
## and returns a tuple (chunkID, offset, length)
# Rationale
# The following simple chunking scheme can lead to severe load imbalance
#
# let chunk_offset = chunk_size * thread_id
# let chunk_size = if thread_id < nb_chunks - 1: chunk_size
# else: omp_size - chunk_offset
#
# For example dividing 40 items on 12 threads will lead to
# a base_chunk_size of 40/12 = 3 so work on the first 11 threads
# will be 3 * 11 = 33, and the remainder 7 on the last thread.
#
# Instead of dividing 40 work items on 12 cores into:
# 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
# the following scheme will divide into
# 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
#
# This is compliant with OpenMP spec (page 60)
# http://www.openmp.org/mp-documents/openmp-4.5.pdf
# "When no chunk_size is specified, the iteration space is divided into chunks
# that are approximately equal in size, and at most one chunk is distributed to
# each thread. The size of the chunks is unspecified in this case."
# ---> chunks are the same ±1
let
totalIters = stopEx - start
baseChunkSize = totalIters div numChunks
cutoff = totalIters mod numChunks
for chunkID in 0 ..< min(numChunks, totalIters):
if chunkID < cutoff:
let offset = start + ((baseChunkSize + 1) * chunkID)
let chunkSize = baseChunkSize + 1
yield (chunkID, offset, offset+chunkSize)
else:
let offset = start + (baseChunkSize * chunkID) + cutoff
let chunkSize = baseChunkSize
yield (chunkID, offset, offset+chunkSize)
when isMainModule:
for chunk in balancedChunks(start = 0, stopEx = 40, numChunks = 12):
echo chunk
```
Dynamic scheduling
==================
Dynamic schedulers decide at runtime whether to split a range for processing by multiple threads at runtime.
Unfortunately most (all?) of those in production do not or cannot take into account
the actual functions being called within a `parallel_for` section,
and might split into too fine-grained chunks or into too coarse-grained chunks.
Alternatively they might ask the programmer for a threshold below which not to split.
As the programmer has no way to know if their code will run on a Raspberry Pi or a powerful workstation, that choice cannot be optimal.
Recent advances in research like "Lazy Binary Splitting" and "Lazy Tree Splitting"
allows dynamic scheduling to fully adapt to the system current load and
the parallel section computational needs without programmer input (grain size or minimal split threshold)
by using backpressure.
Why do we need partitioners for cryptographic workload then?
============================================================
Here are 3 basic cryptographic primitives that are non-trivial to parallelize:
1. batch elliptic-curve addition
2. multi-scalar multiplication (MSM)
3. batch signature verification via multi-pairing
On the other hand, merkle tree hashing for example is a primitive where both static or dynamic scheduling should give perfect speedup.
Let's take the example of batch EC addition.
--------------------------------------------
There is a naive way, via doing a parallel EC sum reduction,
for example for 1M points,
using projective coordinates, each sum costs 12M (field multiplication)
At first level we have 500k sums, then 250k sums, then 125k, ...
The number of sums is ~1M, for a total cost of 12e⁶
```
0 1 2 3 4 5 6 7
+ + + +
+ +
+
```
The fast way uses affine sum for an asymptotic cost of 6M, so 2x faster.
but this requires an inversion, a fixed cost of ~131M (256-bit) to ~96M (384-bit)
whatever number of additions we accumulate
That inversion needs to be amortized into at least 20-25 additions.
Let's take a look at multi-pairing
----------------------------------
Multi-pairing is split into the following steps:
- 1. the Miller-Loop which is embarassing parallel, each thread can compute it on their own.
- 2. reducing the n Miller-Loops into a single 𝔽pₖ point using parallel 𝔽pₖ sum reduction.
- 3. Computing the final exponentiation, a fixed cost whatever the number of Miller Loops we did.
3alt. Alternatively, computing n final exponentiations, and merging them with a 𝔽pₖ product reduction
in that case, step 2 is not needed.
Conclusion
----------
Dynamic scheduling for reduction with variable + fixed costs (independent of chunk size) is tricky.
Furthermore the computations are regular, same workload per range and static scheduling seems like a great fit.
The downside is if a core has a parasite workload or on architecture like ARM big.Little or Alder Lake
with performance and power-saving cores.
Alternatively, we can add dynamic scheduling hints about min and max chunk size so that
chunk size is kept within the optimal range whatever the number of idle threads.

View File

@ -0,0 +1,42 @@
# Random permutations
Work-stealing is more efficient when the thread we steal from is randomized.
If all threads steal in the same order, we increase contention
on the task queues of the first victims.
The randomness quality is not important besides distributing potential contention,
i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
Hence for efficiency, so that a thread can go to sleep faster, we want to
reduce calls to the RNG, because:
- Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement)
- a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls.
- If we want unbiased thread ID generation in a range, rejection sampling is costly (not a requirement).
Instead of using Fisher-Yates
- generates the victim set eagerly, inefficient if the first steal attempts are successful
- needs a RNG call when sampling a victim
- memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32
or a sparseset
- 1 RNG call when sampling a victim
- memory usage: 2\*numThreads per thread so 2\*numthreads² uint8 (255 threads max) or uint32
we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m)
If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m)
with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m
References:
- Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
- Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484
https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim
https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim
- Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
And if we want cryptographic strength:
- Ciphers with Arbitrary Finite Domains
John Black and Phillip Rogaway, 2001
https://eprint.iacr.org/2001/012
- An Enciphering Scheme Based on a Card Shuffle
Viet Tung Hoang, Ben Morris, Phillip Rogaway
https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf

View File

@ -0,0 +1,48 @@
import ../threadpool
block: # Async without result
  # Sanity check: spawning void tasks and shutting the pool down.
  proc displayInt(x: int) =
    stdout.write(x)
    stdout.write(" - SUCCESS\n")

  proc main() =
    echo "\n=============================================================================================="
    echo "Running 'threadpool/examples/e01_simple_tasks.nim'"
    echo "=============================================================================================="
    echo "\nSanity check 1: Printing 123456 654321 in parallel"

    var tp = Threadpool.new(numThreads = 4)
    tp.spawn displayInt(123456)
    tp.spawn displayInt(654321)
    # NOTE(review): shutdown presumably drains pending tasks before
    # tearing the pool down — see the threadpool module.
    tp.shutdown()

  main()
block: # Async/Await
  # Sanity check: nested spawn/sync (fork-join) computing Fibonacci.
  var tp: Threadpool  # module-scope so asyncFib can respawn on the same pool

  proc asyncFib(n: int): int =
    if n < 2:
      return n

    # Fork one branch on the threadpool, compute the other inline.
    let x = tp.spawn asyncFib(n-1)
    let y = asyncFib(n-2)

    result = sync(x) + y

  proc main2() =
    echo "\n=============================================================================================="
    echo "Running 'threadpool/examples/e01_simple_tasks.nim'"
    echo "=============================================================================================="
    echo "\nSanity check 2: fib(20)"

    tp = Threadpool.new()
    let f = asyncFib(20)
    tp.shutdown()

    doAssert f == 6765, "f was " & $f

  main2()

View File

@ -0,0 +1,38 @@
# Demo of the API using a very inefficient π approximation algorithm.
import
std/[strutils, math, cpuinfo],
../threadpool
# From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim
# Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
proc term(k: int): float =
  ## k-th term of the Leibniz series for π: 4·(-1)ᵏ / (2k+1).
  let sign = if k mod 2 == 1: -1.0 else: 1.0
  sign * 4.0 / float(2*k + 1)
proc piApprox(tp: Threadpool, n: int): float =
  ## Approximate π by summing `n` terms of the Leibniz series,
  ## evaluating each term as a separate task on `tp`.
  var pendingFuts = newSeq[Flowvar[float]](n)
  for k in 0 ..< pendingFuts.len:
    pendingFuts[k] = tp.spawn term(k) # Schedule a task on the threadpool and return a handle to retrieve the result.
  for k in 0 ..< pendingFuts.len:
    result += sync pendingFuts[k]     # Block until the result is available.
proc main() =
  ## Entry point: approximate π with 1M Leibniz terms on all hardware threads.
  echo "\n=============================================================================================="
  echo "Running 'threadpool/examples/e02_parallel_pi.nim'"
  echo "=============================================================================================="

  var n = 1_000_000
  var nthreads = countProcessors()

  var tp = Threadpool.new(num_threads = nthreads) # Default to the number of hardware threads.

  echo formatFloat(tp.piApprox(n))

  tp.shutdown()

# Compile with nim c -r -d:release --threads:on --outdir:build example.nim
main()

View File

@ -0,0 +1,142 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import system/ansi_c
# Loggers
# --------------------------------------------------------
template log*(args: varargs[untyped]): untyped =
  ## printf-style logging, flushed immediately so that output from
  ## multiple threads stays visible and ordered as emitted.
  c_printf(args)
  flushFile(stdout)
template debugTermination*(body: untyped): untyped =
  ## Run `body` only when compiled with -d:TP_DebugTermination or -d:TP_Debug;
  ## compiles to nothing otherwise.
  when defined(TP_DebugTermination) or defined(TP_Debug):
    {.noSideEffect, gcsafe.}: body
template debug*(body: untyped): untyped =
  ## Run `body` only when compiled with -d:TP_Debug;
  ## compiles to nothing otherwise.
  when defined(TP_Debug):
    {.noSideEffect, gcsafe.}: body
# --------------------------------------------------------
import std/macros
# A simple design-by-contract API
# --------------------------------------------------------
# Everything should be a template that doesn't produce any code
# when debugConstantine is not defined.
# Those checks are controlled by a custom flag instead of
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
# Furthermore, we want them to be very lightweight on performance
func toHex*(a: SomeInteger): string =
  ## Hex representation of `a` with a "0x" prefix,
  ## zero-padded to 2*sizeof(a) nibbles (e.g. 0x1234'u16 -> "0x1234").
  const hexChars = "0123456789abcdef"
  const L = 2*sizeof(a)
  result = newString(2 + L)
  result[0] = '0'
  result[1] = 'x'
  var a = a
  # Fill from the least-significant nibble; stop at index 2 so the
  # "0x" prefix written above is not overwritten.
  for j in countdown(result.len-1, 2):
    result[j] = hexChars[a and 0xF]
    a = a shr 4
proc inspectInfix(node: NimNode): NimNode =
  ## Inspect an expression,
  ## Returns the AST as string with runtime values inlined
  ## from infix operators inlined.
  # TODO: pointer and custom type need a default repr
  # otherwise we can only resolve simple expressions
  proc inspect(node: NimNode): NimNode =
    # Recursively rebuild the expression as a string-concatenation AST.
    case node.kind:
    of nnkInfix:
      # Render "lhs op rhs" with both operands stringified at runtime.
      return newCall(
        bindSym"&",
        newCall(
          bindSym"&",
          newCall(ident"$", inspect(node[1])),
          newLit(" " & $node[0] & " ")
        ),
        newCall(ident"$", inspect(node[2]))
      )
    of {nnkIdent, nnkSym}:
      return node
    of nnkDotExpr:
      # Pointers and procs have no `$`: print the low 32 bits of their address.
      return quote do:
        when `node` is pointer or
             `node` is ptr or
             `node` is (proc):
          toHex(cast[ByteAddress](`node`) and 0xffff_ffff)
        else:
          $(`node`)
    of nnkPar:
      # Tuple constructor: inspect each element.
      result = nnkPar.newTree()
      for sub in node:
        result.add inspect(sub)
    else:
      # Anything else is rendered as its source text.
      return node.toStrLit()
  return inspect(node)
macro assertContract(
        checkName: static string,
        predicate: untyped) =
  ## Build an assertion for `predicate` that, on failure, reports the
  ## source location, the predicate source text, the runtime values of
  ## its operands (via `inspectInfix`) and the worker/threadpool identity.
  ## Active when assertions are on, or with -d:TP_Asserts.
  let lineinfo = lineInfoObj(predicate)

  var strippedPredicate: NimNode
  if predicate.kind == nnkStmtList:
    assert predicate.len == 1, "Only one-liner conditions are supported"
    strippedPredicate = predicate[0]
  else:
    strippedPredicate = predicate

  let debug = "\n Contract violated for " & checkName & " at " & $lineinfo &
              "\n " & $strippedPredicate.toStrLit &
              "\n The following values are contrary to expectations:" &
              "\n "
  let values = inspectInfix(strippedPredicate)
  # Worker/threadpool identity, resolved lazily at the call site:
  # only meaningful when a `workerContext` is in scope there.
  let workerID = quote do:
    when declared(workerContext):
      $workerContext.id
    else:
      "N/A"
  let threadpoolID = quote do:
    when declared(workerContext):
      cast[ByteAddress](workerContext.threadpool).toHex()
    else:
      "N/A"

  result = quote do:
    {.noSideEffect.}:
      when compileOption("assertions"):
        assert(`predicate`, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n")
      elif defined(TP_Asserts):
        if unlikely(not(`predicate`)):
          raise newException(AssertionError, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n")
template preCondition*(require: untyped) =
  ## Optional runtime check at the start of a function
  assertContract("pre-condition", require)
template postCondition*(ensure: untyped) =
  ## Optional runtime check before returning from a function
  assertContract("post-condition", ensure)
template ascertain*(check: untyped) =
  ## Optional runtime check in the middle of processing
  assertContract("transient condition", check)
# Sanity checks
# ----------------------------------------------------------------------------------
when isMainModule:
  proc assertGreater(x, y: int) =
    # Deliberately violated post-condition (10 > 12 is false)
    # to exercise the formatted error message.
    postcondition(x > y)

  # We should get a nicely formatted exception
  assertGreater(10, 12)

View File

@ -0,0 +1,160 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/macros,
./instrumentation,
./crossthread/tasks_flowvars
# Task parallelism - spawn
# ---------------------------------------------
proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
  ## Generate the code that spawns a call to a proc with no return value:
  ## a type-erased wrapper `async_<fn>` plus task creation and scheduling.
  ## `args`/`argsTy` are the call arguments and their types as ad-hoc tuples.
  # Create the async function
  let fn = funcCall[0]
  let fnName = $fn
  let withArgs = args.len > 0
  let async_fn = ident("async_" & fnName)
  var fnCall = newCall(fn)
  let data = ident("data")   # typed pointer to data

  # Schedule
  let task = ident"task"
  let scheduleBlock = newCall(schedule, workerContext, task)

  result = newStmtList()

  if funcCall.len == 2:
    # With only 1 arg, the tuple syntax doesn't construct a tuple
    # let data = (123) # is an int
    fnCall.add nnkDerefExpr.newTree(data)
  else: # This handles the 0 arg case as well
    # Arguments are stored as a tuple in `data`, indexed from 0.
    for i in 1 ..< funcCall.len:
      fnCall.add nnkBracketExpr.newTree(
        data,
        newLit i-1
      )

  # Create the async call: a nimcall wrapper that unpacks `param` and
  # forwards to the original proc.
  result.add quote do:
    proc `async_fn`(param: pointer) {.nimcall.} =
      # preCondition: not isRootTask(`workerContext`.currentTask)
      when bool(`withArgs`):
        let `data` = cast[ptr `argsTy`](param)
      `fnCall`

  # Create the task (single heap allocation, args copied inline)
  result.add quote do:
    block enq_deq_task:
      when bool(`withArgs`):
        let `task` = Task.new(
          parent = `workerContext`.currentTask,
          fn = `async_fn`,
          params = `args`)
      else:
        let `task` = Task.new(
          parent = `workerContext`.currentTask,
          fn = `async_fn`)
      `scheduleBlock`
proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
  ## Generate the code that spawns a call to a proc with a return value:
  ## a type-erased wrapper `async_<fn>`, task creation, scheduling, and a
  ## Flowvar[retTy] as the resulting expression.
  # Create the async function
  let fn = funcCall[0]
  let fnName = $fn
  let async_fn = ident("async_" & fnName)
  var fnCall = newCall(fn)
  let data = ident("data")   # typed pointer to data

  # Schedule
  let task = ident"task"
  let scheduleBlock = newCall(schedule, workerContext, task)

  result = newStmtList()

  # tasks have no return value.
  # 1. The start of the task `data` buffer will store the return value for the flowvar and awaiter/sync
  # 2. We create a wrapper async_fn without return value that sends the return value through `readyWith`
  # 3. We package that wrapper function in a task

  # We store the following in task.data:
  #
  # | ptr Task | result | arg₀ | arg₁ | ... | argₙ
  let fut = ident"fut"
  let taskSelfReference = ident"taskSelfReference"
  let retVal = ident"retVal"

  # Build the tuple stored in `data` and its type, in the layout above.
  var futArgs = nnkPar.newTree
  var futArgsTy = nnkPar.newTree
  futArgs.add taskSelfReference
  futArgsTy.add nnkPtrTy.newTree(bindSym"Task")
  futArgs.add retVal
  futArgsTy.add retTy

  for i in 1 ..< funcCall.len:
    futArgsTy.add getTypeInst(funcCall[i])
    futArgs.add funcCall[i]

  # data stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
  # so arguments starts at data[2] in the wrapping funcCall functions
  for i in 1 ..< funcCall.len:
    fnCall.add nnkBracketExpr.newTree(
      data,
      newLit i+1
    )

  # Wrapper: run the call, then publish the result via the task
  # self-reference stored at data[0].
  result.add quote do:
    proc `async_fn`(param: pointer) {.nimcall.} =
      # preCondition: not isRootTask(`workerContext`.currentTask)
      let `data` = cast[ptr `futArgsTy`](param)
      let res = `fnCall`
      readyWith(`data`[0], res)

  # Regenerate fresh ident, retTy has been tagged as a function call param
  let retTy = ident($retTy)

  # Create the task. The self-reference placeholder (0xDEADBEEF) is
  # overwritten with the real task pointer by `newFlowvar`.
  result.add quote do:
    block enq_deq_task:
      let `taskSelfReference` = cast[ptr Task](0xDEADBEEF)
      let `retVal` = default(`retTy`)

      let `task` = Task.new(
        parent = `workerContext`.currentTask,
        fn = `async_fn`,
        params = `futArgs`)
      let `fut` = newFlowvar(`retTy`, `task`)
      `scheduleBlock`
      # Return the future
      `fut`
proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule: NimNode): NimNode =
  ## Shared implementation of the `spawn` macro: packages `funcCall`
  ## into a heap-allocated task and emits the scheduling code.
  ## Evaluates to a Flowvar when the spawned proc has a return type,
  ## to nothing otherwise.
  ## Note: `tp` is not referenced in this body; dispatch happens via
  ## the `workerContext`/`schedule` symbols passed by the caller.
  funcCall.expectKind(nnkCall)

  # Get the return type if any
  let retType = funcCall[0].getImpl[3][0]
  let needFuture = retType.kind != nnkEmpty

  # Get a serialized type and data for all function arguments
  # We use adhoc tuple
  var argsTy = nnkPar.newTree()
  var args = nnkPar.newTree()
  for i in 1 ..< funcCall.len:
    argsTy.add getTypeInst(funcCall[i])
    args.add funcCall[i]

  # Package in a task
  if not needFuture:
    result = spawnVoid(funcCall, args, argsTy, workerContext, schedule)
  else:
    result = spawnRet(funcCall, retType, args, argsTy, workerContext, schedule)

  # Wrap in a block for namespacing
  result = nnkBlockStmt.newTree(newEmptyNode(), result)
  # echo result.toStrLit

View File

@ -0,0 +1,53 @@
# Synchronization Barriers
OSX does not implement pthread_barrier as it is an optional part
of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch.
So we need to roll our own with a POSIX compatible API.
## Glibc barriers, design bug and implementation
> Note: due to GPL licensing, do not lift the code.
> Not that we could anyway, as it is heavily dependent on futexes
> which are not available on OSX
We need to make sure that we don't hit the same bug
as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065
which seems to be an issue in some of the barrier implementations
in the wild.
The design of Glibc barriers is here:
https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD
And implementation:
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD`
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD
## Synchronization barrier techniques
This article goes over the techniques of
"pool barrier" and "ticket barrier"
https://locklessinc.com/articles/barriers/
to reach 2x to 20x the speed of pthreads barrier
This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf
goes over
- centralized barrier with sense reversal
- combining tree barrier
- dissemination barrier
- tournament barrier
- scalable tree barrier
More courses:
- http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf
It however requires lightweight mutexes like Linux futexes
that OSX lacks.
This post goes over lightweight mutexes like Benaphores (from BeOS)
https://preshing.com/20120226/roll-your-own-lightweight-mutex/
This gives a few barrier implementations
http://gallium.inria.fr/~maranget/MPRI/02.pdf
and refers to Cubible paper for formally verifying synchronization barriers
http://cubicle.lri.fr/papers/jfla2014.pdf (in French)

View File

@ -0,0 +1,71 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises: [], checks: off.} # No exceptions or overflow/conversion checks
when defined(windows):
  import ./barriers_windows
  when compileOption("assertions"):
    import os

  type SyncBarrier* = SynchronizationBarrier

  proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} =
    ## Initialize a synchronization barrier that will block ``threadCount`` threads
    ## before release.
    let err {.used.} = InitializeSynchronizationBarrier(syncBarrier, cast[int32](threadCount), -1)
    when compileOption("assertions"):
      # InitializeSynchronizationBarrier returns non-zero on success, 0 on failure.
      # NOTE(review): `assert err == 0` on the `err != 1` path looks inverted —
      # it would also fire for success values other than exactly 1. Confirm intent.
      if err != 1:
        assert err == 0
        raiseOSError(osLastError())

  proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
    ## Blocks thread at a synchronization barrier.
    ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
    ## and false for the others.
    result = bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE)

  proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
    ## Deletes a synchronization barrier.
    ## This assumes no race between waiting at a barrier and deleting it,
    ## and reuse of the barrier requires initialization.
    DeleteSynchronizationBarrier(syncBarrier.addr)
else:
  import ./barriers_posix
  when compileOption("assertions"):
    import os

  type SyncBarrier* = PthreadBarrier

  proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} =
    ## Initialize a synchronization barrier that will block ``threadCount`` threads
    ## before release.
    let err {.used.} = pthread_barrier_init(syncBarrier, nil, threadCount)
    when compileOption("assertions"):
      if err != 0:
        raiseOSError(OSErrorCode(err))

  proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
    ## Blocks thread at a synchronization barrier.
    ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
    ## and false for the others.
    let err {.used.} = pthread_barrier_wait(syncBarrier)
    when compileOption("assertions"):
      # NOTE(review): pthread error codes are positive, so `err < 0` may
      # never trigger on some platforms (glibc defines
      # PTHREAD_BARRIER_SERIAL_THREAD as -1, the macOS shim uses 1) — verify.
      if err != PTHREAD_BARRIER_SERIAL_THREAD and err < 0:
        raiseOSError(OSErrorCode(err))
    result = if err == PTHREAD_BARRIER_SERIAL_THREAD: true
             else: false

  proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
    ## Deletes a synchronization barrier.
    ## This assumes no race between waiting at a barrier and deleting it,
    ## and reuse of the barrier requires initialization.
    let err {.used.} = pthread_barrier_destroy(syncBarrier)
    when compileOption("assertions"):
      if err < 0:
        raiseOSError(OSErrorCode(err))

View File

@ -0,0 +1,84 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# OSX doesn't implement pthread_barrier_t
# It's an optional part of the POSIX standard
#
# This is a manual implementation of a sense reversing barrier
import std/locks
type
  Errno* = cint

  PthreadBarrierAttr* = object
    ## Dummy (attributes are ignored by this shim)
  PthreadBarrier* = object
    ## Implementation of a sense reversing barrier
    ## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit)
    lock: Lock                  # Alternatively spinlock on Atomic
    cond {.guard: lock.}: Cond
    sense {.guard: lock.}: bool # Phase flag, flipped each time the barrier opens. Choose int32 to avoid zero-expansion cost in registers?
    left {.guard: lock.}: cuint # Number of threads missing at the barrier before opening
    count: cuint                # Total number of threads that need to arrive before opening the barrier

const
  PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1)
    ## Returned to exactly one waiter when the barrier opens.
proc pthread_cond_broadcast(cond: var Cond): Errno {.header:"<pthread.h>".}
  ## Nim's std/locks `signal` only wakes one thread;
  ## we need to unblock all waiters.

proc broadcast(cond: var Cond) {.inline.}=
  ## Wake every thread waiting on `cond` (the error code is ignored).
  discard pthread_cond_broadcast(cond)
func pthread_barrier_init*(
       barrier: var PthreadBarrier,
       attr: ptr PthreadBarrierAttr,
       count: cuint
     ): Errno =
  ## Initialize `barrier` to open after `count` arrivals. `attr` is ignored.
  ## Always returns 0 (the implicit `result` is zero-initialized).
  barrier.lock.initLock()
  {.locks: [barrier.lock].}:
    barrier.cond.initCond()
    barrier.left = count
    barrier.count = count
    # barrier.sense = false  (already zero-initialized)
proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno =
  ## Wait on `barrier`
  ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
  ## (the last arriver)
  ## Returns 0 for the others
  ## Returns Errno if there is an error
  barrier.lock.acquire()
  {.locks: [barrier.lock].}:
    var local_sense = barrier.sense # Thread local sense, snapshot of the current phase
    dec barrier.left

    if barrier.left == 0:
      # Last thread to arrive at the barrier
      # Reverse phase and release it
      barrier.left = barrier.count
      barrier.sense = not barrier.sense
      barrier.cond.broadcast()
      barrier.lock.release()
      return PTHREAD_BARRIER_SERIAL_THREAD

    while barrier.sense == local_sense:
      # We are waiting for threads
      # Wait for the sense to reverse
      # while loop because we might have spurious wakeups
      barrier.cond.wait(barrier.lock)

    # Reversed, we can leave the barrier
    barrier.lock.release()
    return Errno(0)
proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno =
  ## Destroy the condition variable and lock owned by `barrier`.
  ## Must not be called while threads are still waiting on it.
  ## Returns 0 via the implicit zero-initialized `result`.
  {.locks: [barrier.lock].}:
    barrier.cond.deinitCond()
  barrier.lock.deinitLock()

View File

@ -0,0 +1,56 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Abstractions over POSIX barriers (non-)implementations
#
# macOS does not ship pthread barriers (they are an optional part of POSIX),
# so we dispatch to a hand-rolled sense-reversing barrier there and to the
# native pthread implementation everywhere else.

when not compileOption("threads"):
  {.error: "This requires --threads:on compilation flag".}

# Types
# -------------------------------------------------------

when defined(osx):
  import ./barriers_macos
  export PthreadBarrierAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD
else:
  type
    PthreadBarrierAttr* {.importc: "pthread_barrierattr_t", header: "<sys/types.h>", byref.} = object
      # Size the ABI padding only where we know the glibc layout.
      when (defined(linux) and not defined(android)) and defined(amd64):
        abi: array[4 div sizeof(cint), cint] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l45
    PthreadBarrier* {.importc: "pthread_barrier_t", header: "<sys/types.h>", byref.} = object
      when (defined(linux) and not defined(android)) and defined(amd64):
        abi: array[32 div sizeof(clong), clong] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l28

    Errno* = cint

  var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno

# Pthread
# -------------------------------------------------------

when defined(osx):
  export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy
else:
  proc pthread_barrier_init*(
         barrier: PthreadBarrier,
         attr: ptr PthreadBarrierAttr,
         count: cuint
       ): Errno {.header: "<pthread.h>".}
    ## Initialize `barrier` with the attributes `attr`.
    ## The barrier is opened when `count` waiters arrived.
    # NOTE(review): the parameter is not `var` — this relies on the `byref`
    # pragma on PthreadBarrier to pass a pointer to C; confirm on all targets.

  proc pthread_barrier_destroy*(
         barrier: sink PthreadBarrier): Errno {.header: "<pthread.h>".}
    ## Destroy a previously dynamically initialized `barrier`.

  proc pthread_barrier_wait*(
         barrier: var PthreadBarrier
       ): Errno {.header: "<pthread.h>".}
    ## Wait on `barrier`
    ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
    ## Returns 0 for the other
    ## Returns Errno if there is an error

View File

@ -0,0 +1,31 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import winlean

# Wrapper around the Windows 8+ synchronization barrier API.
# Technically in <synchapi.h> but MSVC complains with
#   @m..@s..@sweave@sscheduler.nim.cpp
#   C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture"
# so we include <windows.h> instead.

type
  SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"<windows.h>".} = object

var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "<windows.h>".}: DWORD
  ## Skip expensive checks on barrier enter if a barrier is never deleted.

proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "<windows.h>".}
  ## Block until all threads registered at init have entered the barrier.
proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "<windows.h>".}
  ## Release resources owned by the barrier.
proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "<windows.h>".}
  ## Initialize a barrier for `lTotalThreads` threads.
  ## `lSpinCount = -1` requests the default spin count.
  ## Returns a nonzero WINBOOL on success.

when isMainModule:
  # Smoke test: initialization must succeed (nonzero return).
  import os

  var x{.noinit.}: SynchronizationBarrier
  let err = InitializeSynchronizationBarrier(x, 2, -1)
  if err != 1:
    assert err == 0 # WINBOOL failure is exactly 0; anything else is unexpected
    raiseOSError(osLastError())

View File

@ -0,0 +1,18 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Platform dispatch: re-export the futex implementation
# matching the target OS so callers only `import ./futexes`.
when defined(linux):
  import ./futexes_linux
  export futexes_linux
elif defined(windows):
  import ./futexes_windows
  export futexes_windows
elif defined(osx):
  import ./futexes_macos
  export futexes_macos
else:
  {.error: "Futexes are not implemented for your OS".}

View File

@ -0,0 +1,68 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# A wrapper for linux futex.
# Condition variables do not always wake on signal which can deadlock the runtime
# so we need to roll up our sleeves and use the low-level futex API.
import std/atomics
export MemoryOrder

type
  Futex* = object
    ## A 32-bit value the kernel can block/wake threads on.
    value: Atomic[uint32]

  FutexOp = distinct cint
    ## Futex operation code (FUTEX_* constants), kept distinct
    ## so it cannot be mixed up with plain cints.

var NR_Futex {.importc: "__NR_futex", header: "<sys/syscall.h>".}: cint
var FutexWaitPrivate {.importc:"FUTEX_WAIT_PRIVATE", header: "<linux/futex.h>".}: FutexOp
var FutexWakePrivate {.importc:"FUTEX_WAKE_PRIVATE", header: "<linux/futex.h>".}: FutexOp

proc syscall(sysno: clong): cint {.header:"<unistd.h>", varargs.}
  ## Raw variadic syscall(2) entry point.

proc sysFutex(
       futex: var Futex, op: FutexOp, val1: cuint or cint,
       timeout: pointer = nil, val2: pointer = nil, val3: cint = 0): cint {.inline.} =
  ## Thin wrapper over the 6-argument futex syscall.
  ## Only `op` and `val1` are used by this module; the rest default to nil/0.
  syscall(NR_Futex, futex.value.addr, op, val1, timeout, val2, val3)
proc initialize*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0.
  futex.value.store(0, moRelaxed)

proc teardown*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0. No kernel resources to release.
  futex.value.store(0, moRelaxed)

proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
  ## Atomically read the futex value with the given memory ordering.
  futex.value.load(order)

proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
  ## Borrow the underlying atomic for read-modify-write operations.
  futex.value

proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
  ## Atomically write the futex value with the given memory ordering.
  futex.value.store(value, order)
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
  ## Suspend the calling thread if the value of the futex is the same as refVal.
  ## May also return spuriously; callers must recheck their condition.
  # Returns 0 in case of a successful suspend
  # If values are different, it returns EWOULDBLOCK
  # We discard as this is not needed and simplifies compat with Windows futex
  discard sysFutex(futex, FutexWaitPrivate, refVal)

proc wake*(futex: var Futex) {.inline.} =
  ## Wake one thread (from the same process) blocked on this futex.
  # Returns the number of actually woken threads
  # or a Posix error code (if negative)
  # We discard as this is not needed and simplifies compat with Windows futex
  discard sysFutex(futex, FutexWakePrivate, 1)

proc wakeAll*(futex: var Futex) {.inline.} =
  ## Wake all threads (from the same process) blocked on this futex.
  # Returns the number of actually woken threads
  # or a Posix error code (if negative)
  # We discard as this is not needed and simplifies compat with Windows futex
  discard sysFutex(futex, FutexWakePrivate, high(int32))

View File

@ -0,0 +1,109 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/atomics
# A wrapper for Darwin futex.
# They are used in libc++ so likely to be very stable.
# A new API appeared in OSX Big Sur (Jan 2021) ulock_wait2 and macOS pthread_cond_t has been migrated to it
# - https://github.com/apple/darwin-xnu/commit/d4061fb0260b3ed486147341b72468f836ed6c8f#diff-08f993cc40af475663274687b7c326cc6c3031e0db3ac8de7b24624610616be6
#
# The old API is ulock_wait
# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/kern/sys_ulock.c.auto.html
# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/sys/ulock.h.auto.html
{.push hint[XDeclaredButNotUsed]: off.}

# __ulock operation codes (bits [7, 0]), from xnu bsd/sys/ulock.h
const UL_COMPARE_AND_WAIT         = 1 # 32-bit compare-and-wait, same process (futex-like)
const UL_UNFAIR_LOCK              = 2
const UL_COMPARE_AND_WAIT_SHARED  = 3
const UL_UNFAIR_LOCK64_SHARED     = 4
const UL_COMPARE_AND_WAIT64       = 5
const UL_COMPARE_AND_WAIT64_SHARED = 6
# obsolete names
const UL_OSSPINLOCK = UL_COMPARE_AND_WAIT
const UL_HANDOFFLOCK = UL_UNFAIR_LOCK
# These operation codes are only implemented in (DEVELOPMENT || DEBUG) kernels
const UL_DEBUG_SIMULATE_COPYIN_FAULT = 253
const UL_DEBUG_HASH_DUMP_ALL = 254
const UL_DEBUG_HASH_DUMP_PID = 255

# operation bits [15, 8] contain the flags for __ulock_wake
#
const ULF_WAKE_ALL             = 0x00000100 # wake every waiter, not just one
const ULF_WAKE_THREAD          = 0x00000200 # wake the specific thread given in wake_value
const ULF_WAKE_ALLOW_NON_OWNER = 0x00000400

# operation bits [23, 16] contain the flags for __ulock_wait
#
# @const ULF_WAIT_WORKQ_DATA_CONTENTION
# The waiter is contending on this lock for synchronization around global data.
# This causes the workqueue subsystem to not create new threads to offset for
# waiters on this lock.
#
# @const ULF_WAIT_CANCEL_POINT
# This wait is a cancelation point
#
# @const ULF_WAIT_ADAPTIVE_SPIN
# Use adaptive spinning when the thread that currently holds the unfair lock
# is on core.
const ULF_WAIT_WORKQ_DATA_CONTENTION = 0x00010000
const ULF_WAIT_CANCEL_POINT          = 0x00020000
const ULF_WAIT_ADAPTIVE_SPIN         = 0x00040000

# operation bits [31, 24] contain the generic flags
const ULF_NO_ERRNO = 0x01000000 # return errors as negative values instead of via errno

# masks
const UL_OPCODE_MASK   = 0x000000FF
const UL_FLAGS_MASK    = 0xFFFFFF00
const ULF_GENERIC_MASK = 0xFFFF0000

const ULF_WAIT_MASK = ULF_NO_ERRNO or
                      ULF_WAIT_WORKQ_DATA_CONTENTION or
                      ULF_WAIT_CANCEL_POINT or
                      ULF_WAIT_ADAPTIVE_SPIN

const ULF_WAKE_MASK = ULF_NO_ERRNO or
                      ULF_WAKE_ALL or
                      ULF_WAKE_THREAD or
                      ULF_WAKE_ALLOW_NON_OWNER

# Undocumented private syscalls, stable in practice (used by libc++/libdispatch).
proc ulock_wait(operation: uint32, address: pointer, value: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.}
proc ulock_wait2(operation: uint32, address: pointer, value: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.}
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.}
type
  Futex* = object
    ## A 32-bit value the kernel can block/wake threads on.
    value: Atomic[uint32]

proc initialize*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0.
  futex.value.store(0, moRelaxed)

proc teardown*(futex: var Futex) {.inline.} =
  ## Reset the futex value to 0. No kernel resources to release.
  futex.value.store(0, moRelaxed)

proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
  ## Atomically read the futex value with the given memory ordering.
  futex.value.load(order)

proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
  ## Borrow the underlying atomic for read-modify-write operations.
  futex.value

proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
  ## Atomically write the futex value with the given memory ordering.
  futex.value.store(value, order)
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
  ## Suspend the calling thread if the value of the futex is still `refVal`.
  ## May return spuriously; callers must recheck their condition in a loop.
  # Fix: use UL_COMPARE_AND_WAIT, the 32-bit same-process compare-and-wait
  # operation (the analogue of Linux FUTEX_WAIT_PRIVATE).
  # The previous UL_UNFAIR_LOCK64_SHARED opcode implements an unfair lock
  # keyed on the owner's thread ID — with an arbitrary futex value the call
  # errors out, and since the error was discarded the wait silently became
  # a busy-spin.
  discard ulock_wait(UL_COMPARE_AND_WAIT or ULF_NO_ERRNO, futex.value.addr, uint64 refVal, 0)

proc wake*(futex: var Futex) {.inline.} =
  ## Wake one thread (from the same process) blocked on this futex.
  # The wake opcode must match the wait opcode so the kernel finds the
  # waiters parked by `wait` above. ULF_WAKE_THREAD is not usable here:
  # it targets a specific thread whose ID must be passed in wake_value,
  # so calling it with 0 fails (and the error was discarded).
  # Without ULF_WAKE_ALL, exactly one waiter is woken.
  discard ulock_wake(UL_COMPARE_AND_WAIT or ULF_NO_ERRNO, futex.value.addr, 0)

proc wakeAll*(futex: var Futex) {.inline.} =
  ## Wake all threads (from the same process) blocked on this futex.
  # The opcode bits [7, 0] must carry UL_COMPARE_AND_WAIT; ULF_WAKE_ALL
  # alone (as before) encodes an invalid operation of 0.
  discard ulock_wake(UL_COMPARE_AND_WAIT or ULF_NO_ERRNO or ULF_WAKE_ALL, futex.value.addr, 0)

View File

@ -0,0 +1,59 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# An implementation of futex using Windows primitives
import std/atomics, winlean
export MemoryOrder
type
Futex* = object
value: Atomic[uint32]
# Contrary to the documentation, the futex related primitives are NOT in kernel32.dll
# but in API-MS-Win-Core-Synch-l1-2-0.dll ¯\_(ツ)_/¯
proc initialize*(futex: var Futex) {.inline.} =
futex.value.store(0, moRelaxed)
proc teardown*(futex: var Futex) {.inline.} =
futex.value.store(0, moRelaxed)
proc WaitOnAddress(
Address: pointer, CompareAddress: pointer,
AddressSize: csize_t, dwMilliseconds: DWORD
): WINBOOL {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
# The Address should be volatile
proc WakeByAddressSingle(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
proc WakeByAddressAll(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
futex.value.load(order)
proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
futex.value
proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
futex.value.store(value, order)
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
## Suspend a thread if the value of the futex is the same as refVal.
# Returns TRUE if the wait succeeds or FALSE if not.
# getLastError() will contain the error information, for example
# if it failed due to a timeout.
# We discard as this is not needed and simplifies compat with Linux futex
discard WaitOnAddress(futex.value.addr, refVal.unsafeAddr, csize_t sizeof(refVal), INFINITE)
proc wake*(futex: var Futex) {.inline.} =
## Wake one thread (from the same process)
WakeByAddressSingle(futex.value.addr)
proc wakeAll*(futex: var Futex) {.inline.} =
## Wake all threads (from the same process)
WakeByAddressAll(futex.value.addr)

View File

@ -0,0 +1,555 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
when not compileOption("threads"):
{.error: "This requires --threads:on compilation flag".}
{.push raises: [].}
import
std/[cpuinfo, atomics, macros],
./crossthread/[
taskqueues,
backoff,
tasks_flowvars],
./instrumentation,
./primitives/barriers,
./parallel_offloading,
../allocs, ../bithacks,
../../../helpers/prng_unsafe
export
# flowvars
Flowvar, isSpawned, isReady, sync
type
  WorkerID = uint32

  Signal = object
    # 64-byte (cache-line) aligned to avoid false sharing between workers.
    terminate {.align: 64.}: Atomic[bool]

  WorkerContext = object
    ## Thread-local worker context

    # Params
    id: WorkerID
    threadpool: Threadpool

    # Tasks
    taskqueue: ptr Taskqueue # owned task queue
    currentTask: ptr Task    # task currently being executed (RootTask sentinel on the root thread)

    # Synchronization
    localBackoff: EventNotifier # Multi-Producer Single-Consumer backoff, used when awaiting a future
    signal: ptr Signal          # owned signal, points into Threadpool.workerSignals[id]

    # Thefts
    rng: RngState # RNG state to select victims

    # Adaptative theft policy (see updateStealStrategy)
    stealHalf: bool
    recentTasks: uint32
    recentThefts: uint32
    recentTheftsAdaptative: uint32
    recentLeaps: uint32

  Threadpool* = ptr object
    barrier: SyncBarrier # Barrier for initialization and teardown
    # -- align: 64
    globalBackoff: EventCount # Multi-Producer Multi-Consumer backoff, parks idle workers
    # -- align: 64
    numThreads*{.align: 64.}: uint32
    workerQueues: ptr UncheckedArray[Taskqueue]                       # one task queue per worker
    workers: ptr UncheckedArray[Thread[(Threadpool, WorkerID)]]       # worker threads (index 0 is the root thread, not started here)
    workerSignals: ptr UncheckedArray[Signal]                         # one termination signal per worker

# Thread-local config
# ---------------------------------------------

var workerContext {.threadvar.}: WorkerContext
  ## Thread-local Worker context
proc setupWorker() =
  ## Initialize the thread-local context of a worker.
  ## Requires the `id` and `threadpool` fields to be initialized beforehand.
  template ctx: untyped = workerContext

  preCondition: not ctx.threadpool.isNil()
  preCondition: 0 <= ctx.id and ctx.id < ctx.threadpool.numThreads.uint32
  preCondition: not ctx.threadpool.workerQueues.isNil()
  preCondition: not ctx.threadpool.workerSignals.isNil()

  # Thefts
  ctx.rng.seed(0xEFFACED + ctx.id) # deterministic per-worker victim-selection seed

  # Synchronization
  ctx.localBackoff.initialize()
  ctx.signal = addr ctx.threadpool.workerSignals[ctx.id]
  ctx.signal.terminate.store(false, moRelaxed)

  # Tasks
  ctx.taskqueue = addr ctx.threadpool.workerQueues[ctx.id]
  ctx.currentTask = nil

  # Init
  ctx.taskqueue[].init(initialCapacity = 32)

  # Adaptative theft policy
  ctx.recentTasks = 0
  ctx.recentThefts = 0
  ctx.recentTheftsAdaptative = 0
  ctx.recentLeaps = 0

proc teardownWorker() =
  ## Cleanup the thread-local context of a worker.
  workerContext.localBackoff.`=destroy`()
  workerContext.taskqueue[].teardown()
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].}
  ## Forward declaration, defined below.

proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises: [Exception].} =
  ## Entry point of every worker thread: set up the thread-local context,
  ## then run the event loop until a termination signal is received.
  # We assume that thread_local variables start all at their binary zero value
  preCondition: workerContext == default(WorkerContext)

  template ctx: untyped = workerContext

  # If the following crashes, you need --tlsEmulation:off
  ctx.id = params.id
  ctx.threadpool = params.threadpool

  setupWorker()

  # 1 matching barrier in Threadpool.new() for root thread
  discard params.threadpool.barrier.wait()

  {.cast(gcsafe).}: # Compiler does not consider that GC-safe by default when multi-threaded due to thread-local variables
    ctx.eventLoop()

  debugTermination:
    log(">>> Worker %2d shutting down <<<\n", ctx.id)

  # 1 matching barrier in threadpool.shutdown() for root thread
  discard params.threadpool.barrier.wait()

  teardownWorker()
# Tasks
# ---------------------------------------------
# Sentinel values
const ReadyFuture = cast[ptr EventNotifier](0xCA11AB1E) # stored in task.waiter when the result is ready and no thread waits
const RootTask = cast[ptr Task](0xEFFACED0)             # sentinel task of the root thread; never executed

proc run*(ctx: var WorkerContext, task: ptr Task) {.raises:[Exception].} =
  ## Run a task, frees it if it is not owned by a Flowvar.
  ## Restores the previously-running task as current afterwards.
  let suspendedTask = workerContext.currentTask
  ctx.currentTask = task
  debug: log("Worker %2d: running task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek())
  task.fn(task.data.addr)
  debug: log("Worker %2d: completed task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek())
  ctx.recentTasks += 1
  ctx.currentTask = suspendedTask
  if not task.hasFuture:
    # No Flowvar owns this task: we are responsible for freeing it.
    freeHeap(task)
    return

  # Sync with an awaiting thread without work in completeFuture:
  # either publish ReadyFuture (nobody waiting yet) or wake the parked waiter.
  var expected = (ptr EventNotifier)(nil)
  if not compareExchange(task.waiter, expected, desired = ReadyFuture, moAcquireRelease):
    debug: log("Worker %2d: completed task 0x%.08x, notifying waiter 0x%.08x\n", ctx.id, task, expected)
    expected[].notify()
proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.} =
  ## Schedule a task in the threadpool.
  ## This wakes a sibling thread if our local queue is empty
  ## or forceWake is true.
  debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)

  # Instead of notifying every time a task is scheduled, we notify
  # only when the worker queue is empty. This is a good approximation
  # of starvation in work-stealing.
  # - Tzannes, A., G. C. Caragea, R. Barua, and U. Vishkin.
  #   Lazy binary-splitting: a run-time adaptive work-stealing scheduler.
  #   In PPoPP 10, Bangalore, India, January 2010. ACM, pp. 179190.
  #   https://user.eng.umd.edu/~barua/ppopp164.pdf
  let wasEmpty = ctx.taskqueue[].peek() == 0
  ctx.taskqueue[].push(tn)
  if forceWake or wasEmpty:
    ctx.threadpool.globalBackoff.wake()
# Scheduler
# ---------------------------------------------
iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
  ## Yield every integer in [0, maxExclusive) exactly once, in a
  ## (low-quality) pseudo-random order derived from `randomSeed`.
  # Design considerations and randomness constraints for work-stealing,
  # see docs/random_permutations.md
  #
  # A Linear Congruential Generator Xₙ₊₁ = aXₙ+c (mod m)
  # (https://en.wikipedia.org/wiki/Linear_congruential_generator)
  # enumerates all residues mod m without repetition iff (Hull-Dobell theorem):
  #   1. c and m are coprime
  #   2. a-1 is divisible by all prime factors of m
  #   3. a-1 is divisible by 4 if m is divisible by 4
  #
  # Randomness quality only matters for spreading contention: cycling
  # through victims i, i+1, …, i+n-1 (mod n) would already be acceptable,
  # but with a=1 the permutations are disappointingly regular.
  #
  # Choosing m as the next power of two ≥ maxExclusive means:
  #   - every odd c is coprime with m (no GCD computation, no coprime cache)
  #   - a can differ from 1, making the output look more "random"
  #   - n and (m-1) == n mod m, so the modulo is a single mask
  # Generated values ≥ maxExclusive are simply skipped.
  let
    m          = maxExclusive.nextPowerOfTwo_vartime()
    wrapMask   = m - 1                                    # cheap "mod m"
    increment  = (randomSeed and ((m shr 1) - 1)) * 2 + 1 # odd ⇒ coprime with m
    multiplier = (randomSeed and ((m shr 2) - 1)) * 4 + 1 # multiplier-1 divisible by 2 and 4
    first      = randomSeed and wrapMask                  # starting point of the cycle
  var cur = first
  while true:
    if cur < maxExclusive:
      yield cur
    cur = (multiplier * cur + increment) and wrapMask # full-period LCG step
    if cur == first:
      break
proc tryStealOne(ctx: var WorkerContext): ptr Task =
  ## Try to steal a single task from a randomly-permuted sequence of victims.
  ## Returns nil if no victim had work.
  let seed = ctx.rng.next().uint32
  for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads):
    if targetId == ctx.id:
      continue

    let stolenTask = ctx.id.steal(ctx.threadpool.workerQueues[targetId])

    if not stolenTask.isNil():
      ctx.recentThefts += 1
      # Theft successful, there might be more work for idle threads, wake one
      ctx.threadpool.globalBackoff.wake()
      return stolenTask
  return nil

proc updateStealStrategy(ctx: var WorkerContext) =
  ## Estimate work-stealing efficiency during the last interval.
  ## If the value is below a threshold, switch between
  ## steal-one and steal-half strategies.
  ## (Currently unused: the call site in tryStealAdaptative is commented out.)
  const StealAdaptativeInterval = 25
  if ctx.recentTheftsAdaptative == StealAdaptativeInterval:
    let recentTheftsNonAdaptative = ctx.recentThefts - ctx.recentTheftsAdaptative
    let adaptativeTasks = ctx.recentTasks - ctx.recentLeaps - recentTheftsNonAdaptative

    let ratio = adaptativeTasks.float32 / StealAdaptativeInterval.float32
    if ctx.stealHalf and ratio < 2.0f:
      # Tasks stolen are coarse-grained, steal only one to reduce re-steal
      ctx.stealHalf = false
    elif not ctx.stealHalf and ratio == 1.0f:
      # All tasks processed were stolen tasks, we need to steal many at a time
      # NOTE(review): exact float equality — presumably intentional since the
      # ratio is a small-integer quotient, but worth confirming.
      ctx.stealHalf = true

    # Reset interval
    ctx.recentTasks = 0
    ctx.recentThefts = 0
    ctx.recentTheftsAdaptative = 0
    ctx.recentLeaps = 0
proc tryStealAdaptative(ctx: var WorkerContext): ptr Task =
  ## Try to steal one or many tasks, depending on load.
  ## Returns nil if no victim had work.
  # TODO: while running 'threadpool/examples/e02_parallel_pi.nim'
  #       stealHalf can error out in tasks_flowvars.nim with:
  #       "precondition not task.completed.load(moAcquire)"
  ctx.stealHalf = false # stealHalf deactivated until the above is fixed
  # ctx.updateStealStrategy()

  let seed = ctx.rng.next().uint32
  for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads):
    if targetId == ctx.id:
      continue

    let stolenTask =
      if ctx.stealHalf: ctx.id.stealHalf(ctx.taskqueue[], ctx.threadpool.workerQueues[targetId])
      else:             ctx.id.steal(ctx.threadpool.workerQueues[targetId])

    if not stolenTask.isNil():
      ctx.recentThefts += 1
      ctx.recentTheftsAdaptative += 1
      # Theft successful, there might be more work for idle threads, wake one
      ctx.threadpool.globalBackoff.wake()
      return stolenTask
  return nil
proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
  ## Leapfrogging:
  ##
  ## - Leapfrogging: a portable technique for implementing efficient futures,
  ##   David B. Wagner, Bradley G. Calder, 1993
  ##   https://dl.acm.org/doi/10.1145/173284.155354
  ##
  ## When awaiting a future, we can look in the thief's queue first. They steal when they run out of tasks.
  ## If they have tasks in their queue, it's the task we are awaiting that created them and it will likely be stuck
  ## on those tasks as well, so we need to help them help us.
  ## Returns a task stolen from the thief of `awaitedTask`, or nil.

  # Spin until the thief publishes its ID in the awaited task.
  var thiefID = SentinelThief
  while true:
    debug: log("Worker %2d: waiting for thief to publish their ID\n", ctx.id)
    thiefID = awaitedTask.thiefID.load(moAcquire)
    if thiefID != SentinelThief:
      break
    cpuRelax()
  ascertain: 0 <= thiefID and thiefID < ctx.threadpool.numThreads

  # Leapfrogging is used when completing a future, steal only one
  # and don't leave tasks stranded in our queue.
  let leapTask = ctx.id.steal(ctx.threadpool.workerQueues[thiefID])
  if not leapTask.isNil():
    ctx.recentLeaps += 1
    # Theft successful, there might be more work for idle threads, wake one
    ctx.threadpool.globalBackoff.wake()
    return leapTask
  return nil
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} =
  ## Each worker thread executes this loop over and over
  ## until its termination signal is set.
  while not ctx.signal.terminate.load(moRelaxed):
    # 1. Pick from local queue
    debug: log("Worker %2d: eventLoop 1 - searching task from local queue\n", ctx.id)
    while (var task = ctx.taskqueue[].pop(); not task.isNil):
      debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
      ctx.run(task)

    # 2. Run out of tasks, become a thief
    debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
    let ticket = ctx.threadpool.globalBackoff.sleepy() # register sleep intent BEFORE the steal attempt
    var stolenTask = ctx.tryStealAdaptative()
    if not stolenTask.isNil:
      # We managed to steal a task, cancel sleep
      ctx.threadpool.globalBackoff.cancelSleep()
      # 2.a Run task
      debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      ctx.run(stolenTask)
    else:
      # 2.b Park the thread until a new task enters the threadpool
      debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
      ctx.threadpool.globalBackoff.sleep(ticket)
      debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
# Sync
# ---------------------------------------------
template isRootTask(task: ptr Task): bool =
  ## True iff `task` is the root thread's sentinel task.
  task == RootTask

proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} =
  ## Eagerly complete an awaited FlowVar:
  ## run or steal related tasks until `fv` is ready,
  ## then write its value into `parentResult`.
  template ctx: untyped = workerContext

  template isFutReady(): untyped =
    let isReady = fv.task.completed.load(moAcquire)
    if isReady:
      parentResult = cast[ptr (ptr Task, T)](fv.task.data.addr)[1]
    isReady

  if isFutReady():
    return

  ## 1. Process all the children of the current tasks.
  ##    This ensures that we can give control back ASAP.
  debug: log("Worker %2d: sync 1 - searching task from local queue\n", ctx.id)
  while (let task = ctx.taskqueue[].pop(); not task.isNil):
    if task.parent != ctx.currentTask:
      debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
      ctx.schedule(task, forceWake = true) # reschedule task and wake a sibling to take it over.
      break
    debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
    ctx.run(task)
    if isFutReady():
      debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
      return

  ## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
  ##    So the task is bottlenecked by dependencies in other threads,
  ##    hence we abandon our enqueued work and steal in the others' queues
  ##    in hope it advances our awaited task. This prioritizes latency over throughput.
  ##
  ##    See also
  ##    - Proactive work-stealing for futures
  ##      Kyle Singer, Yifan Xu, I-Ting Angelina Lee, 2019
  ##      https://dl.acm.org/doi/10.1145/3293883.3295735
  debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
  while not isFutReady():
    if (let leapTask = ctx.tryLeapfrog(fv.task); not leapTask.isNil):
      # We stole a task generated by the task we are awaiting.
      debug: log("Worker %2d: sync 2.1 - leapfrog task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, leapTask, leapTask.parent, ctx.currentTask)
      ctx.run(leapTask)
    elif (let stolenTask = ctx.tryStealOne(); not stolenTask.isNil):
      # We stole a task, we hope we advance our awaited task.
      debug: log("Worker %2d: sync 2.2 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      ctx.run(stolenTask)
    elif (let ownTask = ctx.taskqueue[].pop(); not ownTask.isNil):
      # We advance our own queue, this increases global throughput but may impact latency on the awaited task.
      #
      # Note: for a scheduler to be optimal (i.e. within 2x than ideal) it should be greedy
      #       so all workers should be working. This is a difficult tradeoff.
      debug: log("Worker %2d: sync 2.3 - couldn't steal, running own task\n", ctx.id)
      ctx.run(ownTask)
    else:
      # Nothing to do, we park.
      # Note: On today's hyperthreaded systems, it might be more efficient to always park
      #       instead of working on unrelated tasks in our task queue, despite making the scheduler non-greedy.
      #       The actual hardware resources are 2x less than the actual number of cores
      ctx.localBackoff.prepareToPark()

      # Publish our backoff handle in the task; if the task completed in the
      # meantime (waiter already set to ReadyFuture) the CAS fails and we skip parking.
      var expected = (ptr EventNotifier)(nil)
      if compareExchange(fv.task.waiter, expected, desired = ctx.localBackoff.addr, moAcquireRelease):
        ctx.localBackoff.park()
proc syncAll*(tp: Threadpool) {.raises: [Exception].} =
  ## Blocks until all pending tasks are completed.
  ## This MUST only be called from
  ## the root scope that created the threadpool.
  template ctx: untyped = workerContext

  debugTermination:
    log(">>> Worker %2d enters barrier <<<\n", ctx.id)

  preCondition: ctx.id == 0
  preCondition: ctx.currentTask.isRootTask()

  # Empty all tasks
  var foreignThreadsParked = false
  while not foreignThreadsParked:
    # 1. Empty local tasks
    debug: log("Worker %2d: syncAll 1 - searching task from local queue\n", ctx.id)
    while (let task = ctx.taskqueue[].pop(); not task.isNil):
      debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
      ctx.run(task)

    if tp.numThreads == 1 or foreignThreadsParked:
      break

    # 2. Help other threads
    debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
    let stolenTask = ctx.tryStealAdaptative()
    if not stolenTask.isNil:
      # 2.1 We stole some task
      debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      ctx.run(stolenTask)
    else:
      # 2.2 No task to steal
      if tp.globalBackoff.getNumWaiters() == tp.numThreads - 1:
        # 2.2.1 all threads besides the current are parked:
        #       no work remains anywhere, we can leave.
        debugTermination:
          log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
        foreignThreadsParked = true
      else:
        # 2.2.2 We don't park as there is no notif for task completion
        cpuRelax()

  debugTermination:
    log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
# Runtime
# ---------------------------------------------
proc new*(T: type Threadpool, numThreads = countProcessors()): T {.raises: [Exception].} =
  ## Initialize a threadpool that manages `numThreads` threads.
  ## Default to the number of logical processors available.
  ##
  ## The calling thread becomes worker 0 (the root worker);
  ## `numThreads - 1` extra threads are spawned.
  ## Returns only after every worker has reached the startup barrier.
  type TpObj = typeof(default(Threadpool)[]) # due to C import, we need a dynamic sizeof
  var tp = allocHeapUncheckedAlignedPtr(Threadpool, sizeof(TpObj), alignment = 64)
  # Synchronization primitives must be ready before any worker starts.
  tp.barrier.init(numThreads.uint32)
  tp.globalBackoff.initialize()
  tp.numThreads = numThreads.uint32
  # Per-worker storage, 64-byte aligned (cache line) to avoid false sharing.
  tp.workerQueues = allocHeapArrayAligned(Taskqueue, numThreads, alignment = 64)
  tp.workers = allocHeapArrayAligned(Thread[(Threadpool, WorkerID)], numThreads, alignment = 64)
  tp.workerSignals = allocHeapArrayAligned(Signal, numThreads, alignment = 64)
  # Setup master thread
  workerContext.id = 0
  workerContext.threadpool = tp
  # Start worker threads (slot 0 belongs to the calling thread)
  for i in 1 ..< numThreads:
    createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
  # Root worker
  setupWorker()
  # Root task, this is a sentinel task that is never called.
  workerContext.currentTask = RootTask
  # Wait for the child threads
  discard tp.barrier.wait()
  return tp
proc cleanup(tp: var Threadpool) {.raises: [OSError].} =
  ## Cleanup all resources allocated by the threadpool.
  ## Expects workers to have been signaled to terminate
  ## (see `shutdown`): joins them first, then frees shared memory.
  preCondition: workerContext.currentTask.isRootTask()
  # Every worker thread must have exited before its data is freed.
  for i in 1 ..< tp.numThreads:
    joinThread(tp.workers[i])
  # Release per-worker arrays, then the pool-wide primitives,
  # and finally the pool object itself (allocated in `new`).
  tp.workerSignals.freeHeapAligned()
  tp.workers.freeHeapAligned()
  tp.workerQueues.freeHeapAligned()
  tp.globalBackoff.`=destroy`()
  tp.barrier.delete()
  tp.freeHeapAligned()
proc shutdown*(tp: var Threadpool) {.raises:[Exception].} =
  ## Wait until all tasks are processed and then shutdown the threadpool.
  ## MUST be called from the root scope that created the threadpool.
  preCondition: workerContext.currentTask.isRootTask()
  # Drain all pending work before asking workers to stop.
  tp.syncAll()
  # Signal termination to all threads
  for i in 0 ..< tp.numThreads:
    tp.workerSignals[i].terminate.store(true, moRelaxed)
  # Wake parked workers so they can observe the terminate flag.
  tp.globalBackoff.wakeAll()
  # 1 matching barrier in workerEntryFn
  discard tp.barrier.wait()
  teardownWorker()
  tp.cleanup()
  # Delete dummy task
  workerContext.currentTask = nil
{.pop.} # raises:[]
# Task parallel API
# ---------------------------------------------
macro spawn*(tp: Threadpool, fnCall: typed): untyped =
  ## Schedule `fnCall` for asynchronous execution,
  ## potentially on another thread of the pool.
  ##
  ## When the called function returns a value, the spawn
  ## expression produces a Flowvar wrapping that future result:
  ## - `sync` blocks the current thread until the result is
  ##   available and extracts it,
  ## - `isReady` checks availability without blocking, so a
  ##   subsequent `sync` returns immediately.
  ##
  ## Tasks are processed approximately in Last-In-First-Out (LIFO) order.
  let ctxSym = bindSym"workerContext"
  let scheduleSym = bindSym"schedule"
  spawnImpl(tp, fnCall, ctxSym, scheduleSym)

View File

@ -68,7 +68,7 @@ func rotl(x: uint64, k: static int): uint64 {.inline.} =
template `^=`(x: var uint64, y: uint64) =
x = x xor y
func next(rng: var RngState): uint64 =
func next*(rng: var RngState): uint64 =
## Compute a random uint64 from the input state
## using xoshiro512** algorithm by Vigna et al
## State is updated.
@ -96,6 +96,12 @@ func random_unsafe*(rng: var RngState, maxExclusive: uint32): uint32 =
## Uses an unbiased generation method
## See Lemire's algorithm modified by Melissa O'Neill
## https://www.pcg-random.org/posts/bounded-rands.html
## Original:
## biased: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
## unbiased: https://arxiv.org/pdf/1805.10941.pdf
## Also:
## Barrett Reduction: https://en.wikipedia.org/wiki/Barrett_reduction
## http://www.acsel-lab.com/arithmetic/arith18/papers/ARITH18_Hasenplaugh.pdf
let max = maxExclusive
var x = uint32 rng.next()
var m = x.uint64 * max.uint64