mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-02 13:13:07 +00:00
Add a threadpool (#213)
* Implement a threadpool * int and SomeUnsignedInt ... * Type conversion for windows SynchronizationBarrier * Use the latest MacOS 11, Big Sur API (jan 2021) for MacOS futexes, Github action offers MacOS 12 and can test them * bench need posix timer not available on windows and darwin futex * Windows: nimble exec empty line is an error, Mac: use defined(osx) instead of defined(macos) * file rename * okay, that's the last one hopefully * deactivate stealHalf for now
This commit is contained in:
parent
188f3e710c
commit
2931913b67
@ -43,7 +43,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_primitives.nim", false),
|
||||
("tests/math/t_primitives_extended_precision.nim", false),
|
||||
|
||||
|
||||
# Big ints
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_io_bigints.nim", false),
|
||||
@ -53,7 +53,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
("tests/math/t_bigints_mod_vs_gmp.nim", true),
|
||||
("tests/math/t_bigints_mul_vs_gmp.nim", true),
|
||||
("tests/math/t_bigints_mul_high_words_vs_gmp.nim", true),
|
||||
|
||||
|
||||
# Field
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_io_fields", false),
|
||||
@ -64,11 +64,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
("tests/math/t_finite_fields_powinv.nim", false),
|
||||
("tests/math/t_finite_fields_vs_gmp.nim", true),
|
||||
# ("tests/math/t_fp_cubic_root.nim", false),
|
||||
|
||||
|
||||
# Double-precision finite fields
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_finite_fields_double_precision.nim", false),
|
||||
|
||||
|
||||
# Towers of extension fields
|
||||
# ----------------------------------------------------------
|
||||
# ("tests/math/t_fp2.nim", false),
|
||||
@ -90,7 +90,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
("tests/math/t_fp6_frobenius.nim", false),
|
||||
("tests/math/t_fp12_frobenius.nim", false),
|
||||
|
||||
# Elliptic curve arithmetic
|
||||
# Elliptic curve arithmetic
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_ec_conversion.nim", false),
|
||||
|
||||
@ -111,7 +111,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
("tests/math/t_ec_twedwards_prj_add_double", false),
|
||||
("tests/math/t_ec_twedwards_prj_mul_sanity", false),
|
||||
("tests/math/t_ec_twedwards_prj_mul_distri", false),
|
||||
|
||||
|
||||
|
||||
# Elliptic curve arithmetic G2
|
||||
# ----------------------------------------------------------
|
||||
@ -172,7 +172,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
("tests/math/t_ec_sage_bls12_381.nim", false),
|
||||
("tests/math/t_ec_sage_pallas.nim", false),
|
||||
("tests/math/t_ec_sage_vesta.nim", false),
|
||||
|
||||
|
||||
# Edge cases highlighted by past bugs
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_ec_shortw_prj_edge_cases.nim", false),
|
||||
@ -233,6 +233,18 @@ const testDescNvidia: seq[string] = @[
|
||||
"tests/gpu/t_nvidia_fp.nim",
|
||||
]
|
||||
|
||||
const testDescThreadpool: seq[string] = @[
|
||||
"constantine/platforms/threadpool/examples/e01_simple_tasks.nim",
|
||||
"constantine/platforms/threadpool/examples/e02_parallel_pi.nim",
|
||||
# "constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim", # Need timing not implemented on Windows
|
||||
"constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim",
|
||||
"constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim",
|
||||
"constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim",
|
||||
# "constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim",
|
||||
"constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim",
|
||||
# "constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim", # Need timing not implemented on Windows
|
||||
]
|
||||
|
||||
const benchDesc = [
|
||||
"bench_fp",
|
||||
"bench_fp_double_precision",
|
||||
@ -301,7 +313,7 @@ proc clearParallelBuild() =
|
||||
if fileExists(buildParallel):
|
||||
rmFile(buildParallel)
|
||||
|
||||
template setupCommand(): untyped {.dirty.} =
|
||||
template setupCommand(): untyped {.dirty.} =
|
||||
var lang = "c"
|
||||
if existsEnv"TEST_LANG":
|
||||
lang = getEnv"TEST_LANG"
|
||||
@ -367,7 +379,7 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
echo "Found " & $testDesc.len & " tests to run."
|
||||
|
||||
|
||||
for td in testDesc:
|
||||
if not(td.useGMP and not requireGMP):
|
||||
var flags = ""
|
||||
@ -379,17 +391,25 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
|
||||
flags &= " -d:debugConstantine"
|
||||
if td.path notin skipSanitizers:
|
||||
flags &= sanitizers
|
||||
|
||||
|
||||
cmdFile.testBatch(flags, td.path)
|
||||
|
||||
proc addTestSetNvidia(cmdFile: var string) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
echo "Found " & $testDescNvidia.len & " tests to run."
|
||||
|
||||
|
||||
for path in testDescNvidia:
|
||||
cmdFile.testBatch(flags = "", path)
|
||||
|
||||
proc addTestSetThreadpool(cmdFile: var string) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
echo "Found " & $testDescThreadpool.len & " tests to run."
|
||||
|
||||
for path in testDescThreadpool:
|
||||
cmdFile.testBatch(flags = "--threads:on --linetrace:on", path)
|
||||
|
||||
proc addBenchSet(cmdFile: var string, useAsm = true) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
@ -491,13 +511,13 @@ task test_bindings, "Test C bindings":
|
||||
# Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in an POSIX compatible shell
|
||||
exec "gcc -Ibindings/generated -Lbindings/generated -o build/testsuite/t_libctt_bls12_381_dl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -lconstantine_bls12_381"
|
||||
exec "./build/testsuite/t_libctt_bls12_381_dl.exe"
|
||||
|
||||
|
||||
echo "--> Testing statically linked library"
|
||||
when not defined(windows):
|
||||
# Beware MacOS annoying linker with regards to static libraries
|
||||
# The following standard way cannot be used on MacOS
|
||||
# exec "gcc -Ibindings/generated -Lbindings/generated -o build/t_libctt_bls12_381_sl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
|
||||
|
||||
|
||||
exec "gcc -Ibindings/generated -o build/testsuite/t_libctt_bls12_381_sl tests/bindings/t_libctt_bls12_381.c bindings/generated/libconstantine_bls12_381.a -lgmp"
|
||||
exec "./build/testsuite/t_libctt_bls12_381_sl"
|
||||
else:
|
||||
@ -509,32 +529,40 @@ task test, "Run all tests":
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = true, testASM = true)
|
||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
exec cmd
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_no_asm, "Run all tests (no assembly)":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = true, testASM = false)
|
||||
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
exec cmd
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_no_gmp, "Run tests that don't require GMP":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = false, testASM = true)
|
||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
exec cmd
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = false, testASM = false)
|
||||
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
exec cmd
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_parallel, "Run all tests in parallel (via GNU parallel)":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
@ -547,6 +575,13 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
|
||||
# Threadpool tests done serially
|
||||
cmdFile = ""
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel (via GNU parallel)":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
clearParallelBuild()
|
||||
@ -558,6 +593,13 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
|
||||
# Threadpool tests done serially
|
||||
cmdFile = ""
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
clearParallelBuild()
|
||||
@ -569,6 +611,13 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
|
||||
# Threadpool tests done serially
|
||||
cmdFile = ""
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
clearParallelBuild()
|
||||
@ -580,11 +629,26 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)"
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
|
||||
# Threadpool tests done serially
|
||||
cmdFile = ""
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_threadpool, "Run all tests for the builtin threadpool":
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSetThreadpool()
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
task test_nvidia, "Run all tests for Nvidia GPUs":
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSetNvidia()
|
||||
for cmd in cmdFile.splitLines():
|
||||
exec cmd
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
|
||||
# Finite field 𝔽p
|
||||
# ------------------------------------------
|
||||
|
||||
@ -356,7 +356,7 @@ func div10*(a: var Limbs): SecretWord =
|
||||
## Divide `a` by 10 in-place and return the remainder
|
||||
result = Zero
|
||||
|
||||
const clz = WordBitWidth - 1 - log2_vartime(10)
|
||||
const clz = WordBitWidth - 1 - log2_vartime(10'u32)
|
||||
const norm10 = SecretWord(10) shl clz
|
||||
|
||||
for i in countdown(a.len-1, 0):
|
||||
|
||||
@ -61,7 +61,7 @@ func invModBitwidth(a: BaseType): BaseType =
|
||||
# which grows in O(log(log(a)))
|
||||
debug: doAssert (a and 1) == 1, "a must be odd"
|
||||
|
||||
const k = log2_vartime(a.sizeof() * 8)
|
||||
const k = log2_vartime(a.sizeof().uint32 * 8)
|
||||
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
|
||||
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
|
||||
result *= 2 - a * result # x' = x(2 - ax)
|
||||
|
||||
@ -282,7 +282,7 @@ func invModBitwidth*[T: SomeUnsignedInt](a: T): T =
|
||||
# which grows in O(log(log(a)))
|
||||
checkOdd(a)
|
||||
|
||||
let k = log2_vartime(T.sizeof() * 8)
|
||||
let k = log2_vartime(T.sizeof().uint32 * 8)
|
||||
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
|
||||
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
|
||||
result *= 2 - a * result # x' = x(2 - ax)
|
||||
|
||||
@ -36,7 +36,7 @@ debug:
|
||||
result[0] = '0'
|
||||
result[1] = 'x'
|
||||
var a = a
|
||||
for j in countdown(result.len-1, 2):
|
||||
for j in countdown(result.len-1, 0):
|
||||
result[j] = hexChars.secretLookup(a and SecretWord 0xF)
|
||||
a = a shr 4
|
||||
|
||||
|
||||
@ -291,8 +291,8 @@ func sum_batch_vartime*[F; G: static Subgroup](
|
||||
const maxStride = maxChunkSize div sizeof(ECP_ShortW_Aff[F, G])
|
||||
|
||||
let n = min(maxStride, points.len)
|
||||
let accumulators = alloca(ECP_ShortW_Aff[F, G], n)
|
||||
let lambdas = alloca(tuple[num, den: F], n)
|
||||
let accumulators = allocStackArray(ECP_ShortW_Aff[F, G], n)
|
||||
let lambdas = allocStackArray(tuple[num, den: F], n)
|
||||
|
||||
for i in countup(0, points.len-1, maxStride):
|
||||
let n = min(maxStride, points.len - i)
|
||||
|
||||
@ -6,9 +6,23 @@ This folder holds:
|
||||
to have the compiler enforce proper usage
|
||||
- extended precision multiplication and division primitives
|
||||
- assembly or builtin int128 primitives
|
||||
- intrinsics
|
||||
- an assembler
|
||||
- Runtime CPU features detection
|
||||
- SIMD intrinsics
|
||||
- assemblers for x86 and LLVM IR
|
||||
- a code generator for Nvidia GPU from LLVM IR
|
||||
- runtime CPU features detection
|
||||
- a threadpool
|
||||
|
||||
## Runtimes
|
||||
|
||||
Constantine strongly avoid any runtime so that it can be used even where garbage collection, dynamic memory allocation
|
||||
are not allowed. That also avoids secrets remaining in heap memory.
|
||||
|
||||
At runtime, Constantine may:
|
||||
- detect the CPU features at the start of the application (in Nim) or after calling `ctt_myprotocol_init_NimMain()` for the C (or any other language) bindings.
|
||||
|
||||
And offers the following opt-in features with use dynamic allocation:
|
||||
- a threadpool, only for explicitly tagged parallel primitives.
|
||||
- use LLVM and Cuda, and configure code to run computation on GPUs.
|
||||
|
||||
## Security
|
||||
|
||||
|
||||
@ -6,6 +6,9 @@
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
{.push raises:[].} # No exceptions for crypto
|
||||
{.push checks:off.} # No int->size_t exceptions
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Allocators
|
||||
@ -20,13 +23,107 @@
|
||||
#
|
||||
# stack allocation is strongly preferred where necessary.
|
||||
|
||||
# Bindings
|
||||
# ----------------------------------------------------------------------------------
|
||||
# We wrap them with int instead of size_t / csize_t
|
||||
|
||||
when defined(windows):
|
||||
proc alloca(size: int): pointer {.header: "<malloc.h>".}
|
||||
else:
|
||||
proc alloca(size: int): pointer {.header: "<alloca.h>".}
|
||||
|
||||
template alloca*(T: typedesc): ptr T =
|
||||
proc malloc(size: int): pointer {.sideeffect, header: "<stdlib.h>".}
|
||||
proc free(p: pointer) {.sideeffect, header: "<stdlib.h>".}
|
||||
|
||||
when defined(windows):
|
||||
proc aligned_alloc_windows(size, alignment: int): pointer {.sideeffect,importc:"_aligned_malloc", header:"<malloc.h>".}
|
||||
# Beware of the arg order!
|
||||
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
|
||||
aligned_alloc_windows(size, alignment)
|
||||
proc aligned_free(p: pointer){.sideeffect,importc:"_aligned_free", header:"<malloc.h>".}
|
||||
elif defined(osx):
|
||||
proc posix_memalign(mem: var pointer, alignment, size: int){.sideeffect,importc, header:"<stdlib.h>".}
|
||||
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
|
||||
posix_memalign(result, alignment, size)
|
||||
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
|
||||
else:
|
||||
proc aligned_alloc(alignment, size: int): pointer {.sideeffect,importc, header:"<stdlib.h>".}
|
||||
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
|
||||
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
proc isPowerOfTwo(n: int): bool {.inline.} =
|
||||
(n and (n - 1)) == 0 and (n != 0)
|
||||
|
||||
func roundNextMultipleOf(x: int, n: static int): int {.inline.} =
|
||||
## Round the input to the next multiple of "n"
|
||||
when n.isPowerOfTwo():
|
||||
# n is a power of 2. (If compiler cannot prove that x>0 it does not make the optim)
|
||||
result = (x + n - 1) and not(n - 1)
|
||||
else:
|
||||
result = ((x + n - 1) div n) * n
|
||||
|
||||
# Stack allocation
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
template allocStack*(T: typedesc): ptr T =
|
||||
cast[ptr T](alloca(sizeof(T)))
|
||||
|
||||
template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
|
||||
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
|
||||
template allocStackUnchecked*(T: typedesc, size: int): ptr T =
|
||||
## Stack allocation for types containing a variable-sized UncheckedArray field
|
||||
cast[ptr T](alloca(size))
|
||||
|
||||
template allocStackArray*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
|
||||
cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
|
||||
|
||||
# Heap allocation
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
proc allocHeap*(T: typedesc): ptr T {.inline.} =
|
||||
cast[type result](malloc(sizeof(T)))
|
||||
|
||||
proc allocHeapUnchecked*(T: typedesc, size: int): ptr T {.inline.} =
|
||||
## Heap allocation for types containing a variable-sized UncheckedArray field
|
||||
cast[type result](malloc(size))
|
||||
|
||||
proc allocHeapArray*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
|
||||
cast[type result](malloc(len*sizeof(T)))
|
||||
|
||||
proc freeHeap*(p: pointer) {.inline.} =
|
||||
free(p)
|
||||
|
||||
proc allocHeapAligned*(T: typedesc, alignment: static Natural): ptr T {.inline.} =
|
||||
# aligned_alloc requires allocating in multiple of the alignment.
|
||||
const
|
||||
size = sizeof(T)
|
||||
requiredMem = size.roundNextMultipleOf(alignment)
|
||||
|
||||
cast[ptr T](aligned_alloc(alignment, requiredMem))
|
||||
|
||||
proc allocHeapUncheckedAligned*(T: typedesc, size: int, alignment: static Natural): ptr T {.inline.} =
|
||||
## Aligned heap allocation for types containing a variable-sized UncheckedArray field
|
||||
## or an importc type with missing size information
|
||||
# aligned_alloc requires allocating in multiple of the alignment.
|
||||
let requiredMem = size.roundNextMultipleOf(alignment)
|
||||
|
||||
cast[ptr T](aligned_alloc(alignment, requiredMem))
|
||||
|
||||
proc allocHeapArrayAligned*(T: typedesc, len: int, alignment: static Natural): ptr UncheckedArray[T] {.inline.} =
|
||||
# aligned_alloc requires allocating in multiple of the alignment.
|
||||
let
|
||||
size = sizeof(T) * len
|
||||
requiredMem = size.roundNextMultipleOf(alignment)
|
||||
|
||||
cast[ptr UncheckedArray[T]](aligned_alloc(alignment, requiredMem))
|
||||
|
||||
proc allocHeapAlignedPtr*(T: typedesc[ptr], alignment: static Natural): T {.inline.} =
|
||||
allocHeapAligned(typeof(default(T)[]), alignment)
|
||||
|
||||
proc allocHeapUncheckedAlignedPtr*(T: typedesc[ptr], size: int, alignment: static Natural): T {.inline.} =
|
||||
## Aligned heap allocation for types containing a variable-sized UncheckedArray field
|
||||
## or an importc type with missing size information
|
||||
allocHeapUncheckedAligned(typeof(default(T)[]), size, alignment)
|
||||
|
||||
proc freeHeapAligned*(p: pointer) {.inline.} =
|
||||
aligned_free(p)
|
||||
@ -32,7 +32,10 @@ import ./compilers/bitops
|
||||
#
|
||||
# See: https://www.chessprogramming.org/BitScan
|
||||
# https://www.chessprogramming.org/General_Setwise_Operations
|
||||
# https://www.chessprogramming.org/De_Bruijn_Sequence_Generator
|
||||
# and https://graphics.stanford.edu/%7Eseander/bithacks.html
|
||||
# and Hacker's Delight 2nd Edition, Henry S Warren, Jr.
|
||||
# and https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator
|
||||
# for compendiums of bit manipulation
|
||||
|
||||
func clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} =
|
||||
@ -43,77 +46,106 @@ func clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} =
|
||||
## Returns ``v``, with the bit at position ``bit`` set to 0
|
||||
v.clearMask(1.T shl bit)
|
||||
|
||||
func log2impl_vartime(x: uint32): uint32 =
|
||||
func log2_impl_vartime(n: uint32): uint32 =
|
||||
## Find the log base 2 of a 32-bit or less integer.
|
||||
## using De Bruijn multiplication
|
||||
## Works at compile-time.
|
||||
## ⚠️ not constant-time, table accesses are not uniform.
|
||||
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
|
||||
const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
|
||||
22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
|
||||
var v = x
|
||||
v = v or v shr 1 # first round down to one less than a power of 2
|
||||
v = v or v shr 2
|
||||
v = v or v shr 4
|
||||
v = v or v shr 8
|
||||
v = v or v shr 16
|
||||
lookup[(v * 0x07C4ACDD'u32) shr 27]
|
||||
const lookup: array[32, uint8] = [
|
||||
uint8 0, 9, 1, 10, 13, 21, 2, 29,
|
||||
11, 14, 16, 18, 22, 25, 3, 30,
|
||||
8, 12, 20, 28, 15, 17, 24, 7,
|
||||
19, 27, 23, 6, 26, 5, 4, 31]
|
||||
|
||||
# Isolate MSB
|
||||
var n = n
|
||||
n = n or n shr 1 # first round down to one less than a power of 2
|
||||
n = n or n shr 2
|
||||
n = n or n shr 4
|
||||
n = n or n shr 8
|
||||
n = n or n shr 16
|
||||
uint32 lookup[(n * 0x07C4ACDD'u32) shr 27]
|
||||
|
||||
func log2impl_vartime(x: uint64): uint64 {.inline.} =
|
||||
func log2_impl_vartime(n: uint64): uint64 {.inline.} =
|
||||
## Find the log base 2 of a 32-bit or less integer.
|
||||
## using De Bruijn multiplication
|
||||
## Works at compile-time.
|
||||
## ⚠️ not constant-time, table accesses are not uniform.
|
||||
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
|
||||
const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
|
||||
33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
|
||||
57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
|
||||
35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
|
||||
var v = x
|
||||
v = v or v shr 1 # first round down to one less than a power of 2
|
||||
v = v or v shr 2
|
||||
v = v or v shr 4
|
||||
v = v or v shr 8
|
||||
v = v or v shr 16
|
||||
v = v or v shr 32
|
||||
lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
|
||||
# https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers
|
||||
const lookup: array[64, uint8] = [
|
||||
uint8 0, 58, 1, 59, 47, 53, 2, 60,
|
||||
39, 48, 27, 54, 33, 42, 3, 61,
|
||||
51, 37, 40, 49, 18, 28, 20, 55,
|
||||
30, 34, 11, 43, 14, 22, 4, 62,
|
||||
57, 46, 52, 38, 26, 32, 41, 50,
|
||||
36, 17, 19, 29, 10, 13, 21, 56,
|
||||
45, 25, 31, 35, 16, 9, 12, 44,
|
||||
24, 15, 8, 23, 7, 6, 5, 63]
|
||||
|
||||
# Isolate MSB
|
||||
var n = n
|
||||
n = n or n shr 1 # first round down to one less than a power of 2
|
||||
n = n or n shr 2
|
||||
n = n or n shr 4
|
||||
n = n or n shr 8
|
||||
n = n or n shr 16
|
||||
n = n or n shr 32
|
||||
uint64 lookup[(n * 0x03F6EAF2CD271461'u64) shr 58]
|
||||
|
||||
func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} =
|
||||
## Find the log base 2 of an integer
|
||||
##
|
||||
## ⚠ With GCC and Clang compilers on x86, if n is zero, result is undefined.
|
||||
when nimvm:
|
||||
when sizeof(T) == sizeof(uint64):
|
||||
T(log2impl_vartime(uint64(n)))
|
||||
when sizeof(T) == 8:
|
||||
T(log2_impl_vartime(uint64(n)))
|
||||
else:
|
||||
static: doAssert sizeof(T) <= sizeof(uint32)
|
||||
T(log2impl_vartime(uint32(n)))
|
||||
T(log2_impl_vartime(uint32(n)))
|
||||
else:
|
||||
when sizeof(T) == sizeof(uint64):
|
||||
T(log2_c_compiler_vartime(uint64(n)))
|
||||
log2_c_compiler_vartime(n)
|
||||
|
||||
func ctz_impl_vartime(n: uint32): uint32 =
|
||||
## Find the number of trailing zero bits
|
||||
## Requires n != 0
|
||||
# https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator
|
||||
const lookup: array[32, uint8] = [
|
||||
uint8 0, 1, 16, 2, 29, 17, 3, 22,
|
||||
30, 20, 18, 11, 13, 4, 7, 23,
|
||||
31, 15, 28, 21, 19, 10, 12, 6,
|
||||
14, 27, 9, 5, 26, 8, 25, 24]
|
||||
|
||||
let isolateLSB = n xor (n-1)
|
||||
uint32 lookup[(isolateLSB * 0x6EB14F9'u32) shr 27]
|
||||
|
||||
func ctz_impl_vartime(n: uint64): uint64 =
|
||||
## Find the number of trailing zero bits
|
||||
## Requires n != 0
|
||||
# https://www.chessprogramming.org/BitScan#Bitscan_forward
|
||||
const lookup: array[64, uint8] = [
|
||||
uint8 0, 47, 1, 56, 48, 27, 2, 60,
|
||||
57, 49, 41, 37, 28, 16, 3, 61,
|
||||
54, 58, 35, 52, 50, 42, 21, 44,
|
||||
38, 32, 29, 23, 17, 11, 4, 62,
|
||||
46, 55, 26, 59, 40, 36, 15, 53,
|
||||
34, 51, 20, 43, 31, 22, 10, 45,
|
||||
25, 39, 14, 33, 19, 30, 9, 24,
|
||||
13, 18, 8, 12, 7, 6, 5, 63]
|
||||
|
||||
let isolateLSB = n xor (n-1)
|
||||
uint64 lookup[(isolateLSB * 0x03f79d71b4cb0a89'u64) shr 58]
|
||||
|
||||
func countTrailingZeroBits*[T: SomeUnsignedInt](n: T): T {.inline.} =
|
||||
## Count the number of trailing zero bits of an integer
|
||||
when nimvm:
|
||||
if n == 0:
|
||||
T(sizeof(n) * 8)
|
||||
else:
|
||||
static: doAssert sizeof(T) <= sizeof(uint32)
|
||||
T(log2_c_compiler_vartime(uint32(n)))
|
||||
|
||||
func hammingWeight*(x: uint32): uint {.inline.} =
|
||||
## Counts the set bits in integer.
|
||||
# https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
||||
var v = x
|
||||
v = v - ((v shr 1) and 0x55555555)
|
||||
v = (v and 0x33333333) + ((v shr 2) and 0x33333333)
|
||||
uint(((v + (v shr 4) and 0xF0F0F0F) * 0x1010101) shr 24)
|
||||
|
||||
func hammingWeight*(x: uint64): uint {.inline.} =
|
||||
## Counts the set bits in integer.
|
||||
# https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
||||
var v = x
|
||||
v = v - ((v shr 1'u64) and 0x5555555555555555'u64)
|
||||
v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64)
|
||||
v = (v + (v shr 4'u64) and 0x0F0F0F0F0F0F0F0F'u64)
|
||||
uint((v * 0x0101010101010101'u64) shr 56'u64)
|
||||
|
||||
func countLeadingZeros_vartime*[T: SomeUnsignedInt](x: T): T {.inline.} =
|
||||
(8*sizeof(T)) - 1 - log2_vartime(x)
|
||||
when sizeof(T) == 8:
|
||||
T(ctz_impl_vartime(uint64(n)))
|
||||
else:
|
||||
T(ctz_impl_vartime(uint32(n)))
|
||||
else:
|
||||
ctz_c_compiler_vartime(n)
|
||||
|
||||
func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
|
||||
## Returns true if n is a power of 2
|
||||
@ -121,7 +153,7 @@ func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
|
||||
## for compile-time or explicit vartime proc only.
|
||||
(n and (n - 1)) == 0
|
||||
|
||||
func nextPowerOf2_vartime*(n: uint64): uint64 {.inline.} =
|
||||
func nextPowerOfTwo_vartime*(n: uint32): uint32 {.inline.} =
|
||||
## Returns x if x is a power of 2
|
||||
## or the next biggest power of 2
|
||||
1'u64 shl (log2_vartime(n-1) + 1)
|
||||
1'u32 shl (log2_vartime(n-1) + 1)
|
||||
@ -15,54 +15,101 @@ when GCC_Compatible:
|
||||
func builtin_clzll(n: uint64): cint {.importc: "__builtin_clzll", nodecl.}
|
||||
## Count the number of leading zeros
|
||||
## undefined if n is zero
|
||||
func builtin_ctz(n: uint32): cint {.importc: "__builtin_ctz", nodecl.}
|
||||
## Count the number of trailing zeros
|
||||
## undefined if n is zero
|
||||
func builtin_ctzll(n: uint64): cint {.importc: "__builtin_ctzll", nodecl.}
|
||||
## Count the number of trailing zeros
|
||||
## undefined if n is zero
|
||||
|
||||
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
|
||||
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
|
||||
## Compute the log2 of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
cast[int](31 - cast[cuint](builtin_clz(n.uint32)))
|
||||
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
|
||||
## Compute the log2 of n using compiler builtin
|
||||
if n == 0:
|
||||
0
|
||||
else:
|
||||
when sizeof(n) == 8:
|
||||
cint(64) - builtin_clzll(n)
|
||||
else:
|
||||
cint(31) - builtin_clz(n.uint32)
|
||||
|
||||
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
|
||||
## Compute the number of trailing zeros
|
||||
## in the bit representation of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
cast[int](63 - cast[cuint](builtin_clzll(n)))
|
||||
if n == 0:
|
||||
sizeof(n) * 8
|
||||
else:
|
||||
when sizeof(n) == 8:
|
||||
builtin_ctzll(n)
|
||||
else:
|
||||
builtin_ctz(n.uint32)
|
||||
|
||||
elif defined(icc):
|
||||
func bitScanReverse(r: var uint32, n: uint32): uint8 {.importc: "_BitScanReverse", header: "<immintrin.h>".}
|
||||
## Returns 0 if n is zero and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from MSB to LSB
|
||||
func bitScanReverse64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanReverse64", header: "<immintrin.h>".}
|
||||
## Returns 0 if n is zero and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from MSB to LSB
|
||||
func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "<immintrin.h>".}
|
||||
## Returns 0 if n is zero and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from LSB to MSB
|
||||
func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "<immintrin.h>".}
|
||||
## Returns 0 if n is zero and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from LSB to MSB
|
||||
|
||||
template bitscan(fnc: untyped; v: untyped): int {.inline.} =
|
||||
template bitscan(fnc: untyped; v: untyped, default: static int): int {.inline.} =
|
||||
var index: uint32
|
||||
if fnc(index.addr, v) == 0:
|
||||
return 0
|
||||
return default
|
||||
return index.int
|
||||
|
||||
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
|
||||
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
|
||||
## Compute the log2 of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
bitscan(bitScanReverse, c.uint32)
|
||||
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
|
||||
## Compute the log2 of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
bitscan(bitScanReverse64, n)
|
||||
when sizeof(n) == 8:
|
||||
bitscan(bitScanReverse64, n, default = 0)
|
||||
else:
|
||||
bitscan(bitScanReverse, c.uint32, default = 0)
|
||||
|
||||
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
|
||||
## Compute the number of trailing zero bits of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
when sizeof(n) == 8:
|
||||
bitscan(bitScanForward64, n, default = 0)
|
||||
else:
|
||||
bitscan(bitScanForward, c.uint32, default = 0)
|
||||
|
||||
elif defined(vcc):
|
||||
func bitScanReverse(p: ptr uint32, b: uint32): uint8 {.importc: "_BitScanReverse", header: "<intrin.h>".}
|
||||
## Returns 0 if n s no set bit and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from MSB to LSB
|
||||
func bitScanReverse64(p: ptr uint32, b: uint64): uint8 {.importc: "_BitScanReverse64", header: "<intrin.h>".}
|
||||
## Returns 0 if n s no set bit and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from MSB to LSB
|
||||
func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "<intrin.h>".}
|
||||
## Returns 0 if n is zero and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from LSB to MSB
|
||||
func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "<intrin.h>".}
|
||||
## Returns 0 if n is zero and non-zero otherwise
|
||||
## Returns the position of the first set bit in `r`
|
||||
## from LSB to MSB
|
||||
|
||||
template bitscan(fnc: untyped; v: untyped): int =
|
||||
var index: uint32
|
||||
@ -70,18 +117,25 @@ elif defined(vcc):
|
||||
return 0
|
||||
return index.int
|
||||
|
||||
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
|
||||
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
|
||||
## Compute the log2 of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
bitscan(bitScanReverse, c.uint32)
|
||||
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
|
||||
## Compute the log2 of n using compiler builtin
|
||||
when sizeof(n) == 8:
|
||||
bitscan(bitScanReverse64, n, default = 0)
|
||||
else:
|
||||
bitscan(bitScanReverse, c.uint32, default = 0)
|
||||
|
||||
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
|
||||
## Compute the number of trailing zero bits of n using compiler builtin
|
||||
## ⚠ Depending on the compiler:
|
||||
## - It is undefined if n == 0
|
||||
## - It is not constant-time as a zero input is checked
|
||||
bitscan(bitScanReverse64, n)
|
||||
when sizeof(n) == 8:
|
||||
bitscan(bitScanForward64, n, default = sizeof(n) * 8)
|
||||
else:
|
||||
bitscan(bitScanForward, c.uint32, default = sizeof(n) * 8)
|
||||
|
||||
else:
|
||||
{. error: "Unsupported compiler".}
|
||||
38
constantine/platforms/threadpool/README.md
Normal file
38
constantine/platforms/threadpool/README.md
Normal file
@ -0,0 +1,38 @@
|
||||
# Constantine Threadpool
|
||||
|
||||
## API
|
||||
|
||||
The API spec follows https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api
|
||||
|
||||
## Overview
|
||||
|
||||
This implements a lightweight, energy-efficient, easily auditable multithreaded threadpool.
|
||||
|
||||
This threadpool will desirable properties are:
|
||||
|
||||
- Ease of auditing and maintenance.
|
||||
- Resource-efficient. Threads spindown to save power, low memory use.
|
||||
- Decent performance and scalability. The CPU should spent its time processing user workloads
|
||||
and not dealing with threadpool contention, latencies and overheads.
|
||||
|
||||
Compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs:
|
||||
- Constantine's threadpool only provide spawn/sync (task parallelism).\
|
||||
There is no (extremely) optimized parallel for (data parallelism)\
|
||||
or precise in/out dependencies (events / dataflow parallelism).
|
||||
- Constantine's threadpool has been significantly optimized to provide
|
||||
overhead lower than Weave's default (and as low as Weave "lazy" + "alloca" allocation scheme)
|
||||
- Constantine's threadpool provides the same adaptative scheduling strategy as Weave
|
||||
with additional enhancement (leapfrogging)
|
||||
|
||||
Compared to [nim-taskpools](https://github.com/status-im), here are the tradeoffs:
|
||||
- Constantine does not use std/tasks:
|
||||
- No external dependencies at runtime (apart from compilers, OS and drivers)
|
||||
- We can replace Task with an intrusive linked list
|
||||
- Furthermore we can embed tasks in their future
|
||||
- Hence allocation/scheduler overhead is 3x less than nim-taskpools as we fuse the following allocations:
|
||||
- Task
|
||||
- The linked list of tasks
|
||||
- The future (Flowvar) result channel
|
||||
- Contention improvement, Constantine is entirely lock-free while Nim-taskpools need a lock+condition variable for putting threads to sleep
|
||||
- Powersaving improvement, threads sleep when awaiting for a task and there is no work available.
|
||||
- Scheduling improvement, Constantine's threadpool incorporate Weave's adaptative scheduling policy with additional enhancement (leapfrogging)
|
||||
@ -0,0 +1,11 @@
|
||||
# BPC (Bouncing Producer-Consumer)
|
||||
|
||||
From [tasking-2.0](https://github.com/aprell/tasking-2.0) description
|
||||
|
||||
> **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far
|
||||
> as I know, first described by [Dinan et al][1]. There are two types of
|
||||
> tasks, producer and consumer tasks. Each producer task creates another
|
||||
> producer task followed by *n* consumer tasks, until a certain depth *d* is
|
||||
> reached. Consumer tasks run for *t* microseconds. The smaller the values of
|
||||
> *n* and *t*, the harder it becomes to exploit the available parallelism. A
|
||||
> solid contender for the most antagonistic microbenchmark.
|
||||
@ -0,0 +1,157 @@
|
||||
import
|
||||
# STD lib
|
||||
system/ansi_c, std/[os, strutils, cpuinfo, strformat, math],
|
||||
# Library
|
||||
../../threadpool,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
var
|
||||
Depth: int32 # For example 10000
|
||||
NumTasksPerDepth: int32 # For example 9
|
||||
# The total number of tasks in the BPC benchmark is
|
||||
# (NumTasksPerDepth + 1) * Depth
|
||||
NumTasksTotal: int32
|
||||
TaskGranularity: int32 # in microseconds
|
||||
PollInterval: float64 # in microseconds
|
||||
|
||||
tp: Threadpool
|
||||
|
||||
var global_poll_elapsed {.threadvar.}: float64
|
||||
|
||||
template dummy_cpt(): untyped =
|
||||
# Dummy computation
|
||||
# Calculate fib(30) iteratively
|
||||
var
|
||||
fib = 0
|
||||
f2 = 0
|
||||
f1 = 1
|
||||
for i in 2 .. 30:
|
||||
fib = f1 + f2
|
||||
f2 = f1
|
||||
f1 = fib
|
||||
|
||||
proc bpc_consume(usec: int32) =
|
||||
|
||||
var pollElapsed = 0'f64
|
||||
|
||||
let start = wtime_usec()
|
||||
let stop = usec.float64
|
||||
global_poll_elapsed = PollInterval
|
||||
|
||||
while true:
|
||||
var elapsed = wtime_usec() - start
|
||||
elapsed -= pollElapsed
|
||||
if elapsed >= stop:
|
||||
break
|
||||
|
||||
dummy_cpt()
|
||||
|
||||
# if elapsed >= global_poll_elapsed:
|
||||
# let pollStart = wtime_usec()
|
||||
# loadBalance(Weave)
|
||||
# pollElapsed += wtime_usec() - pollStart
|
||||
# global_poll_elapsed += PollInterval
|
||||
|
||||
proc bpc_consume_nopoll(usec: int32) =
|
||||
|
||||
let start = wtime_usec()
|
||||
let stop = usec.float64
|
||||
|
||||
while true:
|
||||
var elapsed = wtime_usec() - start
|
||||
if elapsed >= stop:
|
||||
break
|
||||
|
||||
dummy_cpt()
|
||||
|
||||
proc bpc_produce(n, d: int32) {.gcsafe.} =
|
||||
if d > 0:
|
||||
# Create producer task
|
||||
tp.spawn bpc_produce(n, d-1)
|
||||
else:
|
||||
return
|
||||
|
||||
# Followed by n consumer tasks
|
||||
for i in 0 ..< n:
|
||||
tp.spawn bpc_consume(TaskGranularity)
|
||||
|
||||
proc main() =
|
||||
Depth = 10000
|
||||
NumTasksPerDepth = 999
|
||||
TaskGranularity = 1
|
||||
|
||||
if paramCount() == 0:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <depth: {Depth}> " &
|
||||
&"<# of tasks per depth: {NumTasksPerDepth}> " &
|
||||
&"[task granularity (us): {TaskGranularity}] " &
|
||||
&"[polling interval (us): task granularity]"
|
||||
echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
|
||||
if paramCount() >= 1:
|
||||
Depth = paramStr(1).parseInt.int32
|
||||
if paramCount() >= 2:
|
||||
NumTasksPerDepth = paramStr(2). parseInt.int32
|
||||
if paramCount() >= 3:
|
||||
TaskGranularity = paramStr(3). parseInt.int32
|
||||
if paramCount() == 4:
|
||||
PollInterval = paramStr(4).parseInt.float64
|
||||
else:
|
||||
PollInterval = TaskGranularity.float64
|
||||
if paramCount() > 4:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <depth: {Depth}> " &
|
||||
&"<# of tasks per depth: {NumTasksPerDepth}> " &
|
||||
&"[task granularity (us): {TaskGranularity}] " &
|
||||
&"[polling interval (us): task granularity]"
|
||||
quit 1
|
||||
|
||||
NumTasksTotal = (NumTasksPerDepth + 1) * Depth
|
||||
|
||||
var nthreads: int
|
||||
if existsEnv"CTT_NUM_THREADS":
|
||||
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
|
||||
else:
|
||||
nthreads = countProcessors()
|
||||
|
||||
tp = Threadpool.new(numThreads = nthreads)
|
||||
|
||||
# measure overhead during tasking
|
||||
var ru: Rusage
|
||||
getrusage(RusageSelf, ru)
|
||||
var
|
||||
rss = ru.ru_maxrss
|
||||
flt = ru.ru_minflt
|
||||
|
||||
let start = wtime_msec()
|
||||
|
||||
bpc_produce(NumTasksPerDepth, Depth)
|
||||
tp.syncAll()
|
||||
|
||||
let stop = wtime_msec()
|
||||
|
||||
getrusage(RusageSelf, ru)
|
||||
rss = ru.ru_maxrss - rss
|
||||
flt = ru.ru_minflt - flt
|
||||
|
||||
tp.shutdown()
|
||||
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "Scheduler: Constantine's Threadpool"
|
||||
echo "Benchmark: BPC (Bouncing Producer-Consumer)"
|
||||
echo "Threads: ", nthreads
|
||||
echo "Time(ms) ", round(stop - start, 3)
|
||||
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||
echo "Runtime RSS (KB): ", rss
|
||||
echo "# of page faults: ", flt
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "# of tasks: ", NumTasksTotal
|
||||
echo "# of tasks/depth: ", NumTasksPerDepth
|
||||
echo "Depth: ", Depth
|
||||
echo "Task granularity (us): ", TaskGranularity
|
||||
echo "Polling / manual load balancing interval (us): ", PollInterval
|
||||
echo "--------------------------------------------------------------------------"
|
||||
|
||||
quit 0
|
||||
|
||||
main()
|
||||
@ -0,0 +1,86 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
|
||||
# Library
|
||||
../../threadpool
|
||||
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime
|
||||
|
||||
var tp: Threadpool
|
||||
|
||||
proc dfs(depth, breadth: int): uint32 {.gcsafe.} =
|
||||
if depth == 0:
|
||||
return 1
|
||||
|
||||
# We could use alloca to avoid heap allocation here
|
||||
var sums = newSeq[Flowvar[uint32]](breadth)
|
||||
|
||||
for i in 0 ..< breadth:
|
||||
sums[i] = tp.spawn dfs(depth - 1, breadth)
|
||||
|
||||
for i in 0 ..< breadth:
|
||||
result += sync(sums[i])
|
||||
|
||||
proc test(depth, breadth: int): uint32 =
|
||||
result = sync tp.spawn dfs(depth, breadth)
|
||||
|
||||
proc main() =
|
||||
|
||||
var
|
||||
depth = 8
|
||||
breadth = 8
|
||||
answer: uint32
|
||||
nthreads: int
|
||||
|
||||
if existsEnv"CTT_NUM_THREADS":
|
||||
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
|
||||
else:
|
||||
nthreads = countProcessors()
|
||||
|
||||
if paramCount() == 0:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
|
||||
echo &"Running with default config depth = {depth} and breadth = {breadth}"
|
||||
|
||||
if paramCount() >= 1:
|
||||
depth = paramStr(1).parseInt()
|
||||
if paramCount() == 2:
|
||||
breadth = paramStr(2).parseInt()
|
||||
if paramCount() > 2:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
|
||||
echo &"Up to 2 parameters are valid. Received {paramCount()}"
|
||||
quit 1
|
||||
|
||||
# Staccato benches runtime init and exit as well
|
||||
when not defined(windows):
|
||||
let start = wtime_usec()
|
||||
|
||||
tp = Threadpool.new()
|
||||
answer = test(depth, breadth)
|
||||
tp.shutdown()
|
||||
|
||||
when not defined(windows):
|
||||
let stop = wtime_usec()
|
||||
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "Scheduler: Constantine's Threadpool"
|
||||
echo "Benchmark: dfs"
|
||||
echo "Threads: ", nthreads
|
||||
when not defined(windows):
|
||||
echo "Time(us) ", stop - start
|
||||
echo "Output: ", answer
|
||||
echo "--------------------------------------------------------------------------"
|
||||
|
||||
quit 0
|
||||
|
||||
main()
|
||||
@ -0,0 +1,77 @@
|
||||
# Fibonacci benchmarks
|
||||
|
||||
⚠️ Disclaimer:
|
||||
Please don't use parallel fibonacci in production!
|
||||
Use the fast doubling method with memoization instead.
|
||||
|
||||
Fibonacci benchmark has 3 draws:
|
||||
|
||||
1. It's very simple to implement
|
||||
2. It's unbalanced and efficiency requires distributions to avoid idle cores.
|
||||
3. It's a very effective scheduler overhead benchmark, because the basic task is very trivial and the task spawning grows at 2^n scale.
|
||||
|
||||
Want to know the difference between low and high overhead?
|
||||
|
||||
Run the following C code (taken from [Oracle OpenMP example](https://docs.oracle.com/cd/E19205-01/820-7883/girtd/index.html))
|
||||
|
||||
```C
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
int fib(int n)
|
||||
{
|
||||
int i, j;
|
||||
if (n<2)
|
||||
return n;
|
||||
else
|
||||
{
|
||||
#pragma omp task shared(i) firstprivate(n)
|
||||
{
|
||||
i=fib(n-1);
|
||||
}
|
||||
|
||||
j=fib(n-2);
|
||||
#pragma omp taskwait
|
||||
return i+j;
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int n = 40;
|
||||
|
||||
#pragma omp parallel shared(n)
|
||||
{
|
||||
#pragma omp single
|
||||
printf ("fib(%d) = %d\n", n, fib(n));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
First compile with Clang and run it
|
||||
```
|
||||
clang -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
|
||||
time a.out
|
||||
```
|
||||
It should be fairly quick
|
||||
|
||||
|
||||
Then compile with GCC and run it
|
||||
```
|
||||
gcc -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
|
||||
time a.out
|
||||
```
|
||||
|
||||
Notice how some cores get idle as time goes on?
|
||||
Don't forget to kill the benchmark, you'll be there all day.
|
||||
|
||||
What's happening?
|
||||
|
||||
GCC's OpenMP implementation uses a single queue for all tasks.
|
||||
That queue gets constantly hammered by all threads and becomes a contention point.
|
||||
Furthermore, it seems like there is no load balancing or that due to the contention/lock
|
||||
threads are descheduled.
|
||||
|
||||
However Clang implementation uses a work-stealing scheduler with one queue per thread.
|
||||
The only contention happens when a thread run out of work and has to look for more work,
|
||||
in the queue of other threads. And which thread to check is chosen at random so
|
||||
the potential contention is distributed among all threads instead of a single structure.
|
||||
@ -0,0 +1,35 @@
|
||||
import
|
||||
# STD lib
|
||||
std/[os, strutils, threadpool, strformat],
|
||||
# bench
|
||||
../wtime
|
||||
|
||||
# Using Nim's standard threadpool
|
||||
# Compile with "nim c --threads:on -d:release -d:danger --outdir:build benchmarks/fibonacci/stdnim_fib.nim"
|
||||
#
|
||||
# Note: it breaks at fib 16.
|
||||
|
||||
proc parfib(n: uint64): uint64 =
|
||||
if n < 2: # Note: be sure to compare n<2 -> return n
|
||||
return n # instead of n<=2 -> return 1
|
||||
|
||||
let x = spawn parfib(n-1)
|
||||
let y = parfib(n-2)
|
||||
|
||||
return ^x + y
|
||||
|
||||
proc main() =
|
||||
if paramCount() != 1:
|
||||
echo "Usage: fib <n-th fibonacci number requested>"
|
||||
quit 0
|
||||
|
||||
let n = paramStr(1).parseUInt.uint64
|
||||
|
||||
let start = wtime_msec()
|
||||
let f = parfib(n)
|
||||
let stop = wtime_msec()
|
||||
|
||||
echo "Result: ", f
|
||||
echo &"Elapsed wall time: {stop-start:.2} ms"
|
||||
|
||||
main()
|
||||
@ -0,0 +1,79 @@
|
||||
import
|
||||
# STD lib
|
||||
std/[os, strutils, cpuinfo, strformat, math],
|
||||
# Library
|
||||
../../threadpool
|
||||
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime, ../resources
|
||||
|
||||
var tp: Threadpool
|
||||
|
||||
proc fib(n: int): int =
|
||||
# int64 on x86-64
|
||||
if n < 2:
|
||||
return n
|
||||
|
||||
let x = tp.spawn fib(n-1)
|
||||
let y = fib(n-2)
|
||||
|
||||
result = sync(x) + y
|
||||
|
||||
proc main() =
|
||||
var n = 40
|
||||
var nthreads: int
|
||||
|
||||
if paramCount() == 0:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <n-th fibonacci number requested:{n}> "
|
||||
echo &"Running with default n = {n}"
|
||||
elif paramCount() == 1:
|
||||
n = paramStr(1).parseInt
|
||||
else:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <n-th fibonacci number requested:{n}>"
|
||||
quit 1
|
||||
|
||||
if existsEnv"CTT_NUM_THREADS":
|
||||
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
|
||||
else:
|
||||
nthreads = countProcessors()
|
||||
|
||||
tp = Threadpool.new()
|
||||
|
||||
# measure overhead during tasking
|
||||
when not defined(windows):
|
||||
var ru: Rusage
|
||||
getrusage(RusageSelf, ru)
|
||||
var
|
||||
rss = ru.ru_maxrss
|
||||
flt = ru.ru_minflt
|
||||
|
||||
let start = wtime_msec()
|
||||
let f = fib(n)
|
||||
|
||||
when not defined(windows):
|
||||
let stop = wtime_msec()
|
||||
|
||||
tp.shutdown()
|
||||
|
||||
when not defined(windows):
|
||||
getrusage(RusageSelf, ru)
|
||||
rss = ru.ru_maxrss - rss
|
||||
flt = ru.ru_minflt - flt
|
||||
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "Scheduler: Constantine's Threadpool"
|
||||
echo "Benchmark: Fibonacci"
|
||||
echo "Threads: ", nthreads
|
||||
when not defined(windows):
|
||||
echo "Time(ms) ", round(stop - start, 3)
|
||||
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||
echo "Runtime RSS (KB): ", rss
|
||||
echo "# of page faults: ", flt
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "n requested: ", n
|
||||
echo "result: ", f
|
||||
|
||||
main()
|
||||
300
constantine/platforms/threadpool/benchmarks/heat/stdnim_heat.nim
Normal file
300
constantine/platforms/threadpool/benchmarks/heat/stdnim_heat.nim
Normal file
@ -0,0 +1,300 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# From fibril
|
||||
#
|
||||
# Original license
|
||||
#
|
||||
# /*
|
||||
# * Heat diffusion (Jacobi-type iteration)
|
||||
# *
|
||||
# * Volker Strumpen, Boston August 1996
|
||||
# *
|
||||
# * Copyright (c) 1996 Massachusetts Institute of Technology
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either version 2 of the License, or
|
||||
# * (at your option) any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful,
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with this program; if not, write to the Free Software
|
||||
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
|
||||
std/threadpool,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
# This deadlocks :/
|
||||
|
||||
# Helpers
|
||||
# -------------------------------------------------------
|
||||
|
||||
# We need a thin wrapper around raw pointers for matrices,
|
||||
# we can't pass "var seq[seq[float64]]" to other threads
|
||||
# nor "var" for that matter
|
||||
type
|
||||
Matrix[T] = object
|
||||
buffer: ptr UncheckedArray[T]
|
||||
m, n: int
|
||||
|
||||
Row[T] = object
|
||||
buffer: ptr UncheckedArray[T]
|
||||
len: int
|
||||
|
||||
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
|
||||
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
|
||||
result.m = m
|
||||
result.n = n
|
||||
|
||||
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
|
||||
# row-major storage
|
||||
assert row < mat.m
|
||||
assert col < mat.n
|
||||
mat.buffer[row * mat.n + col]
|
||||
|
||||
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
|
||||
assert row < mat.m
|
||||
assert col < mat.n
|
||||
mat.buffer[row * mat.n + col] = value
|
||||
|
||||
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
|
||||
# row-major storage, there are n columns in between each rows
|
||||
assert rowIdx < mat.m
|
||||
result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
|
||||
result.len = mat.m
|
||||
|
||||
template `[]`[T](row: Row[T], idx: Natural): T =
|
||||
assert idx < row.len
|
||||
row.buffer[idx]
|
||||
|
||||
template `[]=`[T](row: Row[T], idx: Natural, value: T) =
|
||||
assert idx < row.len
|
||||
row.buffer[idx] = value
|
||||
|
||||
func delete[T](mat: sink Matrix[T]) =
|
||||
c_free(mat.buffer)
|
||||
|
||||
# And an auto converter for int32 -> float64 so we don't have to convert
|
||||
# all i, j indices manually
|
||||
|
||||
converter i32toF64(x: int32): float64 {.inline.} =
|
||||
float64(x)
|
||||
|
||||
# -------------------------------------------------------
|
||||
|
||||
template f(x, y: SomeFloat): SomeFloat =
|
||||
sin(x) * sin(y)
|
||||
|
||||
template randa[T: SomeFloat](x, t: T): T =
|
||||
T(0.0)
|
||||
|
||||
proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
|
||||
# proc instead of template to avoid Nim constant folding bug:
|
||||
# https://github.com/nim-lang/Nim/issues/12783
|
||||
exp(-2 * t) * sin(x)
|
||||
|
||||
template randc[T: SomeFloat](y, t: T): T =
|
||||
T(0.0)
|
||||
|
||||
proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
|
||||
# proc instead of template to avoid Nim constant folding bug:
|
||||
# https://github.com/nim-lang/Nim/issues/12783
|
||||
exp(-2 * t) * sin(y)
|
||||
|
||||
template solu(x, y, t: SomeFloat): SomeFloat =
|
||||
exp(-2 * t) * sin(x) * sin(y)
|
||||
|
||||
const n = 4096'i32
|
||||
|
||||
var
|
||||
nx, ny, nt: int32
|
||||
xu, xo, yu, yo, tu, to: float64
|
||||
|
||||
dx, dy, dt: float64
|
||||
dtdxsq, dtdysq: float64
|
||||
|
||||
odd: Matrix[float64]
|
||||
even: Matrix[float64]
|
||||
|
||||
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
|
||||
# TODO to allow awaiting `heat` we return a dummy bool
|
||||
# The parallel spawns are updating the same matrix cells otherwise
|
||||
if iu - il > 1:
|
||||
let im = (il + iu) div 2
|
||||
|
||||
let h = spawn heat(m, il, im)
|
||||
heat(m, im, iu)
|
||||
discard ^h
|
||||
return true
|
||||
# ------------------------
|
||||
|
||||
let i = il
|
||||
let row = m.getRow(i)
|
||||
|
||||
if i == 0:
|
||||
for j in 0 ..< ny:
|
||||
row[j] = randc(yu + j*dy, 0)
|
||||
elif i == nx - 1:
|
||||
for j in 0 ..< ny:
|
||||
row[j] = randd(yu + j*dy, 0)
|
||||
else:
|
||||
row[0] = randa(xu + i*dx, 0)
|
||||
for j in 1 ..< ny - 1:
|
||||
row[j] = f(xu + i*dx, yu + j*dy)
|
||||
row[ny - 1] = randb(xu + i*dx, 0)
|
||||
|
||||
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
|
||||
# TODO to allow awaiting `diffuse` we return a dummy bool
|
||||
# The parallel spawns are updating the same matrix cells otherwise
|
||||
if iu - il > 1:
|
||||
let im = (il + iu) div 2
|
||||
|
||||
let d = spawn diffuse(output, input, il, im, t)
|
||||
diffuse(output, input, im, iu, t)
|
||||
discard ^d
|
||||
return true
|
||||
# ------------------------
|
||||
|
||||
let i = il
|
||||
let row = output.getRow(i)
|
||||
|
||||
if i == 0:
|
||||
for j in 0 ..< ny:
|
||||
row[j] = randc(yu + j*dy, t)
|
||||
elif i == nx - 1:
|
||||
for j in 0 ..< ny:
|
||||
row[j] = randd(yu + j*dy, t)
|
||||
else:
|
||||
row[0] = randa(xu + i*dx, t)
|
||||
for j in 1 ..< ny - 1:
|
||||
row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
|
||||
dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
|
||||
dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
|
||||
row[ny - 1] = randb(xu + i*dx, t)
|
||||
|
||||
proc initTest() =
|
||||
nx = n
|
||||
ny = 1024
|
||||
nt = 100
|
||||
xu = 0.0
|
||||
xo = 1.570796326794896558
|
||||
yu = 0.0
|
||||
yo = 1.570796326794896558
|
||||
tu = 0.0
|
||||
to = 0.0000001
|
||||
|
||||
dx = (xo - xu) / float64(nx - 1)
|
||||
dy = (yo - yu) / float64(ny - 1)
|
||||
dt = (to - tu) / float64(nt)
|
||||
|
||||
dtdxsq = dt / (dx * dx)
|
||||
dtdysq = dt / (dy * dy)
|
||||
|
||||
even = newMatrix[float64](nx, ny)
|
||||
odd = newMatrix[float64](nx, ny)
|
||||
|
||||
proc prep() =
|
||||
heat(even, 0, nx)
|
||||
|
||||
proc test() =
|
||||
var t = tu
|
||||
|
||||
for _ in countup(1, nt.int, 2):
|
||||
# nt included
|
||||
t += dt
|
||||
diffuse(odd, even, 0, nx, t)
|
||||
t += dt
|
||||
diffuse(even, odd, 0, nx, t)
|
||||
|
||||
if nt mod 2 != 0:
|
||||
t += dt
|
||||
diffuse(odd, even, 0, nx, t)
|
||||
|
||||
proc verify() =
|
||||
var
|
||||
mat: Matrix[float64]
|
||||
mae: float64
|
||||
mre: float64
|
||||
me: float64
|
||||
|
||||
mat = if nt mod 2 != 0: odd else: even
|
||||
|
||||
for a in 0 ..< nx:
|
||||
for b in 0 ..< ny:
|
||||
var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
|
||||
if tmp > 1e-3:
|
||||
echo "nx: ", nx, " - ny: ", ny
|
||||
echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
|
||||
quit 1
|
||||
|
||||
me += tmp
|
||||
if tmp > mae: mae = tmp
|
||||
if mat[a, b] != 0.0: tmp /= mat[a, b]
|
||||
if tmp > mre: mre = tmp
|
||||
|
||||
me /= nx * ny
|
||||
|
||||
if mae > 1e-12:
|
||||
echo &"Local maximal absolute error {mae:1.3e}"
|
||||
quit 1
|
||||
if mre > 1e-12:
|
||||
echo &"Local maximal relative error {mre:1.3e}"
|
||||
quit 1
|
||||
if me > 1e-12:
|
||||
echo &"Global mean absolute error {me:1.3e}"
|
||||
quit 1
|
||||
|
||||
echo "Verification successful"
|
||||
|
||||
proc main() =
|
||||
var nthreads: int
|
||||
nthreads = countProcessors()
|
||||
|
||||
var ru: Rusage
|
||||
getrusage(RusageSelf, ru)
|
||||
var
|
||||
rss = ru.ru_maxrss
|
||||
flt = ru.ru_minflt
|
||||
|
||||
initTest()
|
||||
|
||||
prep()
|
||||
let start = wtime_usec()
|
||||
test()
|
||||
let stop = wtime_usec()
|
||||
|
||||
getrusage(RusageSelf, ru)
|
||||
rss = ru.ru_maxrss - rss
|
||||
flt = ru.ru_minflt - flt
|
||||
|
||||
sync()
|
||||
|
||||
verify()
|
||||
delete(even)
|
||||
delete(odd)
|
||||
|
||||
echo "Scheduler: Nim threadpool (standard lib)"
|
||||
echo "Benchmark: heat"
|
||||
echo "Threads: ", nthreads
|
||||
echo "Time(us) ", stop - start
|
||||
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||
echo "Runtime RSS (KB): ", rss
|
||||
echo "# of page faults: ", flt
|
||||
|
||||
quit 0
|
||||
|
||||
main()
|
||||
@ -0,0 +1,314 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# From fibril
|
||||
#
|
||||
# Original license
|
||||
#
|
||||
# /*
|
||||
# * Heat diffusion (Jacobi-type iteration)
|
||||
# *
|
||||
# * Volker Strumpen, Boston August 1996
|
||||
# *
|
||||
# * Copyright (c) 1996 Massachusetts Institute of Technology
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either version 2 of the License, or
|
||||
# * (at your option) any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful,
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with this program; if not, write to the Free Software
|
||||
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
|
||||
# Library
|
||||
../../threadpool
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime, ../resources
|
||||
|
||||
# Helpers
|
||||
# -------------------------------------------------------
|
||||
|
||||
# We need a thin wrapper around raw pointers for matrices,
|
||||
# we can't pass "var seq[seq[float64]]" to other threads
|
||||
# nor "var" for that matter
|
||||
type
|
||||
Matrix[T] = object
|
||||
buffer: ptr UncheckedArray[T]
|
||||
m, n: int
|
||||
|
||||
Row[T] = object
|
||||
buffer: ptr UncheckedArray[T]
|
||||
len: int
|
||||
|
||||
var tp: Threadpool
|
||||
|
||||
func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
|
||||
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
|
||||
result.m = m
|
||||
result.n = n
|
||||
|
||||
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
|
||||
# row-major storage
|
||||
assert row < mat.m
|
||||
assert col < mat.n
|
||||
mat.buffer[row * mat.n + col]
|
||||
|
||||
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
|
||||
assert row < mat.m
|
||||
assert col < mat.n
|
||||
mat.buffer[row * mat.n + col] = value
|
||||
|
||||
func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  ## Returns a non-owning view over row `rowIdx`.
  ## Row-major storage: a row is `mat.n` contiguous elements
  ## (one per column), rows are `mat.n` apart.
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  # Bug fix: a row holds `n` (column count) elements, not `m`.
  # The old `mat.m` only went unnoticed because nx >= ny in this bench,
  # so the bounds assert in Row accessors never fired.
  result.len = mat.n
||||
|
||||
template `[]`[T](row: Row[T], idx: Natural): T =
  ## Reads element `idx` of a row view.
  assert idx < row.len
  row.buffer[idx]

template `[]=`[T](row: Row[T], idx: Natural, val: T) =
  ## Writes element `idx` of a row view.
  assert idx < row.len
  row.buffer[idx] = val

func delete[T](mat: sink Matrix[T]) =
  ## Releases the C-allocated backing storage.
  c_free(mat.buffer)

converter i32toF64(x: int32): float64 {.inline.} =
  ## Implicit int32 -> float64 so the i, j grid indices can be mixed
  ## into float expressions without manual conversion.
  ## NOTE(review): broad converters can cause surprising implicit
  ## conversions; kept here for benchmark brevity.
  float64(x)
||||
|
||||
# -------------------------------------------------------
|
||||
|
||||
# Initial and boundary conditions of the heat equation benchmark,
# plus the closed-form solution used for verification.

template f(x, y: SomeFloat): SomeFloat =
  ## Initial temperature distribution.
  sin(x) * sin(y)

template randa[T: SomeFloat](x, t: T): T =
  ## Boundary condition (identically zero).
  T(0.0)

proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
  ## Time-decaying boundary condition.
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  exp(-2 * t) * sin(x)

template randc[T: SomeFloat](y, t: T): T =
  ## Boundary condition (identically zero).
  T(0.0)

proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
  ## Time-decaying boundary condition.
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  exp(-2 * t) * sin(y)

template solu(x, y, t: SomeFloat): SomeFloat =
  ## Exact solution of the heat equation for these conditions.
  exp(-2 * t) * sin(x) * sin(y)
||||
|
||||
const n = 4096'i32  # grid size along x

var
  nx, ny, nt: int32                # grid dimensions and timestep count
  xu, xo, yu, yo, tu, to: float64  # domain bounds in x, y and time

  dx, dy, dt: float64              # grid spacing and timestep
  dtdxsq, dtdysq: float64          # dt/dx^2 and dt/dy^2 stencil factors

  odd: Matrix[float64]             # double-buffered grids (ping-pong)
  even: Matrix[float64]
||||
|
||||
proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable, gcsafe.} =
  ## Initializes rows [il, iu) of `m` in parallel, divide-and-conquer.
  ## Returns a dummy bool so callers can await the spawn; the parallel
  ## branches touch disjoint rows.
  if iu - il > 1:
    # Split the row range; spawn one half, recurse on the other.
    let mid = (il + iu) div 2
    let fut = tp.spawn heat(m, il, mid)
    heat(m, mid, iu)
    discard sync(fut)
    return true

  # Base case: fill the single row `il`.
  let i = il
  let row = m.getRow(i)

  if i == 0:
    for j in 0 ..< ny:
      row[j] = randc(yu + j*dy, 0)
  elif i == nx - 1:
    for j in 0 ..< ny:
      row[j] = randd(yu + j*dy, 0)
  else:
    row[0] = randa(xu + i*dx, 0)
    for j in 1 ..< ny - 1:
      row[j] = f(xu + i*dx, yu + j*dy)
    row[ny - 1] = randb(xu + i*dx, 0)
||||
|
||||
proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable, gcsafe.} =
  ## One Jacobi timestep over rows [il, iu): reads `input`, writes `output`.
  ## Dummy bool return so callers can await the spawn; parallel halves
  ## write disjoint rows of `output`.
  if iu - il > 1:
    # Split the row range; spawn one half, recurse on the other.
    let mid = (il + iu) div 2
    let fut = tp.spawn diffuse(output, input, il, mid, t)
    diffuse(output, input, mid, iu, t)
    discard sync(fut)
    return true

  # Base case: update the single row `il`.
  let i = il
  let row = output.getRow(i)

  if i == 0:
    for j in 0 ..< ny:
      row[j] = randc(yu + j*dy, t)
  elif i == nx - 1:
    for j in 0 ..< ny:
      row[j] = randd(yu + j*dy, t)
  else:
    row[0] = randa(xu + i*dx, t)
    for j in 1 ..< ny - 1:
      # 5-point Jacobi stencil of the heat iteration.
      row[j] = input[i, j] +
        dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
        dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
    row[ny - 1] = randb(xu + i*dx, t)
||||
|
||||
proc initTest() =
  ## Sets up the domain, timestep count, stencil factors, and
  ## allocates both grids.
  nx = n
  ny = 1024
  nt = 100
  xu = 0.0
  xo = 1.570796326794896558  # ~ pi/2
  yu = 0.0
  yo = 1.570796326794896558  # ~ pi/2
  tu = 0.0
  to = 0.0000001

  dx = (xo - xu) / float64(nx - 1)
  dy = (yo - yu) / float64(ny - 1)
  dt = (to - tu) / float64(nt)

  dtdxsq = dt / (dx * dx)
  dtdysq = dt / (dy * dy)

  even = newMatrix[float64](nx, ny)
  odd = newMatrix[float64](nx, ny)

proc prep() =
  ## Fills the `even` grid with initial/boundary values (not timed).
  heat(even, 0, nx)

proc test() =
  ## Runs the diffusion timesteps, ping-ponging between the two grids.
  var t = tu

  for _ in countup(1, nt.int, 2):
    # nt included
    t += dt
    diffuse(odd, even, 0, nx, t)
    t += dt
    diffuse(even, odd, 0, nx, t)

  if nt mod 2 != 0:
    t += dt
    diffuse(odd, even, 0, nx, t)
||||
|
||||
proc verify() =
  ## Compares the final grid against the closed-form solution.
  ## Reports max absolute, max relative and mean absolute error;
  ## exits non-zero when any tolerance is exceeded.
  var
    mat: Matrix[float64]
    mae: float64  # max absolute error
    mre: float64  # max relative error
    me: float64   # mean absolute error

  # After an odd number of steps the final state lives in `odd`.
  mat = if nt mod 2 != 0: odd else: even

  for a in 0 ..< nx:
    for b in 0 ..< ny:
      var err = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
      if err > 1e-3:
        echo "nx: ", nx, " - ny: ", ny
        echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
        quit 1

      me += err
      if err > mae: mae = err
      if mat[a, b] != 0.0: err /= mat[a, b]
      if err > mre: mre = err

  me /= nx * ny

  if mae > 1e-12:
    echo &"Local maximal absolute error {mae:1.3e}"
    quit 1
  if mre > 1e-12:
    echo &"Local maximal relative error {mre:1.3e}"
    quit 1
  if me > 1e-12:
    echo &"Global mean absolute error {me:1.3e}"
    quit 1

  echo "Heat: Verification successful"
||||
|
||||
proc main() =
  ## Benchmark driver: choose thread count, run the heat diffusion,
  ## verify and print timing / resource statistics.
  var nthreads: int
  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  when not defined(windows):
    # Baseline resource usage (bench timers unavailable on Windows).
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt

  initTest()

  # Fibril initializes before benching
  tp = Threadpool.new(numThreads = nthreads)

  prep()
  when not defined(windows):
    let start = wtime_usec()
  test()
  when not defined(windows):
    let stop = wtime_usec()

    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt

  tp.shutdown()

  verify()
  delete(even)
  delete(odd)

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: heat"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(us) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "--------------------------------------------------------------------------"

  quit 0

main()
|
||||
@ -0,0 +1,12 @@
|
||||
# Cache-Oblivious Matrix Multiplication
|
||||
|
||||
From Staccato and Cilk
|
||||
|
||||
https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk
|
||||
See the paper ``Cache-Oblivious Algorithms'', by
|
||||
Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
|
||||
Sridhar Ramachandran, FOCS 1999, for an explanation of
|
||||
why this algorithm is good for caches.
|
||||
|
||||
Note that this benchmark outputs an incorrect matrix trace
according to the built-in trace check.
|
||||
@ -0,0 +1,214 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# Rectangular matrix multiplication.
|
||||
#
|
||||
# Adapted from Cilk 5.4.3 example
|
||||
#
|
||||
# https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk;
|
||||
# See the paper ``Cache-Oblivious Algorithms'', by
|
||||
# Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
|
||||
# Sridhar Ramachandran, FOCS 1999, for an explanation of
|
||||
# why this algorithm is good for caches.
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
|
||||
# Library
|
||||
../../threadpool,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
# Helpers
|
||||
# -------------------------------------------------------
|
||||
|
||||
# We need a thin wrapper around raw pointers for matrices,
|
||||
# we can't pass "var" to other threads
|
||||
type
  Matrix[T: SomeFloat] = object
    ## Unmanaged square matrix view over a row-major buffer.
    ## `ld` is the leading dimension (row stride) of the full
    ## backing matrix, shared by all views into it.
    buffer: ptr UncheckedArray[T]
    ld: int

var tp: Threadpool

func newMatrixNxN[T](n: int): Matrix[T] {.inline.} =
  ## Allocates an uninitialized n x n matrix with C malloc.
  result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t(n*n*sizeof(T))))
  result.ld = n
||||
|
||||
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  ## Reads (row, col); row-major with leading dimension `ld`.
  # Bug fix: the assert messages referenced `i`, which is not declared
  # in this template — it bound accidentally to whatever `i` existed at
  # the instantiation site, printing the wrong index. Report the actual
  # offending index instead.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col]

template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  ## Writes (row, col); row-major with leading dimension `ld`.
  assert row < mat.ld, $row & " < " & $mat.ld
  assert col < mat.ld, $col & " < " & $mat.ld
  mat.buffer[row * mat.ld + col] = value
||||
|
||||
func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T] {.inline.} =
  ## Returns a view of `mat` offset by `row` rows and `col` columns.
  result.buffer = cast[ptr UncheckedArray[T]](
    addr mat.buffer[row*mat.ld + col]
  )
  # Bug fix: propagate the leading dimension. It was left at its
  # zero default, which collapses all element addressing in the view
  # (row * 0 + col) and makes the bounds asserts `idx < 0` always fail.
  result.ld = mat.ld

func delete[T](mat: sink Matrix[T]) =
  ## Releases the C-allocated backing storage.
  c_free(mat.buffer)
||||
|
||||
# -------------------------------------------------------
|
||||
|
||||
proc xorshiftRand(): uint32 =
  ## Marsaglia xorshift32 PRNG. The state lives in a global with a
  ## fixed seed so matrix fills are reproducible across runs.
  var state {.global.} = uint32(2463534242)
  state = state xor (state shr 13)
  state = state xor (state shl 17)
  state = state xor (state shr 5)
  state
||||
|
||||
func zero[T](A: Matrix[T]) =
  ## Clears the whole backing buffer.
  # zeroing is not timed
  zeroMem(A.buffer, A.ld * A.ld * sizeof(T))

proc fill[T](A: Matrix[T]) =
  ## Fills with small pseudo-random values in [0, ld).
  for r in 0 ..< A.ld:
    for c in 0 ..< A.ld:
      A[r, c] = T(xorshiftRand() mod A.ld.uint32)
||||
|
||||
func maxError(A, B: Matrix): float64 =
  ## Largest elementwise relative difference |A - B| / A.
  assert A.ld == B.ld
  for r in 0 ..< A.ld:
    for c in 0 ..< A.ld:
      var diff = (A[r, c] - B[r, c]) / A[r, c]
      if diff < 0:
        diff = -diff
      if diff > result:
        result = diff

func check[T](A, B, C: Matrix[T], n: int): bool =
  ## Cheap validation via the trace identity tr(C) == tr(A*B).
  var
    tr_C = 0.T
    tr_AB = 0.T
  for i in 0 ..< n:
    for j in 0 ..< n:
      tr_AB += A[i, j] * B[j, i]
    tr_C += C[i, i]

  # Note, all benchmarks return false ¯\_(ツ)_/¯
  return abs(tr_AB - tr_C) < 1e-3
||||
|
||||
proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool =
  ## Cache-oblivious divide-and-conquer multiplication:
  ## C[m x p] = A[m x n] * B[n x p], accumulating into C when `add`.
  ## Leading dimensions travel inside the Matrix values (the original
  ## bench passed a separate `ld` parameter). The dummy bool return
  ## lets callers await the computation.

  # Base case: small enough for the naive triple loop.
  if (m + n + p) <= 64:
    if add:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var acc = 0.T
          for j in 0 ..< n:
            acc += A[i, j] * B[j, k]
          C[i, k] += acc
    else:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var acc = 0.T
          for j in 0 ..< n:
            acc += A[i, j] * B[j, k]
          C[i, k] = acc

    return

  # Futures for the two halves of the computation.
  var fut0, fut1: FlowVar[bool]

  # Recurse by splitting the largest dimension.
  if m >= n and n >= p:
    let m1 = m shr 1 # divide by 2
    fut0 = tp.spawn matmul(A, B, C, m1, n, p, add)
    fut1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add)
  elif n >= m and n >= p:
    let n1 = n shr 1 # divide by 2
    fut0 = tp.spawn matmul(A, B, C, m, n1, p, add)
    # The second half accumulates into the same C block, hence add = true.
    fut1 = tp.spawn matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true)
  else:
    let p1 = p shr 1
    fut0 = tp.spawn matmul(A, B, C, m, n, p1, add)
    fut1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add)

  discard sync(fut0)
  discard sync(fut1)
||||
|
||||
proc main() =
  ## Benchmark driver: parse the matrix size, multiply, report stats.
  echo "Warning the benchmark seems to not be correct."
  var
    n = 3000
    nthreads: int

  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  # Command-line handling: zero args runs the default size.
  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Running with default config n = {n}"
  elif paramCount() == 1:
    n = paramStr(1).parseInt()
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Up to 1 parameter is valid. Received {paramCount()}"
    quit 1

  var A = newMatrixNxN[float32](n)
  var B = newMatrixNxN[float32](n)
  var C = newMatrixNxN[float32](n)

  fill(A)
  fill(B)
  zero(C)

  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  # Staccato benches runtime init and exit as well
  let start = wtime_msec()

  tp = Threadpool.new(numThreads = nthreads)
  discard sync tp.spawn matmul(A, B, C, n, n, n, add = false)
  tp.shutdown()

  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss
  flt = ru.ru_minflt - flt

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: Matrix Multiplication (cache oblivious)"
  echo "Threads: ", nthreads
  echo "Time(ms) ", stop - start
  echo "Max RSS (KB): ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "Input: ", n
  echo "Error: ", check(A, B, C, n)
  echo "--------------------------------------------------------------------------"

  delete A
  delete B
  delete C

  quit 0

main()
|
||||
@ -0,0 +1,181 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
#
|
||||
# Original code licenses
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
#
|
||||
# /**********************************************************************************************/
|
||||
# /* This program is part of the Barcelona OpenMP Tasks Suite */
|
||||
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
|
||||
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
|
||||
# /* */
|
||||
# /* This program is free software; you can redistribute it and/or modify */
|
||||
# /* it under the terms of the GNU General Public License as published by */
|
||||
# /* the Free Software Foundation; either version 2 of the License, or */
|
||||
# /* (at your option) any later version. */
|
||||
# /* */
|
||||
# /* This program is distributed in the hope that it will be useful, */
|
||||
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
||||
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
||||
# /* GNU General Public License for more details. */
|
||||
# /* */
|
||||
# /* You should have received a copy of the GNU General Public License */
|
||||
# /* along with this program; if not, write to the Free Software */
|
||||
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
|
||||
# /**********************************************************************************************/
|
||||
#
|
||||
# /*
|
||||
# * Original code from the Cilk project (by Keith Randall)
|
||||
# *
|
||||
# * Copyright (c) 2000 Massachusetts Institute of Technology
|
||||
# * Copyright (c) 2000 Matteo Frigo
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, std/[strformat, os, strutils],
|
||||
std/threadpool,
|
||||
# bench
|
||||
../wtime
|
||||
|
||||
# This deadlocks :/
|
||||
|
||||
# Nim helpers
|
||||
# -------------------------------------------------
|
||||
|
||||
# Stack / heap allocation helpers.
# Fix: `csize` is deprecated in modern Nim; use `int` like the
# Constantine-threadpool copy of this benchmark does.
when defined(windows):
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: int): pointer {.header: "<alloca.h>".}

template alloca*(T: typedesc): ptr T =
  ## Stack-allocates a single T (lifetime: the enclosing function).
  cast[ptr T](alloca(sizeof(T)))

template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocates `len` Ts (lifetime: the enclosing function).
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))

proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocates `len` Ts with C malloc; pair with `tp_free`.
  cast[type result](c_malloc(csize_t(len*sizeof(T))))

proc tp_free*[T: ptr](p: T) {.inline.} =
  ## Releases memory obtained from `tp_alloc`.
  c_free(p)
|
||||
|
||||
# We assume that Nim zeroMem vs C memset
|
||||
# and Nim copyMem vs C memcpy have no difference
|
||||
# Nim does have extra checks to handle GC-ed types
|
||||
# but they should be eliminated by the Nim compiler.
|
||||
|
||||
# -------------------------------------------------
|
||||
|
||||
type CharArray = ptr UncheckedArray[char]

# First complete solution found, kept for display.
var example_solution: ptr UncheckedArray[char]

func isValid(n: int32, a: CharArray): bool =
  ## `a` holds `n` queen column positions, one per row.
  ## Returns true iff no pair of queens attacks each other.
  for i in 0'i32 ..< n:
    let p = cast[int32](a[i])
    for j in i+1 ..< n:
      let q = cast[int32](a[j])
      # Same column, or on either diagonal.
      if q == p or q == p - (j-i) or q == p + (j-i):
        return false
  return true
||||
|
||||
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
  ## Serial N-queens: counts the solutions extending the partial
  ## placement `a[0..<j]`; records the first full solution found.
  if n == j:
    # Complete board: count it and keep it for display.
    if example_solution.isNil:
      example_solution = tp_alloc(char, n)
      copyMem(example_solution, a, n * sizeof(char))
    return 1

  # Try each column for queen `j`.
  for col in 0 ..< n:
    a[j] = cast[char](col)
    if isValid(j+1, a):
      result += nqueens_ser(n, j+1, a)
|
||||
proc nqueens_par(n, j: int32, a: CharArray): int32 =
  ## Parallel N-queens on std/threadpool: spawns one task per valid
  ## column placement for queen `j`.
  if n == j:
    # Complete board, count it.
    return 1

  var subCounts = alloca(Flowvar[int32], n)
  zeroMem(subCounts, n * sizeof(Flowvar[int32]))

  # Try each column for queen `j` on a private copy of the board.
  # NOTE(review): `b` is stack memory handed to a spawned task — a
  # lifetime hazard; the file header notes this version deadlocks.
  for col in 0 ..< n:
    var b = alloca(char, j+1)
    copyMem(b, a, j * sizeof(char))
    b[j] = cast[char](col)
    if isValid(j+1, b):
      subCounts[col] = spawn nqueens_par(n, j+1, b)

  # Accumulate the counts of the spawned subtrees.
  for col in 0 ..< n:
    if not subCounts[col].isNil():
      result += ^subCounts[col]
|
||||
|
||||
# Known N-queens solution counts for N = 1 .. 16.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  10,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Checks `res` against the known count for an n x n board.
  if n > solutions.len:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return

  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
|
||||
proc main() =
  ## CLI driver: parse N, solve in parallel, verify, print timing.
  if paramCount() != 1:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n: number of queens on a nxn board>"
    quit 0

  let n = paramStr(1).parseInt.int32

  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  let start = wtime_msec()
  let count = nqueens_par(n, 0, alloca(char, n))
  let stop = wtime_msec()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')

  echo &"Elapsed wall time: {stop-start:2.4f} ms"

main()
|
||||
@ -0,0 +1,231 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
#
|
||||
# Original code licenses
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
#
|
||||
# /**********************************************************************************************/
|
||||
# /* This program is part of the Barcelona OpenMP Tasks Suite */
|
||||
# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
|
||||
# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */
|
||||
# /* */
|
||||
# /* This program is free software; you can redistribute it and/or modify */
|
||||
# /* it under the terms of the GNU General Public License as published by */
|
||||
# /* the Free Software Foundation; either version 2 of the License, or */
|
||||
# /* (at your option) any later version. */
|
||||
# /* */
|
||||
# /* This program is distributed in the hope that it will be useful, */
|
||||
# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
||||
# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
||||
# /* GNU General Public License for more details. */
|
||||
# /* */
|
||||
# /* You should have received a copy of the GNU General Public License */
|
||||
# /* along with this program; if not, write to the Free Software */
|
||||
# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
|
||||
# /**********************************************************************************************/
|
||||
#
|
||||
# /*
|
||||
# * Original code from the Cilk project (by Keith Randall)
|
||||
# *
|
||||
# * Copyright (c) 2000 Massachusetts Institute of Technology
|
||||
# * Copyright (c) 2000 Matteo Frigo
|
||||
# */
|
||||
|
||||
import
|
||||
# Stdlib
|
||||
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
|
||||
# library
|
||||
../../threadpool
|
||||
|
||||
when not defined(windows):
|
||||
# bench
|
||||
import ../wtime, ../resources
|
||||
|
||||
# Nim helpers
|
||||
# -------------------------------------------------
|
||||
|
||||
# Stack / heap allocation helpers.
when defined(windows):
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: int): pointer {.header: "<alloca.h>".}

template alloca*(T: typedesc): ptr T =
  ## Stack-allocates a single T (lifetime: the enclosing function).
  cast[ptr T](alloca(sizeof(T)))

template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocates `len` Ts (lifetime: the enclosing function).
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))

proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocates `len` Ts: Nim shared heap under -d:TP_useNimAlloc,
  ## C malloc otherwise. Pair with `tp_free`.
  when defined(TP_useNimAlloc):
    cast[type result](createSharedU(T, len))
  else:
    cast[type result](c_malloc(csize_t(len*sizeof(T))))

proc tp_free*[T: ptr](p: T) {.inline.} =
  ## Releases memory obtained from `tp_alloc`.
  when defined(TP_useNimAlloc):
    freeShared(p)
  else:
    c_free(p)
|
||||
|
||||
# We assume that Nim zeroMem vs C memset
|
||||
# and Nim copyMem vs C memcpy have no difference
|
||||
# Nim does have extra checks to handle GC-ed types
|
||||
# but they should be eliminated by the Nim compiler.
|
||||
|
||||
# -------------------------------------------------
|
||||
|
||||
type CharArray = ptr UncheckedArray[char]

var tp: Threadpool
# First complete solution found, kept for display.
var example_solution: ptr UncheckedArray[char]

func isValid(n: int32, a: CharArray): bool =
  ## `a` holds `n` queen column positions, one per row.
  ## Returns true iff no pair of queens attacks each other.
  for i in 0'i32 ..< n:
    let p = cast[int32](a[i])
    for j in i+1 ..< n:
      let q = cast[int32](a[j])
      # Same column, or on either diagonal.
      if q == p or q == p - (j-i) or q == p + (j-i):
        return false
  return true
||||
|
||||
proc nqueens_ser(n, j: int32, a: CharArray): int32 =
  ## Serial N-queens: counts the solutions extending the partial
  ## placement `a[0..<j]`; records the first full solution found.
  if n == j:
    # Complete board: count it and keep it for display.
    if example_solution.isNil:
      example_solution = tp_alloc(char, n)
      copyMem(example_solution, a, n * sizeof(char))
    return 1

  # Try each column for queen `j`.
  for col in 0 ..< n:
    a[j] = cast[char](col)
    if isValid(j+1, a):
      result += nqueens_ser(n, j+1, a)
||||
|
||||
proc nqueens_par(n, j: int32, a: CharArray): int32 {.gcsafe.} =
  ## Parallel N-queens on Constantine's threadpool: spawns one task
  ## per valid column placement for queen `j`.
  if n == j:
    # Complete board, count it.
    return 1

  var subCounts = alloca(Flowvar[int32], n)
  zeroMem(subCounts, n * sizeof(Flowvar[int32]))

  # Try each column for queen `j` on a private copy of the board.
  for col in 0 ..< n:
    var b = alloca(char, j+1)
    copyMem(b, a, j * sizeof(char))
    b[j] = cast[char](col)
    if isValid(j+1, b):
      subCounts[col] = tp.spawn nqueens_par(n, j+1, b)

  # Accumulate the counts of the spawned subtrees.
  for col in 0 ..< n:
    if subCounts[col].isSpawned():
      result += sync(subCounts[col])
||||
|
||||
# Known N-queens solution counts for N = 1 .. 16.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  10,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Checks `res` against the known count for an n x n board.
  if n > solutions.len:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return

  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
||||
|
||||
proc main() =
  ## CLI driver: parse N, solve in parallel, verify, print stats.
  var
    n = 11'i32
    nthreads: int

  if existsEnv"CTT_NUM_THREADS":
    nthreads = getEnv"CTT_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <N:{n}>"
    echo &"Running with default config N = {n}\n"

  if paramCount() >= 1:
    n = paramStr(1).parseInt.int32

  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  when not defined(windows):
    # Baseline resource usage (bench timers unavailable on Windows).
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt

  tp = Threadpool.new(numThreads = nthreads)

  when not defined(windows):
    let start = wtime_msec()

  let count = nqueens_par(n, 0, alloca(char, n))

  when not defined(windows):
    let stop = wtime_msec()

  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt

  tp.shutdown()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')

  echo "--------------------------------------------------------------------------"
  echo "Scheduler: Constantine's Threadpool"
  echo "Benchmark: N-queens"
  echo "Threads: ", nthreads
  when not defined(windows):
    echo "Time(us) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "Problem size: ", n,"x",n, " board with ",n, " queens"
  echo "Solutions found: ", count
  echo "--------------------------------------------------------------------------"

  quit 0

main()
|
||||
24
constantine/platforms/threadpool/benchmarks/resources.nim
Normal file
24
constantine/platforms/threadpool/benchmarks/resources.nim
Normal file
@ -0,0 +1,24 @@
|
||||
type
  Timeval {.importc: "timeval", header:"<sys/time.h>", bycopy.} = object

  Rusage* {.importc: "struct rusage", header:"<sys/resource.h>", bycopy.} = object
    ## Subset of POSIX struct rusage; only the fields the benches read.
    ## NOTE(review): the C fields are `long`; int32 assumed here —
    ## confirm on LP64 platforms.
    ru_utime {.importc.}: Timeval
    ru_stime {.importc.}: Timeval
    ru_maxrss* {.importc.}: int32 # Maximum resident set size
    # ...
    ru_minflt* {.importc.}: int32 # page reclaims (soft page faults)

  RusageWho* {.size: sizeof(cint).} = enum
    ## Mirrors the C RUSAGE_* constants.
    RusageChildren = -1
    RusageSelf = 0
    RusageThread = 1
||||
|
||||
when defined(debug):
  # Sanity-check that our enum values match the C constants.
  # Bug fixes: the last two asserts used `=` (assignment/named-arg
  # syntax) instead of the `==` comparison, and all three header
  # strings were missing the closing '>'.
  # NOTE(review): the C constants are spelled RUSAGE_SELF etc.; the
  # H_-prefixed importc names may need {.importc: "RUSAGE_SELF".} —
  # confirm against <sys/resource.h>.
  var H_RUSAGE_SELF{.importc, header:"<sys/resource.h>".}: cint
  var H_RUSAGE_CHILDREN{.importc, header:"<sys/resource.h>".}: cint
  var H_RUSAGE_THREAD{.importc, header:"<sys/resource.h>".}: cint
  assert H_RUSAGE_SELF == ord(RusageSelf)
  assert H_RUSAGE_CHILDREN == ord(RusageChildren)
  assert H_RUSAGE_THREAD == ord(RusageThread)

proc getrusage*(who: RusageWho, usage: var Rusage) {.importc, header: "sys/resource.h".}
||||
@ -0,0 +1,7 @@
|
||||
# Simple single-producer multiple consumers benchmarks
|
||||
|
||||
SPC: a simple producer-consumer benchmark.
|
||||
|
||||
A single worker produces n tasks,
|
||||
each running for t microseconds. This benchmark allows us to test how many
|
||||
concurrent consumers a single producer can sustain.
|
||||
@ -0,0 +1,146 @@
|
||||
import
|
||||
# STD lib
|
||||
system/ansi_c, std/[os, strutils, cpuinfo, strformat, math],
|
||||
# Library
|
||||
../../threadpool,
|
||||
# bench
|
||||
../wtime, ../resources
|
||||
|
||||
var NumTasksTotal: int32    # number of tasks the producer spawns
var TaskGranularity: int32  # per-task busy time, microsecond
var PollInterval: float64   # polling interval, microsecond

var tp: Threadpool

var global_poll_elapsed {.threadvar.}: float64

template dummy_cpt(): untyped =
  # Dummy computation
  # Calculate fib(30) iteratively
  var
    fib = 0
    f2 = 0
    f1 = 1
  for i in 2 .. 30:
    fib = f1 + f2
    f2 = f1
    f1 = fib
||||
|
||||
proc spc_consume(usec: int32) =
  ## Busy-spins for `usec` microseconds doing dummy fib work.
  ## The polling bookkeeping is kept but currently disabled.
  var pollElapsed = 0'f64

  let start = wtime_usec()
  let stop = usec.float64
  global_poll_elapsed = PollInterval

  while true:
    var elapsed = wtime_usec() - start
    elapsed = elapsed - pollElapsed
    if elapsed >= stop:
      break

    dummy_cpt()

    # if elapsed >= global_poll_elapsed:
    #   let pollStart = wtime_usec()
    #   loadBalance(Weave)
    #   pollElapsed += wtime_usec() - pollStart
    #   global_poll_elapsed += PollInterval

  # c_printf("Elapsed: %.2lfus\n", elapsed)

proc spc_consume_nopoll(usec: int32) =
  ## Same busy loop without the polling bookkeeping (serial baseline).
  let start = wtime_usec()
  let stop = usec.float64

  while true:
    var elapsed = wtime_usec() - start
    if elapsed >= stop:
      break

    dummy_cpt()

  # c_printf("Elapsed: %.2lfus\n", elapsed)
|
||||
|
||||
proc spc_produce(n: int32) =
|
||||
for i in 0 ..< n:
|
||||
tp.spawn spc_consume(TaskGranularity)
|
||||
|
||||
proc spc_produce_seq(n: int32) =
|
||||
for i in 0 ..< n:
|
||||
spc_consume_nopoll(TaskGranularity)
|
||||
|
||||
proc main() =
|
||||
NumTasksTotal = 1000000
|
||||
TaskGranularity = 10
|
||||
PollInterval = 10
|
||||
|
||||
if paramCount() == 0:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
|
||||
&"<task granularity (us): {TaskGranularity}> " &
|
||||
&"[polling interval (us): task granularity]"
|
||||
echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
|
||||
if paramCount() >= 1:
|
||||
NumTasksTotal = paramStr(1).parseInt.int32
|
||||
if paramCount() >= 2:
|
||||
TaskGranularity = paramStr(2). parseInt.int32
|
||||
if paramCount() == 3:
|
||||
PollInterval = paramStr(3).parseInt.float64
|
||||
else:
|
||||
PollInterval = TaskGranularity.float64
|
||||
if paramCount() > 3:
|
||||
let exeName = getAppFilename().extractFilename()
|
||||
echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
|
||||
&"<task granularity (us): {TaskGranularity}> " &
|
||||
&"[polling interval (us): task granularity]"
|
||||
quit 1
|
||||
|
||||
var nthreads: int
|
||||
if existsEnv"CTT_NUM_THREADS":
|
||||
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
|
||||
else:
|
||||
nthreads = countProcessors()
|
||||
|
||||
tp = Threadpool.new(numThreads = nthreads)
|
||||
|
||||
# measure overhead during tasking
|
||||
var ru: Rusage
|
||||
getrusage(RusageSelf, ru)
|
||||
var
|
||||
rss = ru.ru_maxrss
|
||||
flt = ru.ru_minflt
|
||||
|
||||
let start = wtime_msec()
|
||||
|
||||
# spc_produce_seq(NumTasksTotal)
|
||||
spc_produce(NumTasksTotal)
|
||||
tp.syncAll()
|
||||
|
||||
let stop = wtime_msec()
|
||||
|
||||
getrusage(RusageSelf, ru)
|
||||
rss = ru.ru_maxrss - rss
|
||||
flt = ru.ru_minflt - flt
|
||||
|
||||
tp.shutdown()
|
||||
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "Scheduler: Constantine's Threadpool"
|
||||
echo "Benchmark: SPC (Single task Producer - multi Consumer)"
|
||||
echo "Threads: ", nthreads
|
||||
echo "Time(ms) ", round(stop - start, 3)
|
||||
echo "Max RSS (KB): ", ru.ru_maxrss
|
||||
echo "Runtime RSS (KB): ", rss
|
||||
echo "# of page faults: ", flt
|
||||
echo "--------------------------------------------------------------------------"
|
||||
echo "# of tasks: ", NumTasksTotal
|
||||
echo "Task granularity (us): ", TaskGranularity
|
||||
echo "Polling / manual load balancing interval (us): ", PollInterval
|
||||
echo "--------------------------------------------------------------------------"
|
||||
|
||||
quit 0
|
||||
|
||||
main()
|
||||
53
constantine/platforms/threadpool/benchmarks/wtime.h
Normal file
53
constantine/platforms/threadpool/benchmarks/wtime.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef WTIME_H
|
||||
#define WTIME_H
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
|
||||
// Number of seconds since the Epoch
|
||||
static inline double Wtime_sec(void)
|
||||
{
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return tv.tv_sec + tv.tv_usec / 1e6;
|
||||
}
|
||||
|
||||
// Number of milliseconds since the Epoch
|
||||
static inline double Wtime_msec(void)
|
||||
{
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return tv.tv_sec * 1e3 + tv.tv_usec / 1e3;
|
||||
}
|
||||
|
||||
// Number of microseconds since the Epoch
|
||||
static inline double Wtime_usec(void)
|
||||
{
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return tv.tv_sec * 1e6 + tv.tv_usec;
|
||||
}
|
||||
|
||||
// Read time stamp counter on x86
|
||||
static inline unsigned long long readtsc(void)
|
||||
{
|
||||
unsigned int lo, hi;
|
||||
// RDTSC copies contents of 64-bit TSC into EDX:EAX
|
||||
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return (unsigned long long)hi << 32 | lo;
|
||||
}
|
||||
|
||||
#define WTIME_unique_var_name_paste(id, n) id ## n
|
||||
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
|
||||
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)
|
||||
|
||||
// Convenience macro for time measurement
|
||||
#define WTIME(unit) \
|
||||
double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
|
||||
int WTIME_unique_var(_i_) = 0; \
|
||||
for (; WTIME_unique_var(_i_) == 0 || \
|
||||
(printf("Elapsed wall time: %.2lf "#unit"\n", \
|
||||
Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
|
||||
WTIME_unique_var(_i_)++)
|
||||
|
||||
#endif // WTIME_H
|
||||
10
constantine/platforms/threadpool/benchmarks/wtime.nim
Normal file
10
constantine/platforms/threadpool/benchmarks/wtime.nim
Normal file
@ -0,0 +1,10 @@
|
||||
|
||||
import strutils, os
|
||||
|
||||
const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0]
|
||||
const cHeader = csourcesPath / "wtime.h"
|
||||
|
||||
{.passC: "-I" & cSourcesPath .}
|
||||
|
||||
proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.}
|
||||
proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.}
|
||||
186
constantine/platforms/threadpool/crossthread/backoff.nim
Normal file
186
constantine/platforms/threadpool/crossthread/backoff.nim
Normal file
@ -0,0 +1,186 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/atomics,
|
||||
../primitives/futexes
|
||||
|
||||
# We implement 2 datastructures to put threads to sleep:
|
||||
# 1. An event notifier to put an awaiting thread to sleep when the task they require is worked on by another thread
|
||||
# 2. An eventcount to put an idle thread to sleep
|
||||
|
||||
{.push raises:[], checks:off.}
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Event Notifier
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# Formal verification at: https://github.com/mratsim/weave/blob/7682784/formal_verification/event_notifiers.tla#L76-L109
|
||||
|
||||
type
|
||||
EventNotifier* = object
|
||||
## Multi Producers, Single Consumer event notification
|
||||
## This is can be seen as a wait-free condition variable for producers
|
||||
## that avoids them spending time in expensive kernel land due to mutexes.
|
||||
# ---- Consumer specific ----
|
||||
ticket{.align: 64.}: uint8 # A ticket for the consumer to sleep in a phase
|
||||
# ---- Contention ---- no real need for padding as cache line should be reloaded in case of contention anyway
|
||||
futex: Futex # A Futex (atomic int32 that can put thread to sleep)
|
||||
phase: Atomic[uint8] # A binary timestamp, toggles between 0 and 1 (but there is no atomic "not")
|
||||
signaled: Atomic[bool] # Signaling condition
|
||||
|
||||
func initialize*(en: var EventNotifier) {.inline.} =
|
||||
en.futex.initialize()
|
||||
en.ticket = 0
|
||||
en.phase.store(0, moRelaxed)
|
||||
en.signaled.store(false, moRelaxed)
|
||||
|
||||
func `=destroy`*(en: var EventNotifier) {.inline.} =
|
||||
en.futex.teardown()
|
||||
|
||||
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
|
||||
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
|
||||
|
||||
func prepareToPark*(en: var EventNotifier) {.inline.} =
|
||||
## The consumer intends to sleep soon.
|
||||
## This must be called before the formal notification
|
||||
## via a channel.
|
||||
if not en.signaled.load(moRelaxed):
|
||||
en.ticket = en.phase.load(moRelaxed)
|
||||
|
||||
proc park*(en: var EventNotifier) {.inline.} =
|
||||
## Wait until we are signaled of an event
|
||||
## Thread is parked and does not consume CPU resources
|
||||
## This may wakeup spuriously.
|
||||
if not en.signaled.load(moRelaxed):
|
||||
if en.ticket == en.phase.load(moRelaxed):
|
||||
en.futex.wait(0)
|
||||
en.signaled.store(false, moRelaxed)
|
||||
en.futex.initialize()
|
||||
|
||||
proc notify*(en: var EventNotifier) {.inline.} =
|
||||
## Signal a thread that it can be unparked
|
||||
|
||||
if en.signaled.load(moRelaxed):
|
||||
# Another producer is signaling
|
||||
return
|
||||
en.signaled.store(true, moRelease)
|
||||
discard en.phase.fetchXor(1, moRelaxed)
|
||||
en.futex.store(1, moRelease)
|
||||
en.futex.wake()
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Eventcount
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
type
|
||||
Eventcount* = object
|
||||
## The lock-free equivalent of a condition variable.
|
||||
##
|
||||
## Usage, if a thread needs to be parked until a condition is true
|
||||
## and signaled by another thread:
|
||||
## ```Nim
|
||||
## if condition:
|
||||
## return
|
||||
##
|
||||
## while true:
|
||||
## ticket = ec.sleepy()
|
||||
## if condition:
|
||||
## ec.cancelSleep()
|
||||
## break
|
||||
## else:
|
||||
## ec.sleep()
|
||||
## ```
|
||||
|
||||
state: Atomic[uint32]
|
||||
# State is actually the equivalent of a bitfield
|
||||
# type State = object
|
||||
# waiters {.bitsize: 16.}: uint16
|
||||
# when sizeof(pointer) == 4:
|
||||
# epoch {.bitsize: 16.}: uint16
|
||||
# else:
|
||||
# epoch {.bitsize: 48.}: uint48
|
||||
#
|
||||
# of size, the native integer size
|
||||
# and so can be used for atomic operations on 32-bit or 64-bit platforms.
|
||||
# but there is no native fetchAdd for bitfield
|
||||
futex: Futex
|
||||
# Technically we could use the futex as the state.
|
||||
# When you wait on a Futex, it waits only if the value of the futex
|
||||
# matches with a reference value.
|
||||
# But our reference value will be the epoch of notifications
|
||||
# and it is non-trivial to zero-out the waiters bits.
|
||||
# - One way could be to split a 64-bit number in 2
|
||||
# and cast the epoch part to Futex but that would only work on 64-bit CPU.
|
||||
# - Another more hacky way would be to pad with a zero-out uint16 before and after the Futex
|
||||
# and depending on big or little endian provide a shifted address as Futex.
|
||||
|
||||
ParkingTicket* = object
|
||||
epoch: uint32
|
||||
|
||||
const # bitfield
|
||||
# Low 16 bits are waiters, up to 2¹⁶ = 65536 threads are supported
|
||||
# High 16 or 48 bits are epochs.
|
||||
# We can deal with the ABA problem o:
|
||||
# - up to 65536 wake requests on 32-bit
|
||||
# - up to 281 474 976 710 656 wake requests on 64-bit
|
||||
# Epoch rolling over to 0 are not a problem, they won't change the low 16 bits
|
||||
kEpochShift = 16
|
||||
kAddEpoch = 1 shl kEpochShift
|
||||
kWaiterMask = kAddEpoch - 1
|
||||
kEpochMask {.used.} = not kWaiterMask
|
||||
kAddWaiter = 1
|
||||
kSubWaiter = 1
|
||||
|
||||
func initialize*(ec: var EventCount) {.inline.} =
|
||||
ec.state.store(0, moRelaxed)
|
||||
ec.futex.initialize()
|
||||
|
||||
func `=destroy`*(ec: var EventCount) {.inline.} =
|
||||
ec.futex.teardown()
|
||||
|
||||
proc sleepy*(ec: var Eventcount): ParkingTicket {.inline.} =
|
||||
## To be called before checking if the condition to not sleep is met.
|
||||
## Returns a ticket to be used when committing to sleep
|
||||
let prevState = ec.state.fetchAdd(kAddWaiter, moAcquireRelease)
|
||||
result.epoch = prevState shr kEpochShift
|
||||
|
||||
proc sleep*(ec: var Eventcount, ticket: ParkingTicket) {.inline.} =
|
||||
## Put a thread to sleep until notified.
|
||||
## If the ticket becomes invalid (a notfication has been received)
|
||||
## by the time sleep is called, the thread won't enter sleep
|
||||
while ec.state.load(moAcquire) shr kEpochShift == ticket.epoch:
|
||||
ec.futex.wait(ticket.epoch) # We don't use the futex internal value
|
||||
|
||||
let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed)
|
||||
|
||||
proc cancelSleep*(ec: var Eventcount) {.inline.} =
|
||||
## Cancel a sleep that was scheduled.
|
||||
let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed)
|
||||
|
||||
proc wake*(ec: var EventCount) {.inline.} =
|
||||
## Wake a thread if at least 1 is parked
|
||||
let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease)
|
||||
if (prev and kWaiterMask) != 0:
|
||||
ec.futex.wake()
|
||||
|
||||
proc wakeAll*(ec: var EventCount) {.inline.} =
|
||||
## Wake all threads if at least 1 is parked
|
||||
let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease)
|
||||
if (prev and kWaiterMask) != 0:
|
||||
ec.futex.wakeAll()
|
||||
|
||||
proc getNumWaiters*(ec: var EventCount): uint32 {.inline.} =
|
||||
## Get the number of parked threads
|
||||
ec.state.load(moRelaxed) and kWaiterMask
|
||||
|
||||
{.pop.} # {.push raises:[], checks:off.}
|
||||
286
constantine/platforms/threadpool/crossthread/taskqueues.nim
Normal file
286
constantine/platforms/threadpool/crossthread/taskqueues.nim
Normal file
@ -0,0 +1,286 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# This file implements a single-producer multi-consumer
|
||||
# task queue for work-stealing schedulers.
|
||||
#
|
||||
# Papers:
|
||||
# - Dynamic Circular Work-Stealing Deque
|
||||
# David Chase, Yossi Lev, 1993
|
||||
# https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf
|
||||
#
|
||||
# - Non-Blocking Steal-Half Work Queues
|
||||
# Danny Hendler, Nir Shavit, 2002
|
||||
# https://www.cs.bgu.ac.il/~hendlerd/papers/p280-hendler.pdf
|
||||
#
|
||||
# - Correct and Efficient Work-Stealing for Weak Memory Models
|
||||
# Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013
|
||||
# https://fzn.fr/readings/ppopp13.pdf
|
||||
#
|
||||
# The task queue implements the following push, pop, steal, stealHalf
|
||||
#
|
||||
# front back
|
||||
# ---------------------------------
|
||||
# steal() <- | | | | <- push()
|
||||
# stealHalf() <- | Task 0 | Task 1 | Task 2 | -> pop()
|
||||
# any thread | | | | owner-only
|
||||
# ---------------------------------
|
||||
#
|
||||
# To reduce contention, stealing is done on the opposite end from push/pop
|
||||
# so that there is a race only for the very last task(s).
|
||||
|
||||
{.push raises: [], checks: off.} # No exceptions in a multithreading datastructure
|
||||
|
||||
import
|
||||
std/atomics,
|
||||
../instrumentation,
|
||||
../../allocs,
|
||||
./tasks_flowvars
|
||||
|
||||
type
|
||||
Buf = object
|
||||
## Backend buffer of a Taskqueue
|
||||
## `capacity` MUST be a power of 2
|
||||
prevRetired: ptr Buf # intrusive linked list. Used for garbage collection
|
||||
|
||||
capacity: int
|
||||
rawBuffer: UncheckedArray[Atomic[ptr Task]]
|
||||
|
||||
Taskqueue* = object
|
||||
## This implements a lock-free, growable, work-stealing task queue.
|
||||
## The owning thread enqueues and dequeues at the back
|
||||
## Foreign threads steal at the front.
|
||||
##
|
||||
## There is no memory reclamation scheme for simplicity
|
||||
front {.align: 64.}: Atomic[int] # Consumers - steal/stealHalf
|
||||
back: Atomic[int] # Producer - push/pop
|
||||
buf: Atomic[ptr Buf]
|
||||
garbage: ptr Buf
|
||||
|
||||
proc peek*(tq: var Taskqueue): int =
|
||||
## Estimates the number of items pending in the channel
|
||||
## In a SPMC setting
|
||||
## - If called by the producer the true number might be less
|
||||
## due to consumers removing items concurrently.
|
||||
## - If called by a consumer the true number is undefined
|
||||
## as other consumers also remove items concurrently and
|
||||
## the producer removes them concurrently.
|
||||
##
|
||||
## If the producer peeks and the Chase-Lev Deque returns 0,
|
||||
## the queue is empty.
|
||||
##
|
||||
## This is a non-locking operation.
|
||||
let # Handle race conditions
|
||||
b = tq.back.load(moRelaxed)
|
||||
f = tq.front.load(moRelaxed)
|
||||
|
||||
if b >= f:
|
||||
return b-f
|
||||
else:
|
||||
return 0
|
||||
|
||||
func isPowerOfTwo(n: int): bool {.used, inline.} =
|
||||
(n and (n - 1)) == 0 and (n != 0)
|
||||
|
||||
proc newBuf(capacity: int): ptr Buf =
|
||||
# Tasks have a destructor
|
||||
# static:
|
||||
# doAssert supportsCopyMem(T), $T & " must be a (POD) plain-old-data type: no seq, string, ref."
|
||||
|
||||
preCondition: capacity.isPowerOfTwo()
|
||||
|
||||
result = allocHeapUnchecked(Buf, 1*sizeof(pointer) + 2*sizeof(int) + sizeof(pointer)*capacity)
|
||||
|
||||
result.prevRetired = nil
|
||||
result.capacity = capacity
|
||||
result.rawBuffer.addr.zeroMem(sizeof(pointer)*capacity)
|
||||
|
||||
proc `[]=`(buf: var Buf, index: int, item: ptr Task) {.inline.} =
|
||||
buf.rawBuffer[index and (buf.capacity-1)].store(item, moRelaxed)
|
||||
|
||||
proc `[]`(buf: var Buf, index: int): ptr Task {.inline.} =
|
||||
result = buf.rawBuffer[index and (buf.capacity-1)].load(moRelaxed)
|
||||
|
||||
proc grow(tq: var Taskqueue, buf: var ptr Buf, newCapacity, front, back: int) {.inline.} =
|
||||
## Double the buffer size
|
||||
## back is the last item index
|
||||
##
|
||||
## To handle race-conditions the current "front", "back" and "buf"
|
||||
## have to be saved before calling this procedure.
|
||||
## It reads and writes the "tq.buf" and "tq.garbage"
|
||||
|
||||
# Read -> Copy -> Update
|
||||
var tmp = newBuf(newCapacity)
|
||||
for i in front ..< back:
|
||||
tmp[][i] = buf[][i]
|
||||
|
||||
buf.prevRetired = tq.garbage
|
||||
tq.garbage = buf
|
||||
# publish globally
|
||||
tq.buf.store(tmp, moRelaxed)
|
||||
# publish locally
|
||||
swap(buf, tmp)
|
||||
|
||||
proc garbageCollect(tq: var Taskqueue) {.inline.} =
|
||||
var node = tq.garbage
|
||||
while node != nil:
|
||||
let tmp = node.prevRetired
|
||||
freeHeap(node)
|
||||
node = tmp
|
||||
tq.garbage = nil
|
||||
|
||||
# Public API
|
||||
# ---------------------------------------------------
|
||||
|
||||
proc init*(tq: var Taskqueue, initialCapacity: int) =
|
||||
zeroMem(tq.addr, tq.sizeof())
|
||||
tq.buf.store(newBuf(initialCapacity), moRelaxed)
|
||||
|
||||
proc teardown*(tq: var Taskqueue) =
|
||||
tq.garbageCollect()
|
||||
freeHeap(tq.buf.load(moRelaxed))
|
||||
|
||||
proc push*(tq: var Taskqueue, item: ptr Task) =
|
||||
## Enqueue an item at the back
|
||||
## As the task queue takes ownership of it. The item must not be used afterwards.
|
||||
## This is intended for the producer only.
|
||||
|
||||
let # Handle race conditions
|
||||
b = tq.back.load(moRelaxed)
|
||||
f = tq.front.load(moAcquire)
|
||||
var buf = tq.buf.load(moRelaxed)
|
||||
|
||||
if b-f > buf.capacity - 1:
|
||||
# Full queue
|
||||
tq.grow(buf, buf.capacity*2, f, b)
|
||||
|
||||
if not tq.garbage.isNil and f == b:
|
||||
# Empty queue, no thieves can have a pointer to an old retired buffer
|
||||
tq.garbageCollect()
|
||||
|
||||
buf[][b] = item
|
||||
fence(moRelease)
|
||||
tq.back.store(b+1, moRelaxed)
|
||||
|
||||
proc pop*(tq: var Taskqueue): ptr Task =
|
||||
## Dequeue an item at the back. Takes ownership of the item
|
||||
## This is intended for the producer only.
|
||||
|
||||
let # Handle race conditions
|
||||
b = tq.back.load(moRelaxed) - 1
|
||||
buf = tq.buf.load(moRelaxed)
|
||||
|
||||
tq.back.store(b, moRelaxed)
|
||||
fence(moSequentiallyConsistent)
|
||||
var f = tq.front.load(moRelaxed)
|
||||
|
||||
if f <= b:
|
||||
# Non-empty queue.
|
||||
result = buf[][b]
|
||||
if f == b:
|
||||
# Single last element in queue.
|
||||
if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed):
|
||||
# Failed race.
|
||||
result = nil
|
||||
tq.back.store(b+1, moRelaxed)
|
||||
if not tq.garbage.isNil:
|
||||
# Empty queue, no thieves can have a pointer to an old retired buffer
|
||||
tq.garbageCollect()
|
||||
else:
|
||||
# Empty queue.
|
||||
result = nil
|
||||
tq.back.store(b+1, moRelaxed)
|
||||
if not tq.garbage.isNil:
|
||||
# Empty queue, no thieves can have a pointer to an old retired buffer
|
||||
tq.garbageCollect()
|
||||
|
||||
proc steal*(thiefID: uint32, tq: var Taskqueue): ptr Task =
|
||||
## Dequeue an item at the front. Takes ownership of the item
|
||||
## This is intended for consumers.
|
||||
var f = tq.front.load(moAcquire)
|
||||
fence(moSequentiallyConsistent)
|
||||
let b = tq.back.load(moAcquire)
|
||||
result = nil
|
||||
|
||||
if f < b:
|
||||
# Non-empty queue.
|
||||
let a = tq.buf.load(moConsume)
|
||||
result = a[][f]
|
||||
if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed):
|
||||
# Failed race.
|
||||
return nil
|
||||
result.thiefID.store(thiefID, moRelease)
|
||||
|
||||
proc stealHalfImpl(dst: var Buf, dstBack: int, src: var Taskqueue): int =
|
||||
## Theft part of stealHalf:
|
||||
## - updates the victim metadata if successful
|
||||
## - uncommitted updates to the thief tq whether successful or not
|
||||
## Returns -1 if dst buffer is too small
|
||||
## Assumes `dst` buffer is empty (i.e. not ahead-of-time thefts)
|
||||
|
||||
while true:
|
||||
# Try as long as there are something to steal, we are idling anyway.
|
||||
|
||||
var f = src.front.load(moAcquire)
|
||||
fence(moSequentiallyConsistent)
|
||||
let b = src.back.load(moAcquire)
|
||||
var n = b-f
|
||||
n = n - (n shr 1) # Division by 2 rounded up, so if only one task is left, we still get it.
|
||||
|
||||
if n <= 0:
|
||||
return 0
|
||||
if n > dst.capacity:
|
||||
return -1
|
||||
|
||||
# Non-empty queue.
|
||||
let sBuf = src.buf.load(moConsume)
|
||||
for i in 0 ..< n: # Copy LIFO or FIFO?
|
||||
dst[dstBack+i] = sBuf[][f+i]
|
||||
if compareExchange(src.front, f, f+n, moSequentiallyConsistent, moRelaxed):
|
||||
return n
|
||||
|
||||
proc stealHalf*(thiefID: uint32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
|
||||
## Dequeue up to half of the items in the `src` tq, fom the front.
|
||||
## Return the last of those, or nil if none found
|
||||
|
||||
while true:
|
||||
# Prepare for batch steal
|
||||
let
|
||||
bDst = dst.back.load(moRelaxed)
|
||||
fDst = dst.front.load(moAcquire)
|
||||
var dBuf = dst.buf.load(moAcquire)
|
||||
let sBuf = src.buf.load(moAcquire)
|
||||
|
||||
if dBuf.capacity < sBuf.capacity:
|
||||
# We could grow to sBuf/2 since we steal half, but we want to minimize
|
||||
# churn if we are actually in the process of stealing and the buffers grows.
|
||||
dst.grow(dBuf, sBuf.capacity, fDst, bDst)
|
||||
|
||||
# Steal
|
||||
let n = dBuf[].stealHalfImpl(bDst, src)
|
||||
|
||||
if n == 0:
|
||||
return nil
|
||||
if n == -1:
|
||||
# Oops, victim buffer grew bigger than ours, restart the whole process
|
||||
continue
|
||||
|
||||
# Update metadata
|
||||
for i in 0 ..< n:
|
||||
dBuf[][bDst+i].thiefID.store(thiefID, moRelease)
|
||||
|
||||
# Commit/publish theft, return the first item for processing
|
||||
let last = dBuf[][bDst+n-1]
|
||||
fence(moSequentiallyConsistent)
|
||||
if n == 1:
|
||||
return last
|
||||
|
||||
# We have more than one item, so some must go in our queue
|
||||
# they are already here but inaccessible for other thieves
|
||||
dst.back.store(bDst+n-1, moRelease) # We assume that queue was empty and so dst.front didn't change
|
||||
return last
|
||||
127
constantine/platforms/threadpool/crossthread/tasks_flowvars.nim
Normal file
127
constantine/platforms/threadpool/crossthread/tasks_flowvars.nim
Normal file
@ -0,0 +1,127 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/atomics,
|
||||
../instrumentation,
|
||||
../../allocs,
|
||||
./backoff
|
||||
|
||||
# Tasks have an efficient design so that a single heap allocation
|
||||
# is required per `spawn`.
|
||||
# This greatly reduce overhead and potential memory fragmentation for long-running applications.
|
||||
#
|
||||
# This is done by tasks:
|
||||
# - being an intrusive linked lists
|
||||
# - integrating the channel to send results
|
||||
#
|
||||
# Flowvar is the public type created when spawning a task.
|
||||
# and can be synced to receive the task result.
|
||||
# Flowvars are also called future interchangeably.
|
||||
# (The name future is already used for IO scheduling)
|
||||
|
||||
type
|
||||
Task* = object
|
||||
# Intrusive metadata
|
||||
# ------------------
|
||||
parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task
|
||||
|
||||
thiefID*: Atomic[uint32] # ID of the worker that stole and run the task. For leapfrogging.
|
||||
|
||||
# Result sync
|
||||
# ------------------
|
||||
hasFuture*: bool # Ownership: if a task has a future, the future deallocates it. Otherwise the worker thread does.
|
||||
completed*: Atomic[bool]
|
||||
waiter*: Atomic[ptr EventNotifier]
|
||||
|
||||
# Execution
|
||||
# ------------------
|
||||
fn*: proc (param: pointer) {.nimcall, gcsafe.}
|
||||
# destroy*: proc (param: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
|
||||
data*{.align:sizeof(int).}: UncheckedArray[byte]
|
||||
|
||||
Flowvar*[T] = object
|
||||
task: ptr Task
|
||||
|
||||
const SentinelThief* = 0xFACADE'u32
|
||||
|
||||
proc new*(
|
||||
T: typedesc[Task],
|
||||
parent: ptr Task,
|
||||
fn: proc (param: pointer) {.nimcall, gcsafe.}): ptr Task {.inline.} =
|
||||
|
||||
const size = sizeof(T)
|
||||
|
||||
result = allocHeapUnchecked(T, size)
|
||||
result.parent = parent
|
||||
result.thiefID.store(SentinelThief, moRelaxed)
|
||||
result.completed.store(false, moRelaxed)
|
||||
result.waiter.store(nil, moRelaxed)
|
||||
result.fn = fn
|
||||
|
||||
proc new*(
|
||||
T: typedesc[Task],
|
||||
parent: ptr Task,
|
||||
fn: proc (param: pointer) {.nimcall, gcsafe.},
|
||||
params: auto): ptr Task {.inline.} =
|
||||
|
||||
const size = sizeof(T) + # size without Unchecked
|
||||
sizeof(params)
|
||||
|
||||
result = allocHeapUnchecked(T, size)
|
||||
result.parent = parent
|
||||
result.thiefID.store(SentinelThief, moRelaxed)
|
||||
result.completed.store(false, moRelaxed)
|
||||
result.waiter.store(nil, moRelaxed)
|
||||
result.fn = fn
|
||||
cast[ptr[type params]](result.data)[] = params
|
||||
|
||||
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
|
||||
|
||||
proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} =
|
||||
result.task = task
|
||||
result.task.hasFuture = true
|
||||
|
||||
# Task with future references themselves so that readyWith can be called
|
||||
# within the constructed
|
||||
# proc async_fn(param: pointer) {.nimcall.}
|
||||
# that can only access data
|
||||
cast[ptr ptr Task](task.data.addr)[] = task
|
||||
|
||||
proc cleanup*(fv: var Flowvar) {.inline.} =
|
||||
fv.task.freeHeap()
|
||||
fv.task = nil
|
||||
|
||||
func isSpawned*(fv: Flowvar): bool {.inline.} =
|
||||
## Returns true if a flowvar is spawned
|
||||
## This may be useful for recursive algorithms that
|
||||
## may or may not spawn a flowvar depending on a condition.
|
||||
## This is similar to Option or Maybe types
|
||||
return not fv.task.isNil
|
||||
|
||||
func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
|
||||
## Returns true if the result of a Flowvar is ready.
|
||||
## In that case `sync` will not block.
|
||||
## Otherwise the current will block to help on all the pending tasks
|
||||
## until the Flowvar is ready.
|
||||
fv.task.completed.load(moAcquire)
|
||||
|
||||
func readyWith*[T](task: ptr Task, childResult: T) {.inline.} =
|
||||
## Send the Flowvar result from the child thread processing the task
|
||||
## to its parent thread.
|
||||
precondition: not task.completed.load(moAcquire)
|
||||
cast[ptr (ptr Task, T)](task.data.addr)[1] = childResult
|
||||
task.completed.store(true, moRelease)
|
||||
|
||||
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
|
||||
## Blocks the current thread until the flowvar is available
|
||||
## and returned.
|
||||
## The thread is not idle and will complete pending tasks.
|
||||
mixin completeFuture
|
||||
completeFuture(fv, result)
|
||||
cleanup(fv)
|
||||
166
constantine/platforms/threadpool/docs/partitioners.md
Normal file
166
constantine/platforms/threadpool/docs/partitioners.md
Normal file
@ -0,0 +1,166 @@
|
||||
# Partitioners
|
||||
|
||||
For data parallelism (parallel for loops) there are 2 main scheduling strategies:
|
||||
- static scheduling, when work is regular (for example adding 2 matrices).
|
||||
In that case, splitting the loop
|
||||
into same-sized chunk provides perfect speedup, with no synchronization overhead.
|
||||
(Assuming threads have the same performance and no parasite load)
|
||||
- dynamic scheduling, when work is irregular (for example zero-ing buffers of different length).
|
||||
|
||||
Partitioners help implementing static scheduling.
|
||||
|
||||
Static scheduling
|
||||
=================
|
||||
|
||||
Usually static scheduling is problematic because the threshold below which running in parallel
|
||||
is both hardware dependent, data layout and function dependent, see https://github.com/zy97140/omp-benchmark-for-pytorch
|
||||
|
||||
| CPU Model | Sockets | Cores/Socket | Frequency |
|
||||
|------------------------------------|---------|--------------|-----------|
|
||||
| Intel(R) Xeon(R) CPU E5-2699 v4 | 2 | 22 | 2.20GHz |
|
||||
| Intel(R) Xeon(R) Platinum 8180 CPU | 2 | 28 | 2.50GHz |
|
||||
| Intel(R) Core(TM) i7-5960X CPU | 1 | 8 | 3.00GHz |
|
||||
|
||||
| contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU |
|
||||
|---------------|---------------------------|------------------------|--------------|
|
||||
| copy | 80k | 20k | 8k |
|
||||
| add | 80k | 20k | 8k |
|
||||
| div | 50k | 10k | 2k |
|
||||
| exp | 1k | 1k | 1k |
|
||||
| sin | 1k | 1k | 1k |
|
||||
| sum | 1k | 1k | 1k |
|
||||
| prod | 1k | 1k | 1k |
|
||||
|
||||
| non-contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU |
|
||||
|-------------------|---------------------------|------------------------|--------------|
|
||||
| copy | 20k | 8k | 2k |
|
||||
| add | 20k | 8k | 2k |
|
||||
| div | 10k | 8k | 1k |
|
||||
| exp | 1k | 1k | 1k |
|
||||
| sin | 2k | 2k | 1k |
|
||||
| sum | 1k | 1k | 1k |
|
||||
| prod | 1k | 1k | 1k |
|
||||
|
||||
|
||||
# Static partitioner
|
||||
====================
|
||||
|
||||
```Nim
|
||||
iterator balancedChunks*(start, stopEx, numChunks: int): tuple[chunkID, start, stopEx: int] =
|
||||
## Balanced chunking algorithm for a range [start, stopEx)
|
||||
## This splits a range into min(stopEx-start, numChunks) balanced regions
|
||||
## and returns a tuple (chunkID, offset, length)
|
||||
|
||||
# Rationale
|
||||
# The following simple chunking scheme can lead to severe load imbalance
|
||||
#
|
||||
# let chunk_offset = chunk_size * thread_id
|
||||
# let chunk_size = if thread_id < nb_chunks - 1: chunk_size
|
||||
# else: omp_size - chunk_offset
|
||||
#
|
||||
# For example dividing 40 items on 12 threads will lead to
|
||||
# a base_chunk_size of 40/12 = 3 so work on the first 11 threads
|
||||
# will be 3 * 11 = 33, and the remainder 7 on the last thread.
|
||||
#
|
||||
# Instead of dividing 40 work items on 12 cores into:
|
||||
# 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
|
||||
# the following scheme will divide into
|
||||
# 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
|
||||
#
|
||||
# This is compliant with OpenMP spec (page 60)
|
||||
# http://www.openmp.org/mp-documents/openmp-4.5.pdf
|
||||
# "When no chunk_size is specified, the iteration space is divided into chunks
|
||||
# that are approximately equal in size, and at most one chunk is distributed to
|
||||
# each thread. The size of the chunks is unspecified in this case."
|
||||
# ---> chunks are the same ±1
|
||||
|
||||
let
|
||||
totalIters = stopEx - start
|
||||
baseChunkSize = totalIters div numChunks
|
||||
cutoff = totalIters mod numChunks
|
||||
|
||||
for chunkID in 0 ..< min(numChunks, totalIters):
|
||||
if chunkID < cutoff:
|
||||
let offset = start + ((baseChunkSize + 1) * chunkID)
|
||||
let chunkSize = baseChunkSize + 1
|
||||
yield (chunkID, offset, offset+chunkSize)
|
||||
else:
|
||||
let offset = start + (baseChunkSize * chunkID) + cutoff
|
||||
let chunkSize = baseChunkSize
|
||||
yield (chunkID, offset, chunkSize)
|
||||
|
||||
when isMainModule:
|
||||
for chunk in balancedChunks(start = 0, stopEx = 40, numChunks = 12):
|
||||
echo chunk
|
||||
```
|
||||
|
||||
Dynamic scheduling
|
||||
==================
|
||||
|
||||
Dynamic schedulers decide at runtime whether to split a range for processing by multiple threads at runtime.
|
||||
Unfortunately most (all?) of those in production do not or cannot take into account
|
||||
the actual functions being called within a `parallel_for` section,
|
||||
and might split into too fine-grained chunks or into too coarse-grained chunks.
|
||||
Alternatively they might ask the programmer for a threshold below which not to split.
|
||||
As the programmer has no way to know if their code will run on a Raspberry Pi or a powerful workstation, that choice cannot be optimal.
|
||||
|
||||
Recent advances in research like "Lazy Binary Splitting" and "Lazy Tree Splitting"
|
||||
allows dynamic scheduling to fully adapt to the system current load and
|
||||
the parallel section computational needs without programmer input (grain size or minimal split threshold)
|
||||
by using backpressure.
|
||||
|
||||
|
||||
Why do we need partitioners for cryptographic workload then?
|
||||
============================================================
|
||||
|
||||
|
||||
Here are 3 basic cryptographic primitives that are non-trivial to parallelize:
|
||||
|
||||
1. batch elliptic-curve addition
|
||||
2. multi-scalar multiplication (MSM)
|
||||
3. batch signature verification via multi-pairing
|
||||
|
||||
On the other hand, merkle tree hashing for example is a primitive where both static or dynamic scheduling should give perfect speedup.
|
||||
|
||||
Let's take the example of batch EC addition.
|
||||
--------------------------------------------
|
||||
|
||||
There is a naive way, via doing a parallel EC sum reduction,
|
||||
for example for 1M points,
|
||||
using projective coordinates, each sum costs 12M (field multiplication)
|
||||
At first level we have 500k sums, then 250k sums, then 125k, ...
|
||||
The number of sums is ~1M, for a total cost of 12e⁶
|
||||
|
||||
```
|
||||
0 1 2 3 4 5 6 7
|
||||
+ + + +
|
||||
+ +
|
||||
+
|
||||
```
|
||||
|
||||
The fast way uses affine sum for an asymptotic cost of 6M, so 2x faster.
|
||||
but this requires an inversion, a fixed cost of ~131M (256-bit) to ~96M (384-bit)
|
||||
whatever number of additions we accumulate
|
||||
That inversion needs to be amortized into at least 20-25 additions.
|
||||
|
||||
Let's take a look at multi-pairing
|
||||
----------------------------------
|
||||
|
||||
Multi-pairing is split into 2 phases:
|
||||
- 1. the Miller-Loop, which is embarrassingly parallel: each thread can compute it on its own.
|
||||
- 2. reducing the n Miller-Loops into a single 𝔽pₖ point using parallel 𝔽pₖ sum reduction.
|
||||
- 3. Computing the final exponentiation, a fixed cost whatever the number of Miller Loops we did.
|
||||
3alt. Alternatively, computing n final exponentiations, and merging them with a 𝔽pₖ product reduction
|
||||
in that case, step 2 is not needed.
|
||||
|
||||
Conclusion
|
||||
----------
|
||||
|
||||
Dynamic scheduling for reduction with variable + fixed costs (independent of chunk size) is tricky.
|
||||
Furthermore the computations are regular, same workload per range and static scheduling seems like a great fit.
|
||||
|
||||
The downside is if a core has a parasite workload or on architecture like ARM big.Little or Alder Lake
|
||||
with performance and power-saving cores.
|
||||
|
||||
Alternatively, we can add dynamic scheduling hints about min and max chunk size so that
|
||||
chunk size is kept within the optimal range whatever the number of idle threads.
|
||||
42
constantine/platforms/threadpool/docs/random_permutations.md
Normal file
42
constantine/platforms/threadpool/docs/random_permutations.md
Normal file
@ -0,0 +1,42 @@
|
||||
# Random permutations
|
||||
|
||||
Work-stealing is more efficient when the thread we steal from is randomized.
|
||||
If all threads steal in the same order, we increase contention
|
||||
on the first victims' task queues.
|
||||
|
||||
The randomness quality is not important besides distributing potential contention,
|
||||
i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
|
||||
|
||||
Hence for efficiency, so that a thread can go to sleep faster, we want to
|
||||
reduce calls to the RNG, as:
|
||||
- Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement)
|
||||
- a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls.
|
||||
- If we want unbiased thread ID generation in a range, rejection sampling is costly (not a requirement).
|
||||
|
||||
Instead of using Fisher-Yates
|
||||
- generates the victim set eagerly, inefficient if the first steal attempts are successful
|
||||
- needs a RNG call when sampling a victim
|
||||
- memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32
|
||||
|
||||
or a sparseset
|
||||
- 1 RNG call when sampling a victim
|
||||
- memory usage: 2\*numThreads per thread so 2\*numthreads² uint8 (255 threads max) or uint32
|
||||
|
||||
we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m)
|
||||
If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m)
|
||||
with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m
|
||||
|
||||
References:
|
||||
- Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
|
||||
- Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484
|
||||
https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim
|
||||
https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim
|
||||
- Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
|
||||
|
||||
And if we want cryptographic strength:
|
||||
- Ciphers with Arbitrary Finite Domains
|
||||
John Black and Phillip Rogaway, 2001
|
||||
https://eprint.iacr.org/2001/012
|
||||
- An Enciphering Scheme Based on a Card Shuffle
|
||||
Viet Tung Hoang, Ben Morris, Phillip Rogaway
|
||||
https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf
|
||||
@ -0,0 +1,48 @@
|
||||
import ../threadpool
|
||||
|
||||
block: # Async without result
|
||||
|
||||
proc displayInt(x: int) =
|
||||
stdout.write(x)
|
||||
stdout.write(" - SUCCESS\n")
|
||||
|
||||
proc main() =
|
||||
echo "\n=============================================================================================="
|
||||
echo "Running 'threadpool/examples/e01_simple_tasks.nim'"
|
||||
echo "=============================================================================================="
|
||||
|
||||
echo "\nSanity check 1: Printing 123456 654321 in parallel"
|
||||
|
||||
var tp = Threadpool.new(numThreads = 4)
|
||||
tp.spawn displayInt(123456)
|
||||
tp.spawn displayInt(654321)
|
||||
tp.shutdown()
|
||||
|
||||
main()
|
||||
|
||||
block: # Async/Await
|
||||
var tp: Threadpool
|
||||
|
||||
proc asyncFib(n: int): int =
|
||||
if n < 2:
|
||||
return n
|
||||
|
||||
let x = tp.spawn asyncFib(n-1)
|
||||
let y = asyncFib(n-2)
|
||||
|
||||
result = sync(x) + y
|
||||
|
||||
proc main2() =
|
||||
echo "\n=============================================================================================="
|
||||
echo "Running 'threadpool/examples/e01_simple_tasks.nim'"
|
||||
echo "=============================================================================================="
|
||||
|
||||
echo "\nSanity check 2: fib(20)"
|
||||
|
||||
tp = Threadpool.new()
|
||||
let f = asyncFib(20)
|
||||
tp.shutdown()
|
||||
|
||||
doAssert f == 6765, "f was " & $f
|
||||
|
||||
main2()
|
||||
@ -0,0 +1,38 @@
|
||||
# Demo of API using a very inefficient π approximation algorithm.
|
||||
|
||||
import
|
||||
std/[strutils, math, cpuinfo],
|
||||
../threadpool
|
||||
|
||||
# From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim
|
||||
# Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
|
||||
proc term(k: int): float =
|
||||
if k mod 2 == 1:
|
||||
-4'f / float(2*k + 1)
|
||||
else:
|
||||
4'f / float(2*k + 1)
|
||||
|
||||
proc piApprox(tp: Threadpool, n: int): float =
|
||||
## Approximate π with `n` terms of the Leibniz series, one spawned task per term.
var pendingFuts = newSeq[Flowvar[float]](n)
|
||||
for k in 0 ..< pendingFuts.len:
|
||||
pendingFuts[k] = tp.spawn term(k) # Schedule a task on the threadpool and return a handle to retrieve the result.
|
||||
for k in 0 ..< pendingFuts.len:
|
||||
result += sync pendingFuts[k] # Block until the result is available.
|
||||
|
||||
proc main() =
|
||||
|
||||
echo "\n=============================================================================================="
|
||||
echo "Running 'threadpool/examples/e02_parallel_pi.nim'"
|
||||
echo "=============================================================================================="
|
||||
|
||||
var n = 1_000_000
|
||||
var nthreads = countProcessors()
|
||||
|
||||
var tp = Threadpool.new(num_threads = nthreads) # Default to the number of hardware threads.
|
||||
|
||||
echo formatFloat(tp.piApprox(n))
|
||||
|
||||
tp.shutdown()
|
||||
|
||||
# Compile with nim c -r -d:release --threads:on --outdir:build example.nim
|
||||
main()
|
||||
142
constantine/platforms/threadpool/instrumentation.nim
Normal file
142
constantine/platforms/threadpool/instrumentation.nim
Normal file
@ -0,0 +1,142 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import system/ansi_c
|
||||
|
||||
# Loggers
|
||||
# --------------------------------------------------------
|
||||
|
||||
template log*(args: varargs[untyped]): untyped =
|
||||
c_printf(args)
|
||||
flushFile(stdout)
|
||||
|
||||
template debugTermination*(body: untyped): untyped =
|
||||
when defined(TP_DebugTermination) or defined(TP_Debug):
|
||||
{.noSideEffect, gcsafe.}: body
|
||||
|
||||
template debug*(body: untyped): untyped =
|
||||
when defined(TP_Debug):
|
||||
{.noSideEffect, gcsafe.}: body
|
||||
|
||||
# --------------------------------------------------------
|
||||
|
||||
import std/macros
|
||||
|
||||
# A simple design-by-contract API
|
||||
# --------------------------------------------------------
|
||||
|
||||
# Everything should be a template that doesn't produce any code
|
||||
# when debugConstantine is not defined.
|
||||
# Those checks are controlled by a custom flag instead of
|
||||
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
|
||||
# Furthermore, we want them to be very lightweight on performance
|
||||
|
||||
func toHex*(a: SomeInteger): string =
  ## Hex-dump an integer as "0x" followed by 2*sizeof(a) lowercase hex digits
  ## (zero-padded, least-significant nibble last).
  ## Used by the debug/assertion machinery to print pointers and worker IDs.
  const hexChars = "0123456789abcdef"
  const L = 2*sizeof(a)
  result = newString(2 + L)
  result[0] = '0'
  result[1] = 'x'
  var a = a
  # Fill digits from the least-significant nibble upward, stopping at index 2:
  # the original loop counted down to 0 and clobbered the "0x" prefix it had
  # just written. `int(...)` keeps string indexing valid for unsigned inputs.
  # NOTE(review): for negative signed inputs `shr` sign-extension could loop
  # non-zero nibbles forever conceptually, but the loop is bounded by L;
  # call sites mask with `and 0xffff_ffff` so inputs are non-negative.
  for j in countdown(result.len-1, 2):
    result[j] = hexChars[int(a and 0xF)]
    a = a shr 4
|
||||
|
||||
proc inspectInfix(node: NimNode): NimNode =
|
||||
## Inspect an expression,
|
||||
## Returns the AST as string with runtime values inlined
|
||||
## from infix operators inlined.
|
||||
# TODO: pointer and custom type need a default repr
|
||||
# otherwise we can only resolve simple expressions
|
||||
proc inspect(node: NimNode): NimNode =
|
||||
case node.kind:
|
||||
of nnkInfix:
|
||||
return newCall(
|
||||
bindSym"&",
|
||||
newCall(
|
||||
bindSym"&",
|
||||
newCall(ident"$", inspect(node[1])),
|
||||
newLit(" " & $node[0] & " ")
|
||||
),
|
||||
newCall(ident"$", inspect(node[2]))
|
||||
)
|
||||
of {nnkIdent, nnkSym}:
|
||||
return node
|
||||
of nnkDotExpr:
|
||||
return quote do:
|
||||
when `node` is pointer or
|
||||
`node` is ptr or
|
||||
`node` is (proc):
|
||||
toHex(cast[ByteAddress](`node`) and 0xffff_ffff)
|
||||
else:
|
||||
$(`node`)
|
||||
of nnkPar:
|
||||
result = nnkPar.newTree()
|
||||
for sub in node:
|
||||
result.add inspect(sub)
|
||||
else:
|
||||
return node.toStrLit()
|
||||
return inspect(node)
|
||||
|
||||
macro assertContract(
|
||||
checkName: static string,
|
||||
predicate: untyped) =
|
||||
let lineinfo = lineInfoObj(predicate)
|
||||
|
||||
var strippedPredicate: NimNode
|
||||
if predicate.kind == nnkStmtList:
|
||||
assert predicate.len == 1, "Only one-liner conditions are supported"
|
||||
strippedPredicate = predicate[0]
|
||||
else:
|
||||
strippedPredicate = predicate
|
||||
|
||||
let debug = "\n Contract violated for " & checkName & " at " & $lineinfo &
|
||||
"\n " & $strippedPredicate.toStrLit &
|
||||
"\n The following values are contrary to expectations:" &
|
||||
"\n "
|
||||
let values = inspectInfix(strippedPredicate)
|
||||
let workerID = quote do:
|
||||
when declared(workerContext):
|
||||
$workerContext.id
|
||||
else:
|
||||
"N/A"
|
||||
let threadpoolID = quote do:
|
||||
when declared(workerContext):
|
||||
cast[ByteAddress](workerContext.threadpool).toHex()
|
||||
else:
|
||||
"N/A"
|
||||
|
||||
result = quote do:
|
||||
{.noSideEffect.}:
|
||||
when compileOption("assertions"):
|
||||
assert(`predicate`, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n")
|
||||
elif defined(TP_Asserts):
|
||||
if unlikely(not(`predicate`)):
|
||||
raise newException(AssertionError, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n")
|
||||
|
||||
|
||||
template preCondition*(require: untyped) =
|
||||
## Optional runtime check at the start of a function
|
||||
assertContract("pre-condition", require)
|
||||
|
||||
template postCondition*(ensure: untyped) =
|
||||
## Optional runtime check before returning from a function
|
||||
assertContract("post-condition", ensure)
|
||||
|
||||
template ascertain*(check: untyped) =
|
||||
## Optional runtime check in the middle of processing
|
||||
assertContract("transient condition", check)
|
||||
|
||||
# Sanity checks
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
when isMainModule:
|
||||
proc assertGreater(x, y: int) =
|
||||
postcondition(x > y)
|
||||
|
||||
# We should get a nicely formatted exception
|
||||
assertGreater(10, 12)
|
||||
160
constantine/platforms/threadpool/parallel_offloading.nim
Normal file
160
constantine/platforms/threadpool/parallel_offloading.nim
Normal file
@ -0,0 +1,160 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/macros,
|
||||
./instrumentation,
|
||||
./crossthread/tasks_flowvars
|
||||
|
||||
# Task parallelism - spawn
|
||||
# ---------------------------------------------
|
||||
|
||||
proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
|
||||
# Create the async function
|
||||
let fn = funcCall[0]
|
||||
let fnName = $fn
|
||||
let withArgs = args.len > 0
|
||||
let async_fn = ident("async_" & fnName)
|
||||
var fnCall = newCall(fn)
|
||||
let data = ident("data") # typed pointer to data
|
||||
|
||||
# Schedule
|
||||
let task = ident"task"
|
||||
let scheduleBlock = newCall(schedule, workerContext, task)
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
if funcCall.len == 2:
|
||||
# With only 1 arg, the tuple syntax doesn't construct a tuple
|
||||
# let data = (123) # is an int
|
||||
fnCall.add nnkDerefExpr.newTree(data)
|
||||
else: # This handles the 0 arg case as well
|
||||
for i in 1 ..< funcCall.len:
|
||||
fnCall.add nnkBracketExpr.newTree(
|
||||
data,
|
||||
newLit i-1
|
||||
)
|
||||
|
||||
# Create the async call
|
||||
result.add quote do:
|
||||
proc `async_fn`(param: pointer) {.nimcall.} =
|
||||
# preCondition: not isRootTask(`workerContext`.currentTask)
|
||||
|
||||
when bool(`withArgs`):
|
||||
let `data` = cast[ptr `argsTy`](param)
|
||||
`fnCall`
|
||||
|
||||
# Create the task
|
||||
result.add quote do:
|
||||
block enq_deq_task:
|
||||
when bool(`withArgs`):
|
||||
let `task` = Task.new(
|
||||
parent = `workerContext`.currentTask,
|
||||
fn = `async_fn`,
|
||||
params = `args`)
|
||||
else:
|
||||
let `task` = Task.new(
|
||||
parent = `workerContext`.currentTask,
|
||||
fn = `async_fn`)
|
||||
`scheduleBlock`
|
||||
|
||||
proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
|
||||
# Create the async function
|
||||
let fn = funcCall[0]
|
||||
let fnName = $fn
|
||||
let async_fn = ident("async_" & fnName)
|
||||
var fnCall = newCall(fn)
|
||||
let data = ident("data") # typed pointer to data
|
||||
|
||||
# Schedule
|
||||
let task = ident"task"
|
||||
let scheduleBlock = newCall(schedule, workerContext, task)
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
# tasks have no return value.
|
||||
# 1. The start of the task `data` buffer will store the return value for the flowvar and awaiter/sync
|
||||
# 2. We create a wrapper async_fn without return value that send the return value in the channel
|
||||
# 3. We package that wrapper function in a task
|
||||
|
||||
# We store the following in task.data:
|
||||
#
|
||||
# | ptr Task | result | arg₀ | arg₁ | ... | argₙ
|
||||
let fut = ident"fut"
|
||||
let taskSelfReference = ident"taskSelfReference"
|
||||
let retVal = ident"retVal"
|
||||
|
||||
var futArgs = nnkPar.newTree
|
||||
var futArgsTy = nnkPar.newTree
|
||||
futArgs.add taskSelfReference
|
||||
futArgsTy.add nnkPtrTy.newTree(bindSym"Task")
|
||||
futArgs.add retVal
|
||||
futArgsTy.add retTy
|
||||
|
||||
for i in 1 ..< funcCall.len:
|
||||
futArgsTy.add getTypeInst(funcCall[i])
|
||||
futArgs.add funcCall[i]
|
||||
|
||||
# data stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
|
||||
# so arguments starts at data[2] in the wrapping funcCall functions
|
||||
for i in 1 ..< funcCall.len:
|
||||
fnCall.add nnkBracketExpr.newTree(
|
||||
data,
|
||||
newLit i+1
|
||||
)
|
||||
|
||||
result.add quote do:
|
||||
proc `async_fn`(param: pointer) {.nimcall.} =
|
||||
# preCondition: not isRootTask(`workerContext`.currentTask)
|
||||
|
||||
let `data` = cast[ptr `futArgsTy`](param)
|
||||
let res = `fnCall`
|
||||
readyWith(`data`[0], res)
|
||||
|
||||
# Regenerate fresh ident, retTy has been tagged as a function call param
|
||||
let retTy = ident($retTy)
|
||||
|
||||
# Create the task
|
||||
result.add quote do:
|
||||
block enq_deq_task:
|
||||
let `taskSelfReference` = cast[ptr Task](0xDEADBEEF)
|
||||
let `retVal` = default(`retTy`)
|
||||
|
||||
let `task` = Task.new(
|
||||
parent = `workerContext`.currentTask,
|
||||
fn = `async_fn`,
|
||||
params = `futArgs`)
|
||||
let `fut` = newFlowvar(`retTy`, `task`)
|
||||
`scheduleBlock`
|
||||
# Return the future
|
||||
`fut`
|
||||
|
||||
proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule: NimNode): NimNode =
|
||||
funcCall.expectKind(nnkCall)
|
||||
|
||||
# Get the return type if any
|
||||
let retType = funcCall[0].getImpl[3][0]
|
||||
let needFuture = retType.kind != nnkEmpty
|
||||
|
||||
# Get a serialized type and data for all function arguments
|
||||
# We use adhoc tuple
|
||||
var argsTy = nnkPar.newTree()
|
||||
var args = nnkPar.newTree()
|
||||
for i in 1 ..< funcCall.len:
|
||||
argsTy.add getTypeInst(funcCall[i])
|
||||
args.add funcCall[i]
|
||||
|
||||
# Package in a task
|
||||
if not needFuture:
|
||||
result = spawnVoid(funcCall, args, argsTy, workerContext, schedule)
|
||||
else:
|
||||
result = spawnRet(funcCall, retType, args, argsTy, workerContext, schedule)
|
||||
|
||||
# Wrap in a block for namespacing
|
||||
result = nnkBlockStmt.newTree(newEmptyNode(), result)
|
||||
# echo result.toStrLit
|
||||
53
constantine/platforms/threadpool/primitives/barriers.md
Normal file
53
constantine/platforms/threadpool/primitives/barriers.md
Normal file
@ -0,0 +1,53 @@
|
||||
# Synchronization Barriers
|
||||
|
||||
OSX does not implement pthread_barrier as it's an optional part
|
||||
of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch.
|
||||
|
||||
So we need to roll our own with a POSIX compatible API.
|
||||
|
||||
## Glibc barriers, design bug and implementation
|
||||
|
||||
> Note: due to GPL licensing, do not lift the code.
|
||||
> Not that we could, as it is heavily dependent on futexes
|
||||
> which are not available on OSX
|
||||
|
||||
We need to make sure that we don't hit the same bug
|
||||
as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065
|
||||
which seems to be an issue in some of the barrier implementations
|
||||
in the wild.
|
||||
|
||||
The design of Glibc barriers is here:
|
||||
https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD
|
||||
|
||||
And implementation:
|
||||
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD
|
||||
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD`
|
||||
- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD
|
||||
|
||||
## Synchronization barrier techniques
|
||||
|
||||
This article goes over the techniques of
|
||||
"pool barrier" and "ticket barrier"
|
||||
https://locklessinc.com/articles/barriers/
|
||||
to reach 2x to 20x the speed of pthreads barrier
|
||||
|
||||
This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf
|
||||
goes over
|
||||
- centralized barrier with sense reversal
|
||||
- combining tree barrier
|
||||
- dissemination barrier
|
||||
- tournament barrier
|
||||
- scalable tree barrier
|
||||
More courses:
|
||||
- http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf
|
||||
|
||||
It however requires lightweight mutexes like Linux futexes
|
||||
that OSX lacks.
|
||||
|
||||
This post goes over lightweight mutexes like Benaphores (from BeOS)
|
||||
https://preshing.com/20120226/roll-your-own-lightweight-mutex/
|
||||
|
||||
This gives a few barrier implementations
|
||||
http://gallium.inria.fr/~maranget/MPRI/02.pdf
|
||||
and refers to Cubible paper for formally verifying synchronization barriers
|
||||
http://cubicle.lri.fr/papers/jfla2014.pdf (in French)
|
||||
71
constantine/platforms/threadpool/primitives/barriers.nim
Normal file
71
constantine/platforms/threadpool/primitives/barriers.nim
Normal file
@ -0,0 +1,71 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
{.push raises: [], checks: off.} # No exceptions or overflow/conversion checks
|
||||
|
||||
when defined(windows):
|
||||
import ./barriers_windows
|
||||
when compileOption("assertions"):
|
||||
import os
|
||||
|
||||
type SyncBarrier* = SynchronizationBarrier
|
||||
|
||||
proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} =
|
||||
## Initialize a synchronization barrier that will block ``threadCount`` threads
|
||||
## before release.
|
||||
let err {.used.} = InitializeSynchronizationBarrier(syncBarrier, cast[int32](threadCount), -1)
|
||||
when compileOption("assertions"):
|
||||
if err != 1:
|
||||
assert err == 0
|
||||
raiseOSError(osLastError())
|
||||
|
||||
proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
|
||||
## Blocks thread at a synchronization barrier.
|
||||
## Returns true for one of the threads (the last one on Windows, undefined on Posix)
|
||||
## and false for the others.
|
||||
result = bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE)
|
||||
|
||||
proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
|
||||
## Deletes a synchronization barrier.
|
||||
## This assumes no race between waiting at a barrier and deleting it,
|
||||
## and reuse of the barrier requires initialization.
|
||||
DeleteSynchronizationBarrier(syncBarrier.addr)
|
||||
|
||||
else:
|
||||
import ./barriers_posix
|
||||
when compileOption("assertions"):
|
||||
import os
|
||||
|
||||
type SyncBarrier* = PthreadBarrier
|
||||
|
||||
proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} =
|
||||
## Initialize a synchronization barrier that will block ``threadCount`` threads
|
||||
## before release.
|
||||
let err {.used.} = pthread_barrier_init(syncBarrier, nil, threadCount)
|
||||
when compileOption("assertions"):
|
||||
if err != 0:
|
||||
raiseOSError(OSErrorCode(err))
|
||||
|
||||
proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} =
|
||||
## Blocks thread at a synchronization barrier.
|
||||
## Returns true for one of the threads (the last one on Windows, undefined on Posix)
|
||||
## and false for the others.
|
||||
let err {.used.} = pthread_barrier_wait(syncBarrier)
|
||||
when compileOption("assertions"):
|
||||
if err != PTHREAD_BARRIER_SERIAL_THREAD and err < 0:
|
||||
raiseOSError(OSErrorCode(err))
|
||||
result = if err == PTHREAD_BARRIER_SERIAL_THREAD: true
|
||||
else: false
|
||||
|
||||
proc delete*(syncBarrier: sink SyncBarrier) {.inline.} =
|
||||
## Deletes a synchronization barrier.
|
||||
## This assumes no race between waiting at a barrier and deleting it,
|
||||
## and reuse of the barrier requires initialization.
|
||||
let err {.used.} = pthread_barrier_destroy(syncBarrier)
|
||||
when compileOption("assertions"):
|
||||
if err < 0:
|
||||
raiseOSError(OSErrorCode(err))
|
||||
@ -0,0 +1,84 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# OSX doesn't implement pthread_barrier_t
|
||||
# It's an optional part of the POSIX standard
|
||||
#
|
||||
# This is a manual implementation of a sense reversing barrier
|
||||
|
||||
import std/locks
|
||||
|
||||
type
|
||||
Errno* = cint
|
||||
|
||||
PthreadBarrierAttr* = object
|
||||
## Dummy
|
||||
PthreadBarrier* = object
|
||||
## Implementation of a sense reversing barrier
|
||||
## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit)
|
||||
|
||||
lock: Lock # Alternatively spinlock on Atomic
|
||||
cond {.guard: lock.}: Cond
|
||||
sense {.guard: lock.}: bool # Choose int32 to avoid zero-expansion cost in registers?
|
||||
left {.guard: lock.}: cuint # Number of threads missing at the barrier before opening
|
||||
count: cuint # Total number of threads that need to arrive before opening the barrier
|
||||
|
||||
const
|
||||
PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1)
|
||||
|
||||
proc pthread_cond_broadcast(cond: var Cond): Errno {.header:"<pthread.h>".}
|
||||
## Nim only signal one thread in locks
|
||||
## We need to unblock all
|
||||
|
||||
proc broadcast(cond: var Cond) {.inline.}=
|
||||
discard pthread_cond_broadcast(cond)
|
||||
|
||||
func pthread_barrier_init*(
|
||||
barrier: var PthreadBarrier,
|
||||
attr: ptr PthreadBarrierAttr,
|
||||
count: cuint
|
||||
): Errno =
|
||||
barrier.lock.initLock()
|
||||
{.locks: [barrier.lock].}:
|
||||
barrier.cond.initCond()
|
||||
barrier.left = count
|
||||
barrier.count = count
|
||||
# barrier.sense = false
|
||||
|
||||
proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno =
|
||||
## Wait on `barrier`
|
||||
## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
|
||||
## Returns 0 for the other
|
||||
## Returns Errno if there is an error
|
||||
barrier.lock.acquire()
|
||||
{.locks: [barrier.lock].}:
|
||||
var local_sense = barrier.sense # Thread local sense
|
||||
dec barrier.left
|
||||
|
||||
if barrier.left == 0:
|
||||
# Last thread to arrive at the barrier
|
||||
# Reverse phase and release it
|
||||
barrier.left = barrier.count
|
||||
barrier.sense = not barrier.sense
|
||||
barrier.cond.broadcast()
|
||||
barrier.lock.release()
|
||||
return PTHREAD_BARRIER_SERIAL_THREAD
|
||||
|
||||
while barrier.sense == local_sense:
|
||||
# We are waiting for threads
|
||||
# Wait for the sense to reverse
|
||||
# while loop because we might have spurious wakeups
|
||||
barrier.cond.wait(barrier.lock)
|
||||
|
||||
# Reversed, we can leave the barrier
|
||||
barrier.lock.release()
|
||||
return Errno(0)
|
||||
|
||||
proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno =
|
||||
{.locks: [barrier.lock].}:
|
||||
barrier.cond.deinitCond()
|
||||
barrier.lock.deinitLock()
|
||||
@ -0,0 +1,56 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# Abstractions over POSIX barriers (non-)implementations
|
||||
|
||||
when not compileOption("threads"):
|
||||
{.error: "This requires --threads:on compilation flag".}
|
||||
|
||||
# Types
|
||||
# -------------------------------------------------------
|
||||
|
||||
when defined(osx):
|
||||
import ./barriers_macos
|
||||
export PthreadBarrierAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD
|
||||
else:
|
||||
type
|
||||
PthreadBarrierAttr* {.importc: "pthread_barrierattr_t", header: "<sys/types.h>", byref.} = object
|
||||
when (defined(linux) and not defined(android)) and defined(amd64):
|
||||
abi: array[4 div sizeof(cint), cint] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l45
|
||||
PthreadBarrier* {.importc: "pthread_barrier_t", header: "<sys/types.h>", byref.} = object
|
||||
when (defined(linux) and not defined(android)) and defined(amd64):
|
||||
abi: array[32 div sizeof(clong), clong] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l28
|
||||
|
||||
|
||||
Errno* = cint
|
||||
|
||||
var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno
|
||||
|
||||
# Pthread
|
||||
# -------------------------------------------------------
|
||||
when defined(osx):
|
||||
export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy
|
||||
else:
|
||||
proc pthread_barrier_init*(
|
||||
barrier: PthreadBarrier,
|
||||
attr: ptr PthreadBarrierAttr,
|
||||
count: cuint
|
||||
): Errno {.header: "<pthread.h>".}
|
||||
## Initialize `barrier` with the attributes `attr`.
|
||||
## The barrier is opened when `count` waiters arrived.
|
||||
|
||||
proc pthread_barrier_destroy*(
|
||||
barrier: sink PthreadBarrier): Errno {.header: "<pthread.h>".}
|
||||
## Destroy a previously dynamically initialized `barrier`.
|
||||
|
||||
proc pthread_barrier_wait*(
|
||||
barrier: var PthreadBarrier
|
||||
): Errno {.header: "<pthread.h>".}
|
||||
## Wait on `barrier`
|
||||
## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
|
||||
## Returns 0 for the other
|
||||
## Returns Errno if there is an error
|
||||
@ -0,0 +1,31 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import winlean
|
||||
|
||||
# Technically in <synchapi.h> but MSVC complains with
|
||||
# @m..@s..@sweave@sscheduler.nim.cpp
|
||||
# C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture"
|
||||
|
||||
type
|
||||
SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"<windows.h>".} = object
|
||||
|
||||
var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "<windows.h>".}: DWORD
|
||||
## Skip expensive checks on barrier enter if a barrier is never deleted.
|
||||
|
||||
proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "<windows.h>".}
|
||||
proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "<windows.h>".}
|
||||
proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "<windows.h>".}
|
||||
|
||||
when isMainModule:
|
||||
import os
|
||||
|
||||
var x{.noinit.}: SynchronizationBarrier
|
||||
let err = InitializeSynchronizationBarrier(x, 2, -1)
|
||||
if err != 1:
|
||||
assert err == 0
|
||||
raiseOSError(osLastError())
|
||||
18
constantine/platforms/threadpool/primitives/futexes.nim
Normal file
18
constantine/platforms/threadpool/primitives/futexes.nim
Normal file
@ -0,0 +1,18 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
when defined(linux):
|
||||
import ./futexes_linux
|
||||
export futexes_linux
|
||||
elif defined(windows):
|
||||
import ./futexes_windows
|
||||
export futexes_windows
|
||||
elif defined(osx):
|
||||
import ./futexes_macos
|
||||
export futexes_macos
|
||||
else:
|
||||
{.error: "Futexes are not implemented for your OS".}
|
||||
@ -0,0 +1,68 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# A wrapper for linux futex.
|
||||
# Condition variables do not always wake on signal which can deadlock the runtime
|
||||
# so we need to roll up our sleeves and use the low-level futex API.
|
||||
|
||||
import std/atomics
|
||||
export MemoryOrder
|
||||
|
||||
type
|
||||
Futex* = object
|
||||
value: Atomic[uint32]
|
||||
FutexOp = distinct cint
|
||||
|
||||
var NR_Futex {.importc: "__NR_futex", header: "<sys/syscall.h>".}: cint
|
||||
var FutexWaitPrivate {.importc:"FUTEX_WAIT_PRIVATE", header: "<linux/futex.h>".}: FutexOp
|
||||
var FutexWakePrivate {.importc:"FUTEX_WAKE_PRIVATE", header: "<linux/futex.h>".}: FutexOp
|
||||
|
||||
proc syscall(sysno: clong): cint {.header:"<unistd.h>", varargs.}
|
||||
|
||||
proc sysFutex(
|
||||
futex: var Futex, op: FutexOp, val1: cuint or cint,
|
||||
timeout: pointer = nil, val2: pointer = nil, val3: cint = 0): cint {.inline.} =
|
||||
syscall(NR_Futex, futex.value.addr, op, val1, timeout, val2, val3)
|
||||
|
||||
proc initialize*(futex: var Futex) {.inline.} =
|
||||
futex.value.store(0, moRelaxed)
|
||||
|
||||
proc teardown*(futex: var Futex) {.inline.} =
|
||||
futex.value.store(0, moRelaxed)
|
||||
|
||||
proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
|
||||
futex.value.load(order)
|
||||
|
||||
proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
|
||||
futex.value
|
||||
|
||||
proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
|
||||
futex.value.store(value, order)
|
||||
|
||||
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
|
||||
## Suspend a thread if the value of the futex is the same as refVal.
|
||||
|
||||
# Returns 0 in case of a successful suspend
|
||||
# If value are different, it returns EWOULDBLOCK
|
||||
# We discard as this is not needed and simplifies compat with Windows futex
|
||||
discard sysFutex(futex, FutexWaitPrivate, refVal)
|
||||
|
||||
proc wake*(futex: var Futex) {.inline.} =
|
||||
## Wake one thread (from the same process)
|
||||
|
||||
# Returns the number of actually woken threads
|
||||
# or a Posix error code (if negative)
|
||||
# We discard as this is not needed and simplifies compat with Windows futex
|
||||
discard sysFutex(futex, FutexWakePrivate, 1)
|
||||
|
||||
proc wakeAll*(futex: var Futex) {.inline.} =
|
||||
## Wake all threads (from the same process)
|
||||
|
||||
# Returns the number of actually woken threads
|
||||
# or a Posix error code (if negative)
|
||||
# We discard as this is not needed and simplifies compat with Windows futex
|
||||
discard sysFutex(futex, FutexWakePrivate, high(int32))
|
||||
109
constantine/platforms/threadpool/primitives/futexes_macos.nim
Normal file
109
constantine/platforms/threadpool/primitives/futexes_macos.nim
Normal file
@ -0,0 +1,109 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import std/atomics
|
||||
|
||||
# A wrapper for Darwin futex.
|
||||
# They are used in libc++ so likely to be very stable.
|
||||
# A new API appeared in OSX Big Sur (Jan 2021) ulock_wait2 and macOS pthread_cond_t has been migrated to it
|
||||
# - https://github.com/apple/darwin-xnu/commit/d4061fb0260b3ed486147341b72468f836ed6c8f#diff-08f993cc40af475663274687b7c326cc6c3031e0db3ac8de7b24624610616be6
|
||||
#
|
||||
# The old API is ulock_wait
|
||||
# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/kern/sys_ulock.c.auto.html
|
||||
# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/sys/ulock.h.auto.html
|
||||
|
||||
{.push hint[XDeclaredButNotUsed]: off.}
|
||||
|
||||
const UL_COMPARE_AND_WAIT = 1
|
||||
const UL_UNFAIR_LOCK = 2
|
||||
const UL_COMPARE_AND_WAIT_SHARED = 3
|
||||
const UL_UNFAIR_LOCK64_SHARED = 4
|
||||
const UL_COMPARE_AND_WAIT64 = 5
|
||||
const UL_COMPARE_AND_WAIT64_SHARED = 6
|
||||
# obsolete names
|
||||
const UL_OSSPINLOCK = UL_COMPARE_AND_WAIT
|
||||
const UL_HANDOFFLOCK = UL_UNFAIR_LOCK
|
||||
# These operation code are only implemented in (DEVELOPMENT || DEBUG) kernels
|
||||
const UL_DEBUG_SIMULATE_COPYIN_FAULT = 253
|
||||
const UL_DEBUG_HASH_DUMP_ALL = 254
|
||||
const UL_DEBUG_HASH_DUMP_PID = 255
|
||||
|
||||
# operation bits [15, 8] contain the flags for __ulock_wake
|
||||
#
|
||||
const ULF_WAKE_ALL = 0x00000100
|
||||
const ULF_WAKE_THREAD = 0x00000200
|
||||
const ULF_WAKE_ALLOW_NON_OWNER = 0x00000400
|
||||
|
||||
# operation bits [23, 16] contain the flags for __ulock_wait
|
||||
#
|
||||
# @const ULF_WAIT_WORKQ_DATA_CONTENTION
|
||||
# The waiter is contending on this lock for synchronization around global data.
|
||||
# This causes the workqueue subsystem to not create new threads to offset for
|
||||
# waiters on this lock.
|
||||
#
|
||||
# @const ULF_WAIT_CANCEL_POINT
|
||||
# This wait is a cancelation point
|
||||
#
|
||||
# @const ULF_WAIT_ADAPTIVE_SPIN
|
||||
# Use adaptive spinning when the thread that currently holds the unfair lock
|
||||
# is on core.
|
||||
const ULF_WAIT_WORKQ_DATA_CONTENTION = 0x00010000
|
||||
const ULF_WAIT_CANCEL_POINT = 0x00020000
|
||||
const ULF_WAIT_ADAPTIVE_SPIN = 0x00040000
|
||||
|
||||
# operation bits [31, 24] contain the generic flags
|
||||
const ULF_NO_ERRNO = 0x01000000
|
||||
|
||||
# masks
|
||||
const UL_OPCODE_MASK = 0x000000FF
|
||||
const UL_FLAGS_MASK = 0xFFFFFF00
|
||||
const ULF_GENERIC_MASK = 0xFFFF0000
|
||||
|
||||
const ULF_WAIT_MASK = ULF_NO_ERRNO or
|
||||
ULF_WAIT_WORKQ_DATA_CONTENTION or
|
||||
ULF_WAIT_CANCEL_POINT or
|
||||
ULF_WAIT_ADAPTIVE_SPIN
|
||||
|
||||
const ULF_WAKE_MASK = ULF_NO_ERRNO or
|
||||
ULF_WAKE_ALL or
|
||||
ULF_WAKE_THREAD or
|
||||
ULF_WAKE_ALLOW_NON_OWNER
|
||||
|
||||
proc ulock_wait(operation: uint32, address: pointer, value: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.}
|
||||
proc ulock_wait2(operation: uint32, address: pointer, value: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.}
|
||||
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.}
|
||||
|
||||
type
|
||||
Futex* = object
|
||||
value: Atomic[uint32]
|
||||
|
||||
proc initialize*(futex: var Futex) {.inline.} =
|
||||
futex.value.store(0, moRelaxed)
|
||||
|
||||
proc teardown*(futex: var Futex) {.inline.} =
|
||||
futex.value.store(0, moRelaxed)
|
||||
|
||||
proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
|
||||
futex.value.load(order)
|
||||
|
||||
proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
|
||||
futex.value
|
||||
|
||||
proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
|
||||
futex.value.store(value, order)
|
||||
|
||||
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
|
||||
## Suspend a thread if the value of the futex is the same as refVal.
|
||||
discard ulock_wait(UL_UNFAIR_LOCK64_SHARED or ULF_NO_ERRNO, futex.value.addr, uint64 refVal, 0)
|
||||
|
||||
proc wake*(futex: var Futex) {.inline.} =
|
||||
## Wake one thread (from the same process)
|
||||
discard ulock_wake(ULF_WAKE_THREAD or ULF_NO_ERRNO, futex.value.addr, 0)
|
||||
|
||||
proc wakeAll*(futex: var Futex) {.inline.} =
|
||||
## Wake all threads (from the same process)
|
||||
discard ulock_wake(ULF_WAKE_ALL or ULF_NO_ERRNO, futex.value.addr, 0)
|
||||
@ -0,0 +1,59 @@
|
||||
# Weave
|
||||
# Copyright (c) 2019 Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# An implementation of futex using Windows primitives
|
||||
|
||||
import std/atomics, winlean
|
||||
export MemoryOrder
|
||||
|
||||
type
|
||||
Futex* = object
|
||||
value: Atomic[uint32]
|
||||
|
||||
# Contrary to the documentation, the futex related primitives are NOT in kernel32.dll
|
||||
# but in API-MS-Win-Core-Synch-l1-2-0.dll ¯\_(ツ)_/¯
|
||||
|
||||
proc initialize*(futex: var Futex) {.inline.} =
|
||||
futex.value.store(0, moRelaxed)
|
||||
|
||||
proc teardown*(futex: var Futex) {.inline.} =
|
||||
futex.value.store(0, moRelaxed)
|
||||
|
||||
proc WaitOnAddress(
|
||||
Address: pointer, CompareAddress: pointer,
|
||||
AddressSize: csize_t, dwMilliseconds: DWORD
|
||||
): WINBOOL {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
|
||||
# The Address should be volatile
|
||||
|
||||
proc WakeByAddressSingle(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
|
||||
proc WakeByAddressAll(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".}
|
||||
|
||||
proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} =
|
||||
futex.value.load(order)
|
||||
|
||||
proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} =
|
||||
futex.value
|
||||
|
||||
proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} =
|
||||
futex.value.store(value, order)
|
||||
|
||||
proc wait*(futex: var Futex, refVal: uint32) {.inline.} =
|
||||
## Suspend a thread if the value of the futex is the same as refVal.
|
||||
|
||||
# Returns TRUE if the wait succeeds or FALSE if not.
|
||||
# getLastError() will contain the error information, for example
|
||||
# if it failed due to a timeout.
|
||||
# We discard as this is not needed and simplifies compat with Linux futex
|
||||
discard WaitOnAddress(futex.value.addr, refVal.unsafeAddr, csize_t sizeof(refVal), INFINITE)
|
||||
|
||||
proc wake*(futex: var Futex) {.inline.} =
|
||||
## Wake one thread (from the same process)
|
||||
WakeByAddressSingle(futex.value.addr)
|
||||
|
||||
proc wakeAll*(futex: var Futex) {.inline.} =
|
||||
## Wake all threads (from the same process)
|
||||
WakeByAddressAll(futex.value.addr)
|
||||
555
constantine/platforms/threadpool/threadpool.nim
Normal file
555
constantine/platforms/threadpool/threadpool.nim
Normal file
@ -0,0 +1,555 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
when not compileOption("threads"):
|
||||
{.error: "This requires --threads:on compilation flag".}
|
||||
|
||||
{.push raises: [].}
|
||||
|
||||
import
|
||||
std/[cpuinfo, atomics, macros],
|
||||
./crossthread/[
|
||||
taskqueues,
|
||||
backoff,
|
||||
tasks_flowvars],
|
||||
./instrumentation,
|
||||
./primitives/barriers,
|
||||
./parallel_offloading,
|
||||
../allocs, ../bithacks,
|
||||
../../../helpers/prng_unsafe
|
||||
|
||||
export
|
||||
# flowvars
|
||||
Flowvar, isSpawned, isReady, sync
|
||||
|
||||
type
|
||||
WorkerID = uint32
|
||||
Signal = object
|
||||
terminate {.align: 64.}: Atomic[bool]
|
||||
|
||||
WorkerContext = object
|
||||
## Thread-local worker context
|
||||
|
||||
# Params
|
||||
id: WorkerID
|
||||
threadpool: Threadpool
|
||||
|
||||
# Tasks
|
||||
taskqueue: ptr Taskqueue # owned task queue
|
||||
currentTask: ptr Task
|
||||
|
||||
# Synchronization
|
||||
localBackoff: EventNotifier # Multi-Producer Single-Consumer backoff
|
||||
signal: ptr Signal # owned signal
|
||||
|
||||
# Thefts
|
||||
rng: RngState # RNG state to select victims
|
||||
|
||||
# Adaptative theft policy
|
||||
stealHalf: bool
|
||||
recentTasks: uint32
|
||||
recentThefts: uint32
|
||||
recentTheftsAdaptative: uint32
|
||||
recentLeaps: uint32
|
||||
|
||||
Threadpool* = ptr object
|
||||
barrier: SyncBarrier # Barrier for initialization and teardown
|
||||
# -- align: 64
|
||||
globalBackoff: EventCount # Multi-Producer Multi-Consumer backoff
|
||||
# -- align: 64
|
||||
numThreads*{.align: 64.}: uint32
|
||||
workerQueues: ptr UncheckedArray[Taskqueue]
|
||||
workers: ptr UncheckedArray[Thread[(Threadpool, WorkerID)]]
|
||||
workerSignals: ptr UncheckedArray[Signal]
|
||||
|
||||
# Thread-local config
|
||||
# ---------------------------------------------
|
||||
|
||||
var workerContext {.threadvar.}: WorkerContext
|
||||
## Thread-local Worker context
|
||||
|
||||
proc setupWorker() =
|
||||
## Initialize the thread-local context of a worker
|
||||
## Requires the ID and threadpool fields to be initialized
|
||||
template ctx: untyped = workerContext
|
||||
|
||||
preCondition: not ctx.threadpool.isNil()
|
||||
preCondition: 0 <= ctx.id and ctx.id < ctx.threadpool.numThreads.uint32
|
||||
preCondition: not ctx.threadpool.workerQueues.isNil()
|
||||
preCondition: not ctx.threadpool.workerSignals.isNil()
|
||||
|
||||
# Thefts
|
||||
ctx.rng.seed(0xEFFACED + ctx.id)
|
||||
|
||||
# Synchronization
|
||||
ctx.localBackoff.initialize()
|
||||
ctx.signal = addr ctx.threadpool.workerSignals[ctx.id]
|
||||
ctx.signal.terminate.store(false, moRelaxed)
|
||||
|
||||
# Tasks
|
||||
ctx.taskqueue = addr ctx.threadpool.workerQueues[ctx.id]
|
||||
ctx.currentTask = nil
|
||||
|
||||
# Init
|
||||
ctx.taskqueue[].init(initialCapacity = 32)
|
||||
|
||||
# Adaptative theft policy
|
||||
ctx.recentTasks = 0
|
||||
ctx.recentThefts = 0
|
||||
ctx.recentTheftsAdaptative = 0
|
||||
ctx.recentLeaps = 0
|
||||
|
||||
proc teardownWorker() =
|
||||
## Cleanup the thread-local context of a worker
|
||||
workerContext.localBackoff.`=destroy`()
|
||||
workerContext.taskqueue[].teardown()
|
||||
|
||||
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].}
|
||||
|
||||
proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises: [Exception].} =
|
||||
## On the start of the threadpool workers will execute this
|
||||
## until they receive a termination signal
|
||||
# We assume that thread_local variables start all at their binary zero value
|
||||
preCondition: workerContext == default(WorkerContext)
|
||||
|
||||
template ctx: untyped = workerContext
|
||||
|
||||
# If the following crashes, you need --tlsEmulation:off
|
||||
ctx.id = params.id
|
||||
ctx.threadpool = params.threadpool
|
||||
|
||||
setupWorker()
|
||||
|
||||
# 1 matching barrier in Threadpool.new() for root thread
|
||||
discard params.threadpool.barrier.wait()
|
||||
|
||||
{.cast(gcsafe).}: # Compiler does not consider that GC-safe by default when multi-threaded due to thread-local variables
|
||||
ctx.eventLoop()
|
||||
|
||||
debugTermination:
|
||||
log(">>> Worker %2d shutting down <<<\n", ctx.id)
|
||||
|
||||
# 1 matching barrier in threadpool.shutdown() for root thread
|
||||
discard params.threadpool.barrier.wait()
|
||||
|
||||
teardownWorker()
|
||||
|
||||
# Tasks
|
||||
# ---------------------------------------------
|
||||
|
||||
# Sentinel values
|
||||
const ReadyFuture = cast[ptr EventNotifier](0xCA11AB1E)
|
||||
const RootTask = cast[ptr Task](0xEFFACED0)
|
||||
|
||||
proc run*(ctx: var WorkerContext, task: ptr Task) {.raises:[Exception].} =
|
||||
## Run a task, frees it if it is not owned by a Flowvar
|
||||
let suspendedTask = workerContext.currentTask
|
||||
ctx.currentTask = task
|
||||
debug: log("Worker %2d: running task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek())
|
||||
task.fn(task.data.addr)
|
||||
debug: log("Worker %2d: completed task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek())
|
||||
ctx.recentTasks += 1
|
||||
ctx.currentTask = suspendedTask
|
||||
if not task.hasFuture:
|
||||
freeHeap(task)
|
||||
return
|
||||
|
||||
# Sync with an awaiting thread without work in completeFuture
|
||||
var expected = (ptr EventNotifier)(nil)
|
||||
if not compareExchange(task.waiter, expected, desired = ReadyFuture, moAcquireRelease):
|
||||
debug: log("Worker %2d: completed task 0x%.08x, notifying waiter 0x%.08x\n", ctx.id, task, expected)
|
||||
expected[].notify()
|
||||
|
||||
proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.} =
|
||||
## Schedule a task in the threadpool
|
||||
## This wakes a sibling thread if our local queue is empty
|
||||
## or forceWake is true.
|
||||
debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
|
||||
|
||||
# Instead of notifying every time a task is scheduled, we notify
|
||||
# only when the worker queue is empty. This is a good approximation
|
||||
# of starvation in work-stealing.
|
||||
# - Tzannes, A., G. C. Caragea, R. Barua, and U. Vishkin.
|
||||
# Lazy binary-splitting: a run-time adaptive work-stealing scheduler.
|
||||
# In PPoPP ’10, Bangalore, India, January 2010. ACM, pp. 179–190.
|
||||
# https://user.eng.umd.edu/~barua/ppopp164.pdf
|
||||
|
||||
let wasEmpty = ctx.taskqueue[].peek() == 0
|
||||
ctx.taskqueue[].push(tn)
|
||||
if forceWake or wasEmpty:
|
||||
ctx.threadpool.globalBackoff.wake()
|
||||
|
||||
# Scheduler
|
||||
# ---------------------------------------------
|
||||
|
||||
iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
|
||||
## Create a (low-quality) pseudo-random permutation from [0, max)
|
||||
# Design considerations and randomness constraint for work-stealing, see docs/random_permutations.md
|
||||
#
|
||||
# Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
|
||||
#
|
||||
# Xₙ₊₁ = aXₙ+c (mod m) generates all random number mod m without repetition
|
||||
# if and only if (Hull-Dobell theorem):
|
||||
# 1. c and m are coprime
|
||||
# 2. a-1 is divisible by all prime factors of m
|
||||
# 3. a-1 is divisible by 4 if m is divisible by 4
|
||||
#
|
||||
# Alternative 1. By choosing a=1, all conditions are easy to reach.
|
||||
#
|
||||
# The randomness quality is not important besides distributing potential contention,
|
||||
# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
|
||||
#
|
||||
# Assuming 6 threads, co-primes are [1, 5], which means the following permutations
|
||||
# assuming we start with victim 0:
|
||||
# - [0, 1, 2, 3, 4, 5]
|
||||
# - [0, 5, 4, 3, 2, 1]
|
||||
# While we don't care much about randoness quality, it's a bit disappointing.
|
||||
#
|
||||
# Alternative 2. We can choose m to be the next power of 2, meaning all odd integers are co-primes,
|
||||
# consequently:
|
||||
# - we don't need a GCD to find the coprimes
|
||||
# - we don't need to cache coprimes, removing a cache-miss potential
|
||||
# - a != 1, so we now have a multiplicative factor, which makes output more "random looking".
|
||||
|
||||
# n and (m-1) <=> n mod m, if m is a power of 2
|
||||
let M = maxExclusive.nextPowerOfTwo_vartime()
|
||||
let c = (randomSeed and ((M shr 1) - 1)) * 2 + 1 # c odd and c ∈ [0, M)
|
||||
let a = (randomSeed and ((M shr 2) - 1)) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4
|
||||
|
||||
let mask = M-1 # for mod M
|
||||
let start = randomSeed and mask
|
||||
|
||||
var x = start
|
||||
while true:
|
||||
if x < maxExclusive:
|
||||
yield x
|
||||
x = (a*x + c) and mask # ax + c (mod M), with M power of 2
|
||||
if x == start:
|
||||
break
|
||||
|
||||
proc tryStealOne(ctx: var WorkerContext): ptr Task =
|
||||
## Try to steal a task.
|
||||
let seed = ctx.rng.next().uint32
|
||||
for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads):
|
||||
if targetId == ctx.id:
|
||||
continue
|
||||
|
||||
let stolenTask = ctx.id.steal(ctx.threadpool.workerQueues[targetId])
|
||||
|
||||
if not stolenTask.isNil():
|
||||
ctx.recentThefts += 1
|
||||
# Theft successful, there might be more work for idle threads, wake one
|
||||
ctx.threadpool.globalBackoff.wake()
|
||||
return stolenTask
|
||||
return nil
|
||||
|
||||
proc updateStealStrategy(ctx: var WorkerContext) =
|
||||
## Estimate work-stealing efficiency during the last interval
|
||||
## If the value is below a threshold, switch strategies
|
||||
const StealAdaptativeInterval = 25
|
||||
if ctx.recentTheftsAdaptative == StealAdaptativeInterval:
|
||||
let recentTheftsNonAdaptative = ctx.recentThefts - ctx.recentTheftsAdaptative
|
||||
let adaptativeTasks = ctx.recentTasks - ctx.recentLeaps - recentTheftsNonAdaptative
|
||||
|
||||
let ratio = adaptativeTasks.float32 / StealAdaptativeInterval.float32
|
||||
if ctx.stealHalf and ratio < 2.0f:
|
||||
# Tasks stolen are coarse-grained, steal only one to reduce re-steal
|
||||
ctx.stealHalf = false
|
||||
elif not ctx.stealHalf and ratio == 1.0f:
|
||||
# All tasks processed were stolen tasks, we need to steal many at a time
|
||||
ctx.stealHalf = true
|
||||
|
||||
# Reset interval
|
||||
ctx.recentTasks = 0
|
||||
ctx.recentThefts = 0
|
||||
ctx.recentTheftsAdaptative = 0
|
||||
ctx.recentLeaps = 0
|
||||
|
||||
proc tryStealAdaptative(ctx: var WorkerContext): ptr Task =
|
||||
## Try to steal one or many tasks, depending on load
|
||||
|
||||
# TODO: while running 'threadpool/examples/e02_parallel_pi.nim'
|
||||
# stealHalf can error out in tasks_flowvars.nim with:
|
||||
# "precondition not task.completed.load(moAcquire)"
|
||||
ctx.stealHalf = false
|
||||
# ctx.updateStealStrategy()
|
||||
|
||||
let seed = ctx.rng.next().uint32
|
||||
for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads):
|
||||
if targetId == ctx.id:
|
||||
continue
|
||||
|
||||
let stolenTask =
|
||||
if ctx.stealHalf: ctx.id.stealHalf(ctx.taskqueue[], ctx.threadpool.workerQueues[targetId])
|
||||
else: ctx.id.steal(ctx.threadpool.workerQueues[targetId])
|
||||
|
||||
if not stolenTask.isNil():
|
||||
ctx.recentThefts += 1
|
||||
ctx.recentTheftsAdaptative += 1
|
||||
# Theft successful, there might be more work for idle threads, wake one
|
||||
ctx.threadpool.globalBackoff.wake()
|
||||
return stolenTask
|
||||
return nil
|
||||
|
||||
proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
|
||||
## Leapfrogging:
|
||||
##
|
||||
## - Leapfrogging: a portable technique for implementing efficient futures,
|
||||
## David B. Wagner, Bradley G. Calder, 1993
|
||||
## https://dl.acm.org/doi/10.1145/173284.155354
|
||||
##
|
||||
## When awaiting a future, we can look in the thief queue first. They steal when they run out of tasks.
|
||||
## If they have tasks in their queue, it's the task we are awaiting that created them and it will likely be stuck
|
||||
## on those tasks as well, so we need to help them help us.
|
||||
|
||||
var thiefID = SentinelThief
|
||||
while true:
|
||||
debug: log("Worker %2d: waiting for thief to publish their ID\n", ctx.id)
|
||||
thiefID = awaitedTask.thiefID.load(moAcquire)
|
||||
if thiefID != SentinelThief:
|
||||
break
|
||||
cpuRelax()
|
||||
ascertain: 0 <= thiefID and thiefID < ctx.threadpool.numThreads
|
||||
|
||||
# Leapfrogging is used when completing a future, steal only one
|
||||
# and don't leave tasks stranded in our queue.
|
||||
let leapTask = ctx.id.steal(ctx.threadpool.workerQueues[thiefID])
|
||||
if not leapTask.isNil():
|
||||
ctx.recentLeaps += 1
|
||||
# Theft successful, there might be more work for idle threads, wake one
|
||||
ctx.threadpool.globalBackoff.wake()
|
||||
return leapTask
|
||||
return nil
|
||||
|
||||
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} =
|
||||
## Each worker thread executes this loop over and over.
|
||||
while not ctx.signal.terminate.load(moRelaxed):
|
||||
# 1. Pick from local queue
|
||||
debug: log("Worker %2d: eventLoop 1 - searching task from local queue\n", ctx.id)
|
||||
while (var task = ctx.taskqueue[].pop(); not task.isNil):
|
||||
debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
|
||||
ctx.run(task)
|
||||
|
||||
# 2. Run out of tasks, become a thief
|
||||
debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
|
||||
let ticket = ctx.threadpool.globalBackoff.sleepy()
|
||||
var stolenTask = ctx.tryStealAdaptative()
|
||||
if not stolenTask.isNil:
|
||||
# We manage to steal a task, cancel sleep
|
||||
ctx.threadpool.globalBackoff.cancelSleep()
|
||||
# 2.a Run task
|
||||
debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
|
||||
ctx.run(stolenTask)
|
||||
else:
|
||||
# 2.b Park the thread until a new task enters the threadpool
|
||||
debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
|
||||
ctx.threadpool.globalBackoff.sleep(ticket)
|
||||
debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
|
||||
|
||||
# Sync
|
||||
# ---------------------------------------------
|
||||
|
||||
template isRootTask(task: ptr Task): bool =
|
||||
task == RootTask
|
||||
|
||||
proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} =
|
||||
## Eagerly complete an awaited FlowVar
|
||||
|
||||
template ctx: untyped = workerContext
|
||||
|
||||
template isFutReady(): untyped =
|
||||
let isReady = fv.task.completed.load(moAcquire)
|
||||
if isReady:
|
||||
parentResult = cast[ptr (ptr Task, T)](fv.task.data.addr)[1]
|
||||
isReady
|
||||
|
||||
if isFutReady():
|
||||
return
|
||||
|
||||
## 1. Process all the children of the current tasks.
|
||||
## This ensures that we can give control back ASAP.
|
||||
debug: log("Worker %2d: sync 1 - searching task from local queue\n", ctx.id)
|
||||
while (let task = ctx.taskqueue[].pop(); not task.isNil):
|
||||
if task.parent != ctx.currentTask:
|
||||
debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
|
||||
ctx.schedule(task, forceWake = true) # reschedule task and wake a sibling to take it over.
|
||||
break
|
||||
debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
|
||||
ctx.run(task)
|
||||
if isFutReady():
|
||||
debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
|
||||
return
|
||||
|
||||
## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
|
||||
## So the task is bottlenecked by dependencies in other threads,
|
||||
## hence we abandon our enqueued work and steal in the others' queues
|
||||
## in hope it advances our awaited task. This prioritizes latency over throughput.
|
||||
##
|
||||
## See also
|
||||
## - Proactive work-stealing for futures
|
||||
## Kyle Singer, Yifan Xu, I-Ting Angelina Lee, 2019
|
||||
## https://dl.acm.org/doi/10.1145/3293883.3295735
|
||||
debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
|
||||
while not isFutReady():
|
||||
if (let leapTask = ctx.tryLeapfrog(fv.task); not leapTask.isNil):
|
||||
# We stole a task generated by the task we are awaiting.
|
||||
debug: log("Worker %2d: sync 2.1 - leapfrog task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, leapTask, leapTask.parent, ctx.currentTask)
|
||||
ctx.run(leapTask)
|
||||
elif (let stolenTask = ctx.tryStealOne(); not stolenTask.isNil):
|
||||
# We stole a task, we hope we advance our awaited task.
|
||||
debug: log("Worker %2d: sync 2.2 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
|
||||
ctx.run(stolenTask)
|
||||
elif (let ownTask = ctx.taskqueue[].pop(); not ownTask.isNil):
|
||||
# We advance our own queue, this increases global throughput but may impact latency on the awaited task.
|
||||
#
|
||||
# Note: for a scheduler to be optimal (i.e. within 2x than ideal) it should be greedy
|
||||
# so all workers should be working. This is a difficult tradeoff.
|
||||
debug: log("Worker %2d: sync 2.3 - couldn't steal, running own task\n", ctx.id)
|
||||
ctx.run(ownTask)
|
||||
else:
|
||||
# Nothing to do, we park.
|
||||
# Note: On today's hyperthreaded systems, it might be more efficient to always park
|
||||
# instead of working on unrelated tasks in our task queue, despite making the scheduler non-greedy.
|
||||
# The actual hardware resources are 2x less than the actual number of cores
|
||||
ctx.localBackoff.prepareToPark()
|
||||
|
||||
var expected = (ptr EventNotifier)(nil)
|
||||
if compareExchange(fv.task.waiter, expected, desired = ctx.localBackoff.addr, moAcquireRelease):
|
||||
ctx.localBackoff.park()
|
||||
|
||||
proc syncAll*(tp: Threadpool) {.raises: [Exception].} =
|
||||
## Blocks until all pending tasks are completed
|
||||
## This MUST only be called from
|
||||
## the root scope that created the threadpool
|
||||
template ctx: untyped = workerContext
|
||||
|
||||
debugTermination:
|
||||
log(">>> Worker %2d enters barrier <<<\n", ctx.id)
|
||||
|
||||
preCondition: ctx.id == 0
|
||||
preCondition: ctx.currentTask.isRootTask()
|
||||
|
||||
# Empty all tasks
|
||||
var foreignThreadsParked = false
|
||||
while not foreignThreadsParked:
|
||||
# 1. Empty local tasks
|
||||
debug: log("Worker %2d: syncAll 1 - searching task from local queue\n", ctx.id)
|
||||
while (let task = ctx.taskqueue[].pop(); not task.isNil):
|
||||
debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
|
||||
ctx.run(task)
|
||||
|
||||
if tp.numThreads == 1 or foreignThreadsParked:
|
||||
break
|
||||
|
||||
# 2. Help other threads
|
||||
debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
|
||||
let stolenTask = ctx.tryStealAdaptative()
|
||||
|
||||
if not stolenTask.isNil:
|
||||
# 2.1 We stole some task
|
||||
debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
|
||||
ctx.run(stolenTask)
|
||||
else:
|
||||
# 2.2 No task to steal
|
||||
if tp.globalBackoff.getNumWaiters() == tp.numThreads - 1:
|
||||
# 2.2.1 all threads besides the current are parked
|
||||
debugTermination:
|
||||
log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
|
||||
foreignThreadsParked = true
|
||||
else:
|
||||
# 2.2.2 We don't park as there is no notif for task completion
|
||||
cpuRelax()
|
||||
|
||||
debugTermination:
|
||||
log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
|
||||
|
||||
# Runtime
|
||||
# ---------------------------------------------
|
||||
|
||||
proc new*(T: type Threadpool, numThreads = countProcessors()): T {.raises: [Exception].} =
|
||||
## Initialize a threadpool that manages `numThreads` threads.
|
||||
## Default to the number of logical processors available.
|
||||
|
||||
type TpObj = typeof(default(Threadpool)[]) # due to C import, we need a dynamic sizeof
|
||||
var tp = allocHeapUncheckedAlignedPtr(Threadpool, sizeof(TpObj), alignment = 64)
|
||||
|
||||
tp.barrier.init(numThreads.uint32)
|
||||
tp.globalBackoff.initialize()
|
||||
tp.numThreads = numThreads.uint32
|
||||
tp.workerQueues = allocHeapArrayAligned(Taskqueue, numThreads, alignment = 64)
|
||||
tp.workers = allocHeapArrayAligned(Thread[(Threadpool, WorkerID)], numThreads, alignment = 64)
|
||||
tp.workerSignals = allocHeapArrayAligned(Signal, numThreads, alignment = 64)
|
||||
|
||||
# Setup master thread
|
||||
workerContext.id = 0
|
||||
workerContext.threadpool = tp
|
||||
|
||||
# Start worker threads
|
||||
for i in 1 ..< numThreads:
|
||||
createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
|
||||
|
||||
# Root worker
|
||||
setupWorker()
|
||||
|
||||
# Root task, this is a sentinel task that is never called.
|
||||
workerContext.currentTask = RootTask
|
||||
|
||||
# Wait for the child threads
|
||||
discard tp.barrier.wait()
|
||||
return tp
|
||||
|
||||
proc cleanup(tp: var Threadpool) {.raises: [OSError].} =
|
||||
## Cleanup all resources allocated by the threadpool
|
||||
preCondition: workerContext.currentTask.isRootTask()
|
||||
|
||||
for i in 1 ..< tp.numThreads:
|
||||
joinThread(tp.workers[i])
|
||||
|
||||
tp.workerSignals.freeHeapAligned()
|
||||
tp.workers.freeHeapAligned()
|
||||
tp.workerQueues.freeHeapAligned()
|
||||
tp.globalBackoff.`=destroy`()
|
||||
tp.barrier.delete()
|
||||
|
||||
tp.freeHeapAligned()
|
||||
|
||||
proc shutdown*(tp: var Threadpool) {.raises:[Exception].} =
|
||||
## Wait until all tasks are processed and then shutdown the threadpool
|
||||
preCondition: workerContext.currentTask.isRootTask()
|
||||
tp.syncAll()
|
||||
|
||||
# Signal termination to all threads
|
||||
for i in 0 ..< tp.numThreads:
|
||||
tp.workerSignals[i].terminate.store(true, moRelaxed)
|
||||
|
||||
tp.globalBackoff.wakeAll()
|
||||
|
||||
# 1 matching barrier in workerEntryFn
|
||||
discard tp.barrier.wait()
|
||||
|
||||
teardownWorker()
|
||||
tp.cleanup()
|
||||
|
||||
# Delete dummy task
|
||||
workerContext.currentTask = nil
|
||||
|
||||
{.pop.} # raises:[]
|
||||
|
||||
# Task parallel API
|
||||
# ---------------------------------------------
|
||||
|
||||
macro spawn*(tp: Threadpool, fnCall: typed): untyped =
|
||||
## Spawns the input function call asynchronously, potentially on another thread of execution.
|
||||
##
|
||||
## If the function calls returns a result, spawn will wrap it in a Flowvar.
|
||||
## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
|
||||
## You can use `isReady` to check if result is available and if subsequent
|
||||
## `spawn` returns immediately.
|
||||
##
|
||||
## Tasks are processed approximately in Last-In-First-Out (LIFO) order
|
||||
result = spawnImpl(tp, fnCall, bindSym"workerContext", bindSym"schedule")
|
||||
@ -68,7 +68,7 @@ func rotl(x: uint64, k: static int): uint64 {.inline.} =
|
||||
template `^=`(x: var uint64, y: uint64) =
|
||||
x = x xor y
|
||||
|
||||
func next(rng: var RngState): uint64 =
|
||||
func next*(rng: var RngState): uint64 =
|
||||
## Compute a random uint64 from the input state
|
||||
## using xoshiro512** algorithm by Vigna et al
|
||||
## State is updated.
|
||||
@ -96,6 +96,12 @@ func random_unsafe*(rng: var RngState, maxExclusive: uint32): uint32 =
|
||||
## Uses an unbiaised generation method
|
||||
## See Lemire's algorithm modified by Melissa O'Neill
|
||||
## https://www.pcg-random.org/posts/bounded-rands.html
|
||||
## Original:
|
||||
## biaised: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
||||
## unbiaised: https://arxiv.org/pdf/1805.10941.pdf
|
||||
## Also:
|
||||
## Barrett Reduction: https://en.wikipedia.org/wiki/Barrett_reduction
|
||||
## http://www.acsel-lab.com/arithmetic/arith18/papers/ARITH18_Hasenplaugh.pdf
|
||||
let max = maxExclusive
|
||||
var x = uint32 rng.next()
|
||||
var m = x.uint64 * max.uint64
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user