From 2931913b6735aaae2fdca61736d54680aec39d6b Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 24 Jan 2023 02:32:28 +0100 Subject: [PATCH] Add a threadpool (#213) * Implement a threadpool * int and SomeUnsignedInt ... * Type conversion for windows SynchronizationBarrier * Use the latest MacOS 11, Big Sur API (jan 2021) for MacOS futexes, Github action offers MacOS 12 and can test them * bench need posix timer not available on windows and darwin futex * Windows: nimble exec empty line is an error, Mac: use defined(osx) instead of defined(macos) * file rename * okay, that's the last one hopefully * deactivate stealHalf for now --- constantine.nimble | 100 +++- constantine/math/arithmetic/limbs.nim | 2 +- constantine/math/arithmetic/limbs_exgcd.nim | 2 +- constantine/math/config/precompute.nim | 2 +- constantine/math/config/type_bigint.nim | 2 +- .../ec_shortweierstrass_batch_ops.nim | 4 +- constantine/platforms/README.md | 20 +- constantine/platforms/allocs.nim | 103 +++- constantine/platforms/bithacks.nim | 142 +++-- constantine/platforms/compilers/bitops.nim | 94 ++- constantine/platforms/threadpool/README.md | 38 ++ .../bouncing_producer_consumer/README.md | 11 + .../threadpool_bpc.nim | 157 +++++ .../benchmarks/dfs/threadpool_dfs.nim | 86 +++ .../threadpool/benchmarks/fibonacci/README.md | 77 +++ .../benchmarks/fibonacci/stdnim_fib.nim | 35 ++ .../benchmarks/fibonacci/threadpool_fib.nim | 79 +++ .../benchmarks/heat/stdnim_heat.nim | 300 ++++++++++ .../benchmarks/heat/threadpool_heat.nim | 314 ++++++++++ .../matmul_cache_oblivious/README.md | 12 + .../threadpool_matmul_co.nim | 214 +++++++ .../benchmarks/nqueens/stdnim_nqueens.nim | 181 ++++++ .../benchmarks/nqueens/threadpool_nqueens.nim | 231 ++++++++ .../threadpool/benchmarks/resources.nim | 24 + .../benchmarks/single_task_producer/README.md | 7 + .../single_task_producer/threadpool_spc.nim | 146 +++++ .../platforms/threadpool/benchmarks/wtime.h | 53 ++ .../platforms/threadpool/benchmarks/wtime.nim | 
10 + .../threadpool/crossthread/backoff.nim | 186 ++++++ .../threadpool/crossthread/taskqueues.nim | 286 +++++++++ .../threadpool/crossthread/tasks_flowvars.nim | 127 ++++ .../platforms/threadpool/docs/partitioners.md | 166 ++++++ .../threadpool/docs/random_permutations.md | 42 ++ .../threadpool/examples/e01_simple_tasks.nim | 48 ++ .../threadpool/examples/e02_parallel_pi.nim | 38 ++ .../platforms/threadpool/instrumentation.nim | 142 +++++ .../threadpool/parallel_offloading.nim | 160 +++++ .../threadpool/primitives/barriers.md | 53 ++ .../threadpool/primitives/barriers.nim | 71 +++ .../threadpool/primitives/barriers_macos.nim | 84 +++ .../threadpool/primitives/barriers_posix.nim | 56 ++ .../primitives/barriers_windows.nim | 31 + .../threadpool/primitives/futexes.nim | 18 + .../threadpool/primitives/futexes_linux.nim | 68 +++ .../threadpool/primitives/futexes_macos.nim | 109 ++++ .../threadpool/primitives/futexes_windows.nim | 59 ++ .../platforms/threadpool/threadpool.nim | 555 ++++++++++++++++++ helpers/prng_unsafe.nim | 8 +- 48 files changed, 4647 insertions(+), 106 deletions(-) create mode 100644 constantine/platforms/threadpool/README.md create mode 100644 constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/README.md create mode 100644 constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim create mode 100644 constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim create mode 100644 constantine/platforms/threadpool/benchmarks/fibonacci/README.md create mode 100644 constantine/platforms/threadpool/benchmarks/fibonacci/stdnim_fib.nim create mode 100644 constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim create mode 100644 constantine/platforms/threadpool/benchmarks/heat/stdnim_heat.nim create mode 100644 constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim create mode 100644 constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/README.md create mode 
100644 constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim create mode 100644 constantine/platforms/threadpool/benchmarks/nqueens/stdnim_nqueens.nim create mode 100644 constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim create mode 100644 constantine/platforms/threadpool/benchmarks/resources.nim create mode 100644 constantine/platforms/threadpool/benchmarks/single_task_producer/README.md create mode 100644 constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim create mode 100644 constantine/platforms/threadpool/benchmarks/wtime.h create mode 100644 constantine/platforms/threadpool/benchmarks/wtime.nim create mode 100644 constantine/platforms/threadpool/crossthread/backoff.nim create mode 100644 constantine/platforms/threadpool/crossthread/taskqueues.nim create mode 100644 constantine/platforms/threadpool/crossthread/tasks_flowvars.nim create mode 100644 constantine/platforms/threadpool/docs/partitioners.md create mode 100644 constantine/platforms/threadpool/docs/random_permutations.md create mode 100644 constantine/platforms/threadpool/examples/e01_simple_tasks.nim create mode 100644 constantine/platforms/threadpool/examples/e02_parallel_pi.nim create mode 100644 constantine/platforms/threadpool/instrumentation.nim create mode 100644 constantine/platforms/threadpool/parallel_offloading.nim create mode 100644 constantine/platforms/threadpool/primitives/barriers.md create mode 100644 constantine/platforms/threadpool/primitives/barriers.nim create mode 100644 constantine/platforms/threadpool/primitives/barriers_macos.nim create mode 100644 constantine/platforms/threadpool/primitives/barriers_posix.nim create mode 100644 constantine/platforms/threadpool/primitives/barriers_windows.nim create mode 100644 constantine/platforms/threadpool/primitives/futexes.nim create mode 100644 constantine/platforms/threadpool/primitives/futexes_linux.nim create mode 100644 
constantine/platforms/threadpool/primitives/futexes_macos.nim create mode 100644 constantine/platforms/threadpool/primitives/futexes_windows.nim create mode 100644 constantine/platforms/threadpool/threadpool.nim diff --git a/constantine.nimble b/constantine.nimble index dbb1ae1..f83cd4d 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -43,7 +43,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ # ---------------------------------------------------------- ("tests/math/t_primitives.nim", false), ("tests/math/t_primitives_extended_precision.nim", false), - + # Big ints # ---------------------------------------------------------- ("tests/math/t_io_bigints.nim", false), @@ -53,7 +53,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/math/t_bigints_mod_vs_gmp.nim", true), ("tests/math/t_bigints_mul_vs_gmp.nim", true), ("tests/math/t_bigints_mul_high_words_vs_gmp.nim", true), - + # Field # ---------------------------------------------------------- ("tests/math/t_io_fields", false), @@ -64,11 +64,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/math/t_finite_fields_powinv.nim", false), ("tests/math/t_finite_fields_vs_gmp.nim", true), # ("tests/math/t_fp_cubic_root.nim", false), - + # Double-precision finite fields # ---------------------------------------------------------- ("tests/math/t_finite_fields_double_precision.nim", false), - + # Towers of extension fields # ---------------------------------------------------------- # ("tests/math/t_fp2.nim", false), @@ -90,7 +90,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/math/t_fp6_frobenius.nim", false), ("tests/math/t_fp12_frobenius.nim", false), - # Elliptic curve arithmetic + # Elliptic curve arithmetic # ---------------------------------------------------------- ("tests/math/t_ec_conversion.nim", false), @@ -111,7 +111,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ 
("tests/math/t_ec_twedwards_prj_add_double", false), ("tests/math/t_ec_twedwards_prj_mul_sanity", false), ("tests/math/t_ec_twedwards_prj_mul_distri", false), - + # Elliptic curve arithmetic G2 # ---------------------------------------------------------- @@ -172,7 +172,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/math/t_ec_sage_bls12_381.nim", false), ("tests/math/t_ec_sage_pallas.nim", false), ("tests/math/t_ec_sage_vesta.nim", false), - + # Edge cases highlighted by past bugs # ---------------------------------------------------------- ("tests/math/t_ec_shortw_prj_edge_cases.nim", false), @@ -233,6 +233,18 @@ const testDescNvidia: seq[string] = @[ "tests/gpu/t_nvidia_fp.nim", ] +const testDescThreadpool: seq[string] = @[ + "constantine/platforms/threadpool/examples/e01_simple_tasks.nim", + "constantine/platforms/threadpool/examples/e02_parallel_pi.nim", + # "constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim", # Need timing not implemented on Windows + "constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim", + "constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim", + "constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim", + # "constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim", + "constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim", + # "constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim", # Need timing not implemented on Windows +] + const benchDesc = [ "bench_fp", "bench_fp_double_precision", @@ -301,7 +313,7 @@ proc clearParallelBuild() = if fileExists(buildParallel): rmFile(buildParallel) -template setupCommand(): untyped {.dirty.} = +template setupCommand(): untyped {.dirty.} = var lang = "c" if existsEnv"TEST_LANG": lang = getEnv"TEST_LANG" @@ -367,7 +379,7 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS if not 
dirExists "build": mkDir "build" echo "Found " & $testDesc.len & " tests to run." - + for td in testDesc: if not(td.useGMP and not requireGMP): var flags = "" @@ -379,17 +391,25 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS flags &= " -d:debugConstantine" if td.path notin skipSanitizers: flags &= sanitizers - + cmdFile.testBatch(flags, td.path) proc addTestSetNvidia(cmdFile: var string) = if not dirExists "build": mkDir "build" echo "Found " & $testDescNvidia.len & " tests to run." - + for path in testDescNvidia: cmdFile.testBatch(flags = "", path) +proc addTestSetThreadpool(cmdFile: var string) = + if not dirExists "build": + mkDir "build" + echo "Found " & $testDescThreadpool.len & " tests to run." + + for path in testDescThreadpool: + cmdFile.testBatch(flags = "--threads:on --linetrace:on", path) + proc addBenchSet(cmdFile: var string, useAsm = true) = if not dirExists "build": mkDir "build" @@ -491,13 +511,13 @@ task test_bindings, "Test C bindings": # Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in an POSIX compatible shell exec "gcc -Ibindings/generated -Lbindings/generated -o build/testsuite/t_libctt_bls12_381_dl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -lconstantine_bls12_381" exec "./build/testsuite/t_libctt_bls12_381_dl.exe" - + echo "--> Testing statically linked library" when not defined(windows): # Beware MacOS annoying linker with regards to static libraries # The following standard way cannot be used on MacOS # exec "gcc -Ibindings/generated -Lbindings/generated -o build/t_libctt_bls12_381_sl.exe tests/bindings/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic" - + exec "gcc -Ibindings/generated -o build/testsuite/t_libctt_bls12_381_sl tests/bindings/t_libctt_bls12_381.c bindings/generated/libconstantine_bls12_381.a -lgmp" exec "./build/testsuite/t_libctt_bls12_381_sl" else: @@ -509,32 +529,40 @@ task test, "Run all tests": var cmdFile: string 
cmdFile.addTestSet(requireGMP = true, testASM = true) cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant + cmdFile.addTestSetThreadpool() for cmd in cmdFile.splitLines(): - exec cmd + if cmd != "": # Windows doesn't like empty commands + exec cmd task test_no_asm, "Run all tests (no assembly)": # -d:testingCurves is configured in a *.nim.cfg for convenience var cmdFile: string cmdFile.addTestSet(requireGMP = true, testASM = false) cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant + cmdFile.addTestSetThreadpool() for cmd in cmdFile.splitLines(): - exec cmd + if cmd != "": # Windows doesn't like empty commands + exec cmd task test_no_gmp, "Run tests that don't require GMP": # -d:testingCurves is configured in a *.nim.cfg for convenience var cmdFile: string cmdFile.addTestSet(requireGMP = false, testASM = true) cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant + cmdFile.addTestSetThreadpool() for cmd in cmdFile.splitLines(): - exec cmd + if cmd != "": # Windows doesn't like empty commands + exec cmd task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend": # -d:testingCurves is configured in a *.nim.cfg for convenience var cmdFile: string cmdFile.addTestSet(requireGMP = false, testASM = false) cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant + cmdFile.addTestSetThreadpool() for cmd in cmdFile.splitLines(): - exec cmd + if cmd != "": # Windows doesn't like empty commands + exec cmd task test_parallel, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience @@ -547,6 +575,13 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)": writeFile(buildParallel, cmdFile) exec "build/pararun " & buildParallel + # Threadpool tests done serially + cmdFile = "" + cmdFile.addTestSetThreadpool() + for 
cmd in cmdFile.splitLines(): + if cmd != "": # Windows doesn't like empty commands + exec cmd + task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience clearParallelBuild() @@ -558,6 +593,13 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel writeFile(buildParallel, cmdFile) exec "build/pararun " & buildParallel + # Threadpool tests done serially + cmdFile = "" + cmdFile.addTestSetThreadpool() + for cmd in cmdFile.splitLines(): + if cmd != "": # Windows doesn't like empty commands + exec cmd + task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience clearParallelBuild() @@ -569,6 +611,13 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)": writeFile(buildParallel, cmdFile) exec "build/pararun " & buildParallel + # Threadpool tests done serially + cmdFile = "" + cmdFile.addTestSetThreadpool() + for cmd in cmdFile.splitLines(): + if cmd != "": # Windows doesn't like empty commands + exec cmd + task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience clearParallelBuild() @@ -580,11 +629,26 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)" writeFile(buildParallel, cmdFile) exec "build/pararun " & buildParallel + # Threadpool tests done serially + cmdFile = "" + cmdFile.addTestSetThreadpool() + for cmd in cmdFile.splitLines(): + if cmd != "": # Windows doesn't like empty commands + exec cmd + +task test_threadpool, "Run all tests for the builtin threadpool": + var cmdFile: string + cmdFile.addTestSetThreadpool() + for cmd in cmdFile.splitLines(): + if cmd != "": # Windows doesn't like empty commands + exec cmd + task test_nvidia, "Run all tests for Nvidia GPUs": var cmdFile: string 
cmdFile.addTestSetNvidia() for cmd in cmdFile.splitLines(): - exec cmd + if cmd != "": # Windows doesn't like empty commands + exec cmd # Finite field 𝔽p # ------------------------------------------ diff --git a/constantine/math/arithmetic/limbs.nim b/constantine/math/arithmetic/limbs.nim index 664988b..f5e5435 100644 --- a/constantine/math/arithmetic/limbs.nim +++ b/constantine/math/arithmetic/limbs.nim @@ -356,7 +356,7 @@ func div10*(a: var Limbs): SecretWord = ## Divide `a` by 10 in-place and return the remainder result = Zero - const clz = WordBitWidth - 1 - log2_vartime(10) + const clz = WordBitWidth - 1 - log2_vartime(10'u32) const norm10 = SecretWord(10) shl clz for i in countdown(a.len-1, 0): diff --git a/constantine/math/arithmetic/limbs_exgcd.nim b/constantine/math/arithmetic/limbs_exgcd.nim index 60fefb9..be8d968 100644 --- a/constantine/math/arithmetic/limbs_exgcd.nim +++ b/constantine/math/arithmetic/limbs_exgcd.nim @@ -61,7 +61,7 @@ func invModBitwidth(a: BaseType): BaseType = # which grows in O(log(log(a))) debug: doAssert (a and 1) == 1, "a must be odd" - const k = log2_vartime(a.sizeof() * 8) + const k = log2_vartime(a.sizeof().uint32 * 8) result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k) result *= 2 - a * result # x' = x(2 - ax) diff --git a/constantine/math/config/precompute.nim b/constantine/math/config/precompute.nim index 838afbf..cdfb6a8 100644 --- a/constantine/math/config/precompute.nim +++ b/constantine/math/config/precompute.nim @@ -282,7 +282,7 @@ func invModBitwidth*[T: SomeUnsignedInt](a: T): T = # which grows in O(log(log(a))) checkOdd(a) - let k = log2_vartime(T.sizeof() * 8) + let k = log2_vartime(T.sizeof().uint32 * 8) result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k) result *= 2 - a * result # x' = x(2 - ax) diff --git 
a/constantine/math/config/type_bigint.nim b/constantine/math/config/type_bigint.nim index 5ecd105..e5435ef 100644 --- a/constantine/math/config/type_bigint.nim +++ b/constantine/math/config/type_bigint.nim @@ -36,7 +36,7 @@ debug: result[0] = '0' result[1] = 'x' var a = a - for j in countdown(result.len-1, 2): + for j in countdown(result.len-1, 0): result[j] = hexChars.secretLookup(a and SecretWord 0xF) a = a shr 4 diff --git a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim index 5e2a087..4bb7ddf 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim @@ -291,8 +291,8 @@ func sum_batch_vartime*[F; G: static Subgroup]( const maxStride = maxChunkSize div sizeof(ECP_ShortW_Aff[F, G]) let n = min(maxStride, points.len) - let accumulators = alloca(ECP_ShortW_Aff[F, G], n) - let lambdas = alloca(tuple[num, den: F], n) + let accumulators = allocStackArray(ECP_ShortW_Aff[F, G], n) + let lambdas = allocStackArray(tuple[num, den: F], n) for i in countup(0, points.len-1, maxStride): let n = min(maxStride, points.len - i) diff --git a/constantine/platforms/README.md b/constantine/platforms/README.md index 323d05f..f1497bd 100644 --- a/constantine/platforms/README.md +++ b/constantine/platforms/README.md @@ -6,9 +6,23 @@ This folder holds: to have the compiler enforce proper usage - extended precision multiplication and division primitives - assembly or builtin int128 primitives -- intrinsics -- an assembler -- Runtime CPU features detection +- SIMD intrinsics +- assemblers for x86 and LLVM IR +- a code generator for Nvidia GPU from LLVM IR +- runtime CPU features detection +- a threadpool + +## Runtimes + +Constantine strongly avoid any runtime so that it can be used even where garbage collection, dynamic memory allocation +are not allowed. That also avoids secrets remaining in heap memory. 
+ +At runtime, Constantine may: +- detect the CPU features at the start of the application (in Nim) or after calling `ctt_myprotocol_init_NimMain()` for the C (or any other language) bindings. + +And offers the following opt-in features with use dynamic allocation: +- a threadpool, only for explicitly tagged parallel primitives. +- use LLVM and Cuda, and configure code to run computation on GPUs. ## Security diff --git a/constantine/platforms/allocs.nim b/constantine/platforms/allocs.nim index 16e4675..8af3325 100644 --- a/constantine/platforms/allocs.nim +++ b/constantine/platforms/allocs.nim @@ -6,6 +6,9 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. +{.push raises:[].} # No exceptions for crypto +{.push checks:off.} # No int->size_t exceptions + # ############################################################ # # Allocators @@ -20,13 +23,107 @@ # # stack allocation is strongly preferred where necessary. +# Bindings +# ---------------------------------------------------------------------------------- +# We wrap them with int instead of size_t / csize_t + when defined(windows): proc alloca(size: int): pointer {.header: "".} else: proc alloca(size: int): pointer {.header: "".} -template alloca*(T: typedesc): ptr T = +proc malloc(size: int): pointer {.sideeffect, header: "".} +proc free(p: pointer) {.sideeffect, header: "".} + +when defined(windows): + proc aligned_alloc_windows(size, alignment: int): pointer {.sideeffect,importc:"_aligned_malloc", header:"".} + # Beware of the arg order! 
+ proc aligned_alloc(alignment, size: int): pointer {.inline.} = + aligned_alloc_windows(size, alignment) + proc aligned_free(p: pointer){.sideeffect,importc:"_aligned_free", header:"".} +elif defined(osx): + proc posix_memalign(mem: var pointer, alignment, size: int){.sideeffect,importc, header:"".} + proc aligned_alloc(alignment, size: int): pointer {.inline.} = + posix_memalign(result, alignment, size) + proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "".} +else: + proc aligned_alloc(alignment, size: int): pointer {.sideeffect,importc, header:"".} + proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "".} + +# Helpers +# ---------------------------------------------------------------------------------- + +proc isPowerOfTwo(n: int): bool {.inline.} = + (n and (n - 1)) == 0 and (n != 0) + +func roundNextMultipleOf(x: int, n: static int): int {.inline.} = + ## Round the input to the next multiple of "n" + when n.isPowerOfTwo(): + # n is a power of 2. 
(If compiler cannot prove that x>0 it does not make the optim) + result = (x + n - 1) and not(n - 1) + else: + result = ((x + n - 1) div n) * n + +# Stack allocation +# ---------------------------------------------------------------------------------- + +template allocStack*(T: typedesc): ptr T = cast[ptr T](alloca(sizeof(T))) -template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] = - cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) \ No newline at end of file +template allocStackUnchecked*(T: typedesc, size: int): ptr T = + ## Stack allocation for types containing a variable-sized UncheckedArray field + cast[ptr T](alloca(size)) + +template allocStackArray*(T: typedesc, len: Natural): ptr UncheckedArray[T] = + cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) + +# Heap allocation +# ---------------------------------------------------------------------------------- + +proc allocHeap*(T: typedesc): ptr T {.inline.} = + cast[type result](malloc(sizeof(T))) + +proc allocHeapUnchecked*(T: typedesc, size: int): ptr T {.inline.} = + ## Heap allocation for types containing a variable-sized UncheckedArray field + cast[type result](malloc(size)) + +proc allocHeapArray*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} = + cast[type result](malloc(len*sizeof(T))) + +proc freeHeap*(p: pointer) {.inline.} = + free(p) + +proc allocHeapAligned*(T: typedesc, alignment: static Natural): ptr T {.inline.} = + # aligned_alloc requires allocating in multiple of the alignment. + const + size = sizeof(T) + requiredMem = size.roundNextMultipleOf(alignment) + + cast[ptr T](aligned_alloc(alignment, requiredMem)) + +proc allocHeapUncheckedAligned*(T: typedesc, size: int, alignment: static Natural): ptr T {.inline.} = + ## Aligned heap allocation for types containing a variable-sized UncheckedArray field + ## or an importc type with missing size information + # aligned_alloc requires allocating in multiple of the alignment. 
+ let requiredMem = size.roundNextMultipleOf(alignment) + + cast[ptr T](aligned_alloc(alignment, requiredMem)) + +proc allocHeapArrayAligned*(T: typedesc, len: int, alignment: static Natural): ptr UncheckedArray[T] {.inline.} = + # aligned_alloc requires allocating in multiple of the alignment. + let + size = sizeof(T) * len + requiredMem = size.roundNextMultipleOf(alignment) + + cast[ptr UncheckedArray[T]](aligned_alloc(alignment, requiredMem)) + +proc allocHeapAlignedPtr*(T: typedesc[ptr], alignment: static Natural): T {.inline.} = + allocHeapAligned(typeof(default(T)[]), alignment) + +proc allocHeapUncheckedAlignedPtr*(T: typedesc[ptr], size: int, alignment: static Natural): T {.inline.} = + ## Aligned heap allocation for types containing a variable-sized UncheckedArray field + ## or an importc type with missing size information + allocHeapUncheckedAligned(typeof(default(T)[]), size, alignment) + +proc freeHeapAligned*(p: pointer) {.inline.} = + aligned_free(p) \ No newline at end of file diff --git a/constantine/platforms/bithacks.nim b/constantine/platforms/bithacks.nim index 4f9f764..c22f811 100644 --- a/constantine/platforms/bithacks.nim +++ b/constantine/platforms/bithacks.nim @@ -32,7 +32,10 @@ import ./compilers/bitops # # See: https://www.chessprogramming.org/BitScan # https://www.chessprogramming.org/General_Setwise_Operations +# https://www.chessprogramming.org/De_Bruijn_Sequence_Generator # and https://graphics.stanford.edu/%7Eseander/bithacks.html +# and Hacker's Delight 2nd Edition, Henry S Warren, Jr. 
+# and https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator # for compendiums of bit manipulation func clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} = @@ -43,77 +46,106 @@ func clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} = ## Returns ``v``, with the bit at position ``bit`` set to 0 v.clearMask(1.T shl bit) -func log2impl_vartime(x: uint32): uint32 = +func log2_impl_vartime(n: uint32): uint32 = ## Find the log base 2 of a 32-bit or less integer. ## using De Bruijn multiplication ## Works at compile-time. ## ⚠️ not constant-time, table accesses are not uniform. # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn - const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, - 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31] - var v = x - v = v or v shr 1 # first round down to one less than a power of 2 - v = v or v shr 2 - v = v or v shr 4 - v = v or v shr 8 - v = v or v shr 16 - lookup[(v * 0x07C4ACDD'u32) shr 27] + const lookup: array[32, uint8] = [ + uint8 0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31] + + # Isolate MSB + var n = n + n = n or n shr 1 # first round down to one less than a power of 2 + n = n or n shr 2 + n = n or n shr 4 + n = n or n shr 8 + n = n or n shr 16 + uint32 lookup[(n * 0x07C4ACDD'u32) shr 27] -func log2impl_vartime(x: uint64): uint64 {.inline.} = +func log2_impl_vartime(n: uint64): uint64 {.inline.} = ## Find the log base 2 of a 32-bit or less integer. ## using De Bruijn multiplication ## Works at compile-time. ## ⚠️ not constant-time, table accesses are not uniform. 
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn - const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, - 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62, - 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31, - 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63] - var v = x - v = v or v shr 1 # first round down to one less than a power of 2 - v = v or v shr 2 - v = v or v shr 4 - v = v or v shr 8 - v = v or v shr 16 - v = v or v shr 32 - lookup[(v * 0x03F6EAF2CD271461'u64) shr 58] + # https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers + const lookup: array[64, uint8] = [ + uint8 0, 58, 1, 59, 47, 53, 2, 60, + 39, 48, 27, 54, 33, 42, 3, 61, + 51, 37, 40, 49, 18, 28, 20, 55, + 30, 34, 11, 43, 14, 22, 4, 62, + 57, 46, 52, 38, 26, 32, 41, 50, + 36, 17, 19, 29, 10, 13, 21, 56, + 45, 25, 31, 35, 16, 9, 12, 44, + 24, 15, 8, 23, 7, 6, 5, 63] + + # Isolate MSB + var n = n + n = n or n shr 1 # first round down to one less than a power of 2 + n = n or n shr 2 + n = n or n shr 4 + n = n or n shr 8 + n = n or n shr 16 + n = n or n shr 32 + uint64 lookup[(n * 0x03F6EAF2CD271461'u64) shr 58] func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} = ## Find the log base 2 of an integer - ## - ## ⚠ With GCC and Clang compilers on x86, if n is zero, result is undefined. 
when nimvm: - when sizeof(T) == sizeof(uint64): - T(log2impl_vartime(uint64(n))) + when sizeof(T) == 8: + T(log2_impl_vartime(uint64(n))) else: - static: doAssert sizeof(T) <= sizeof(uint32) - T(log2impl_vartime(uint32(n))) + T(log2_impl_vartime(uint32(n))) else: - when sizeof(T) == sizeof(uint64): - T(log2_c_compiler_vartime(uint64(n))) + log2_c_compiler_vartime(n) + +func ctz_impl_vartime(n: uint32): uint32 = + ## Find the number of trailing zero bits + ## Requires n != 0 + # https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator + const lookup: array[32, uint8] = [ + uint8 0, 1, 16, 2, 29, 17, 3, 22, + 30, 20, 18, 11, 13, 4, 7, 23, + 31, 15, 28, 21, 19, 10, 12, 6, + 14, 27, 9, 5, 26, 8, 25, 24] + + let isolateLSB = n xor (n-1) + uint32 lookup[(isolateLSB * 0x6EB14F9'u32) shr 27] + +func ctz_impl_vartime(n: uint64): uint64 = + ## Find the number of trailing zero bits + ## Requires n != 0 + # https://www.chessprogramming.org/BitScan#Bitscan_forward + const lookup: array[64, uint8] = [ + uint8 0, 47, 1, 56, 48, 27, 2, 60, + 57, 49, 41, 37, 28, 16, 3, 61, + 54, 58, 35, 52, 50, 42, 21, 44, + 38, 32, 29, 23, 17, 11, 4, 62, + 46, 55, 26, 59, 40, 36, 15, 53, + 34, 51, 20, 43, 31, 22, 10, 45, + 25, 39, 14, 33, 19, 30, 9, 24, + 13, 18, 8, 12, 7, 6, 5, 63] + + let isolateLSB = n xor (n-1) + uint64 lookup[(isolateLSB * 0x03f79d71b4cb0a89'u64) shr 58] + +func countTrailingZeroBits*[T: SomeUnsignedInt](n: T): T {.inline.} = + ## Count the number of trailing zero bits of an integer + when nimvm: + if n == 0: + T(sizeof(n) * 8) else: - static: doAssert sizeof(T) <= sizeof(uint32) - T(log2_c_compiler_vartime(uint32(n))) - -func hammingWeight*(x: uint32): uint {.inline.} = - ## Counts the set bits in integer. 
- # https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - var v = x - v = v - ((v shr 1) and 0x55555555) - v = (v and 0x33333333) + ((v shr 2) and 0x33333333) - uint(((v + (v shr 4) and 0xF0F0F0F) * 0x1010101) shr 24) - -func hammingWeight*(x: uint64): uint {.inline.} = - ## Counts the set bits in integer. - # https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - var v = x - v = v - ((v shr 1'u64) and 0x5555555555555555'u64) - v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64) - v = (v + (v shr 4'u64) and 0x0F0F0F0F0F0F0F0F'u64) - uint((v * 0x0101010101010101'u64) shr 56'u64) - -func countLeadingZeros_vartime*[T: SomeUnsignedInt](x: T): T {.inline.} = - (8*sizeof(T)) - 1 - log2_vartime(x) + when sizeof(T) == 8: + T(ctz_impl_vartime(uint64(n))) + else: + T(ctz_impl_vartime(uint32(n))) + else: + ctz_c_compiler_vartime(n) func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} = ## Returns true if n is a power of 2 @@ -121,7 +153,7 @@ func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} = ## for compile-time or explicit vartime proc only. 
(n and (n - 1)) == 0 -func nextPowerOf2_vartime*(n: uint64): uint64 {.inline.} = +func nextPowerOfTwo_vartime*(n: uint32): uint32 {.inline.} = ## Returns x if x is a power of 2 ## or the next biggest power of 2 - 1'u64 shl (log2_vartime(n-1) + 1) + 1'u32 shl (log2_vartime(n-1) + 1) \ No newline at end of file diff --git a/constantine/platforms/compilers/bitops.nim b/constantine/platforms/compilers/bitops.nim index 02a04a1..5913e9a 100644 --- a/constantine/platforms/compilers/bitops.nim +++ b/constantine/platforms/compilers/bitops.nim @@ -15,54 +15,101 @@ when GCC_Compatible: func builtin_clzll(n: uint64): cint {.importc: "__builtin_clzll", nodecl.} ## Count the number of leading zeros ## undefined if n is zero + func builtin_ctz(n: uint32): cint {.importc: "__builtin_ctz", nodecl.} + ## Count the number of trailing zeros + ## undefined if n is zero + func builtin_ctzll(n: uint64): cint {.importc: "__builtin_ctzll", nodecl.} + ## Count the number of trailing zeros + ## undefined if n is zero - func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} = + func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} = ## Compute the log2 of n using compiler builtin ## ⚠ Depending on the compiler: ## - It is undefined if n == 0 ## - It is not constant-time as a zero input is checked - cast[int](31 - cast[cuint](builtin_clz(n.uint32))) - func log2_c_compiler_vartime*(n: uint64): int {.inline.} = - ## Compute the log2 of n using compiler builtin + if n == 0: + 0 + else: + when sizeof(n) == 8: + cint(64) - builtin_clzll(n) + else: + cint(31) - builtin_clz(n.uint32) + + func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} = + ## Compute the number of trailing zeros + ## in the bit representation of n using compiler builtin ## ⚠ Depending on the compiler: ## - It is undefined if n == 0 ## - It is not constant-time as a zero input is checked - cast[int](63 - cast[cuint](builtin_clzll(n))) + if n == 0: + sizeof(n) * 8 + else: + when sizeof(n) == 
8: + builtin_ctzll(n) + else: + builtin_ctz(n.uint32) elif defined(icc): func bitScanReverse(r: var uint32, n: uint32): uint8 {.importc: "_BitScanReverse", header: "".} ## Returns 0 if n is zero and non-zero otherwise ## Returns the position of the first set bit in `r` + ## from MSB to LSB func bitScanReverse64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanReverse64", header: "".} ## Returns 0 if n is zero and non-zero otherwise ## Returns the position of the first set bit in `r` + ## from MSB to LSB + func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "".} + ## Returns 0 if n is zero and non-zero otherwise + ## Returns the position of the first set bit in `r` + ## from LSB to MSB + func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "".} + ## Returns 0 if n is zero and non-zero otherwise + ## Returns the position of the first set bit in `r` + ## from LSB to MSB - template bitscan(fnc: untyped; v: untyped): int {.inline.} = + template bitscan(fnc: untyped; v: untyped, default: static int): int {.inline.} = var index: uint32 if fnc(index.addr, v) == 0: - return 0 + return default return index.int - func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} = + func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} = ## Compute the log2 of n using compiler builtin ## ⚠ Depending on the compiler: ## - It is undefined if n == 0 ## - It is not constant-time as a zero input is checked - bitscan(bitScanReverse, c.uint32) - func log2_c_compiler_vartime*(n: uint64): int {.inline.} = - ## Compute the log2 of n using compiler builtin - ## ⚠ Depending on the compiler: - ## - It is undefined if n == 0 - ## - It is not constant-time as a zero input is checked - bitscan(bitScanReverse64, n) + when sizeof(n) == 8: + bitscan(bitScanReverse64, n, default = 0) + else: + bitscan(bitScanReverse, n.uint32, default = 0) + func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint
{.inline.} = + ## Compute the number of trailing zero bits of n using compiler builtin + ## ⚠ Depending on the compiler: + ## - It is undefined if n == 0 + ## - It is not constant-time as a zero input is checked + when sizeof(n) == 8: + bitscan(bitScanForward64, n, default = 0) + else: + bitscan(bitScanForward, n.uint32, default = 0) + elif defined(vcc): func bitScanReverse(p: ptr uint32, b: uint32): uint8 {.importc: "_BitScanReverse", header: "".} ## Returns 0 if n s no set bit and non-zero otherwise ## Returns the position of the first set bit in `r` + ## from MSB to LSB func bitScanReverse64(p: ptr uint32, b: uint64): uint8 {.importc: "_BitScanReverse64", header: "".} ## Returns 0 if n s no set bit and non-zero otherwise ## Returns the position of the first set bit in `r` + ## from MSB to LSB + func bitScanForward(r: var uint32, n: uint32): uint8 {.importc: "_BitScanForward", header: "".} + ## Returns 0 if n is zero and non-zero otherwise + ## Returns the position of the first set bit in `r` + ## from LSB to MSB + func bitScanForward64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanForward64", header: "".} + ## Returns 0 if n is zero and non-zero otherwise + ## Returns the position of the first set bit in `r` + ## from LSB to MSB template bitscan(fnc: untyped; v: untyped): int = var index: uint32 @@ -70,18 +117,25 @@ elif defined(vcc): return 0 return index.int - func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} = + func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} = ## Compute the log2 of n using compiler builtin ## ⚠ Depending on the compiler: ## - It is undefined if n == 0 ## - It is not constant-time as a zero input is checked - bitscan(bitScanReverse, c.uint32) - func log2_c_compiler_vartime*(n: uint64): int {.inline.} = - ## Compute the log2 of n using compiler builtin + when sizeof(n) == 8: + bitscan(bitScanReverse64, n, default = 0) + else: + bitscan(bitScanReverse, n.uint32, default = 0) + + func
ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} = + ## Compute the number of trailing zero bits of n using compiler builtin + ## ⚠ Depending on the compiler: + ## - It is undefined if n == 0 + ## - It is not constant-time as a zero input is checked - bitscan(bitScanReverse64, n) + when sizeof(n) == 8: + bitscan(bitScanForward64, n, default = sizeof(n) * 8) + else: + bitscan(bitScanForward, n.uint32, default = sizeof(n) * 8) else: {. error: "Unsupported compiler".} \ No newline at end of file diff --git a/constantine/platforms/threadpool/README.md b/constantine/platforms/threadpool/README.md new file mode 100644 index 0000000..1136e86 --- /dev/null +++ b/constantine/platforms/threadpool/README.md @@ -0,0 +1,38 @@ +# Constantine Threadpool + +## API + +The API spec follows https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api + +## Overview + +This implements a lightweight, energy-efficient, easily auditable multithreaded threadpool. + +The desirable properties of this threadpool are: + +- Ease of auditing and maintenance. +- Resource-efficient. Threads spin down to save power, low memory use. +- Decent performance and scalability. The CPU should spend its time processing user workloads + and not dealing with threadpool contention, latencies and overheads. + +Compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs: +- Constantine's threadpool only provides spawn/sync (task parallelism).\ + There is no (extremely) optimized parallel for (data parallelism)\ + or precise in/out dependencies (events / dataflow parallelism).
+- Constantine's threadpool has been significantly optimized to provide + overhead lower than Weave's default (and as low as Weave "lazy" + "alloca" allocation scheme) +- Constantine's threadpool provides the same adaptive scheduling strategy as Weave + with additional enhancement (leapfrogging) + +Compared to [nim-taskpools](https://github.com/status-im/nim-taskpools), here are the tradeoffs: +- Constantine does not use std/tasks: + - No external dependencies at runtime (apart from compilers, OS and drivers) + - We can replace Task with an intrusive linked list + - Furthermore we can embed tasks in their future +- Hence allocation/scheduler overhead is 3x less than nim-taskpools as we fuse the following allocations: + - Task + - The linked list of tasks + - The future (Flowvar) result channel +- Contention improvement, Constantine is entirely lock-free while nim-taskpools needs a lock+condition variable for putting threads to sleep +- Powersaving improvement, threads sleep when awaiting a task and there is no work available. +- Scheduling improvement, Constantine's threadpool incorporates Weave's adaptive scheduling policy with additional enhancement (leapfrogging) \ No newline at end of file diff --git a/constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/README.md b/constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/README.md new file mode 100644 index 0000000..99abc9c --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/README.md @@ -0,0 +1,11 @@ +# BPC (Bouncing Producer-Consumer) + +From [tasking-2.0](https://github.com/aprell/tasking-2.0) description + +> **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far +> as I know, first described by [Dinan et al][1]. There are two types of +> tasks, producer and consumer tasks. Each producer task creates another +> producer task followed by *n* consumer tasks, until a certain depth *d* is
Consumer tasks run for *t* microseconds. The smaller the values of +> *n* and *t*, the harder it becomes to exploit the available parallelism. A +> solid contender for the most antagonistic microbenchmark. diff --git a/constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim b/constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim new file mode 100644 index 0000000..f0a13fd --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/bouncing_producer_consumer/threadpool_bpc.nim @@ -0,0 +1,157 @@ +import + # STD lib + system/ansi_c, std/[os, strutils, cpuinfo, strformat, math], + # Library + ../../threadpool, + # bench + ../wtime, ../resources + +var + Depth: int32 # For example 10000 + NumTasksPerDepth: int32 # For example 9 + # The total number of tasks in the BPC benchmark is + # (NumTasksPerDepth + 1) * Depth + NumTasksTotal: int32 + TaskGranularity: int32 # in microseconds + PollInterval: float64 # in microseconds + + tp: Threadpool + +var global_poll_elapsed {.threadvar.}: float64 + +template dummy_cpt(): untyped = + # Dummy computation + # Calculate fib(30) iteratively + var + fib = 0 + f2 = 0 + f1 = 1 + for i in 2 .. 
30: + fib = f1 + f2 + f2 = f1 + f1 = fib + +proc bpc_consume(usec: int32) = + + var pollElapsed = 0'f64 + + let start = wtime_usec() + let stop = usec.float64 + global_poll_elapsed = PollInterval + + while true: + var elapsed = wtime_usec() - start + elapsed -= pollElapsed + if elapsed >= stop: + break + + dummy_cpt() + + # if elapsed >= global_poll_elapsed: + # let pollStart = wtime_usec() + # loadBalance(Weave) + # pollElapsed += wtime_usec() - pollStart + # global_poll_elapsed += PollInterval + +proc bpc_consume_nopoll(usec: int32) = + + let start = wtime_usec() + let stop = usec.float64 + + while true: + var elapsed = wtime_usec() - start + if elapsed >= stop: + break + + dummy_cpt() + +proc bpc_produce(n, d: int32) {.gcsafe.} = + if d > 0: + # Create producer task + tp.spawn bpc_produce(n, d-1) + else: + return + + # Followed by n consumer tasks + for i in 0 ..< n: + tp.spawn bpc_consume(TaskGranularity) + +proc main() = + Depth = 10000 + NumTasksPerDepth = 999 + TaskGranularity = 1 + + if paramCount() == 0: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " & + &"<# of tasks per depth: {NumTasksPerDepth}> " & + &"[task granularity (us): {TaskGranularity}] " & + &"[polling interval (us): task granularity]" + echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}" + if paramCount() >= 1: + Depth = paramStr(1).parseInt.int32 + if paramCount() >= 2: + NumTasksPerDepth = paramStr(2). parseInt.int32 + if paramCount() >= 3: + TaskGranularity = paramStr(3). 
parseInt.int32 + if paramCount() == 4: + PollInterval = paramStr(4).parseInt.float64 + else: + PollInterval = TaskGranularity.float64 + if paramCount() > 4: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " & + &"<# of tasks per depth: {NumTasksPerDepth}> " & + &"[task granularity (us): {TaskGranularity}] " & + &"[polling interval (us): task granularity]" + quit 1 + + NumTasksTotal = (NumTasksPerDepth + 1) * Depth + + var nthreads: int + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + nthreads = countProcessors() + + tp = Threadpool.new(numThreads = nthreads) + + # measure overhead during tasking + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + let start = wtime_msec() + + bpc_produce(NumTasksPerDepth, Depth) + tp.syncAll() + + let stop = wtime_msec() + + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + tp.shutdown() + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: BPC (Bouncing Producer-Consumer)" + echo "Threads: ", nthreads + echo "Time(ms) ", round(stop - start, 3) + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + echo "--------------------------------------------------------------------------" + echo "# of tasks: ", NumTasksTotal + echo "# of tasks/depth: ", NumTasksPerDepth + echo "Depth: ", Depth + echo "Task granularity (us): ", TaskGranularity + echo "Polling / manual load balancing interval (us): ", PollInterval + echo "--------------------------------------------------------------------------" + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim b/constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim new file mode 100644 index 0000000..afe0e48 --- /dev/null +++ 
b/constantine/platforms/threadpool/benchmarks/dfs/threadpool_dfs.nim @@ -0,0 +1,86 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Stdlib + system/ansi_c, std/[strformat, os, strutils, cpuinfo], + # Library + ../../threadpool + +when not defined(windows): + # bench + import ../wtime + +var tp: Threadpool + +proc dfs(depth, breadth: int): uint32 {.gcsafe.} = + if depth == 0: + return 1 + + # We could use alloca to avoid heap allocation here + var sums = newSeq[Flowvar[uint32]](breadth) + + for i in 0 ..< breadth: + sums[i] = tp.spawn dfs(depth - 1, breadth) + + for i in 0 ..< breadth: + result += sync(sums[i]) + +proc test(depth, breadth: int): uint32 = + result = sync tp.spawn dfs(depth, breadth) + +proc main() = + + var + depth = 8 + breadth = 8 + answer: uint32 + nthreads: int + + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + nthreads = countProcessors() + + if paramCount() == 0: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + echo &"Running with default config depth = {depth} and breadth = {breadth}" + + if paramCount() >= 1: + depth = paramStr(1).parseInt() + if paramCount() == 2: + breadth = paramStr(2).parseInt() + if paramCount() > 2: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + echo &"Up to 2 parameters are valid. 
Received {paramCount()}" + quit 1 + + # Staccato benches runtime init and exit as well + when not defined(windows): + let start = wtime_usec() + + tp = Threadpool.new() + answer = test(depth, breadth) + tp.shutdown() + + when not defined(windows): + let stop = wtime_usec() + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: dfs" + echo "Threads: ", nthreads + when not defined(windows): + echo "Time(us) ", stop - start + echo "Output: ", answer + echo "--------------------------------------------------------------------------" + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/fibonacci/README.md b/constantine/platforms/threadpool/benchmarks/fibonacci/README.md new file mode 100644 index 0000000..217376c --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/fibonacci/README.md @@ -0,0 +1,77 @@ +# Fibonacci benchmarks + +⚠️ Disclaimer: + Please don't use parallel fibonacci in production! + Use the fast doubling method with memoization instead. + +Fibonacci benchmark has 3 draws: + +1. It's very simple to implement +2. It's unbalanced and efficiency requires distributions to avoid idle cores. +3. It's a very effective scheduler overhead benchmark, because the basic task is very trivial and the task spawning grows at 2^n scale. + +Want to know the difference between low and high overhead? 
+ +Run the following C code (taken from [Oracle OpenMP example](https://docs.oracle.com/cd/E19205-01/820-7883/girtd/index.html)) + +```C +#include +#include +int fib(int n) +{ + int i, j; + if (n<2) + return n; + else + { + #pragma omp task shared(i) firstprivate(n) + { + i=fib(n-1); + } + + j=fib(n-2); + #pragma omp taskwait + return i+j; + } +} + +int main() +{ + int n = 40; + + #pragma omp parallel shared(n) + { + #pragma omp single + printf ("fib(%d) = %d\n", n, fib(n)); + } +} +``` + +First compile with Clang and run it +``` +clang -O3 -fopenmp benchmarks/fibonacci/omp_fib.c +time a.out +``` +It should be fairly quick + + +Then compile with GCC and run it +``` +gcc -O3 -fopenmp benchmarks/fibonacci/omp_fib.c +time a.out +``` + +Notice how some cores get idle as time goes on? +Don't forget to kill the benchmark, you'll be there all day. + +What's happening? + +GCC's OpenMP implementation uses a single queue for all tasks. +That queue gets constantly hammered by all threads and becomes a contention point. +Furthermore, it seems like there is no load balancing or that due to the contention/lock +threads are descheduled. + +However Clang implementation uses a work-stealing scheduler with one queue per thread. +The only contention happens when a thread run out of work and has to look for more work, +in the queue of other threads. And which thread to check is chosen at random so +the potential contention is distributed among all threads instead of a single structure. 
diff --git a/constantine/platforms/threadpool/benchmarks/fibonacci/stdnim_fib.nim b/constantine/platforms/threadpool/benchmarks/fibonacci/stdnim_fib.nim new file mode 100644 index 0000000..980457f --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/fibonacci/stdnim_fib.nim @@ -0,0 +1,35 @@ +import + # STD lib + std/[os, strutils, threadpool, strformat], + # bench + ../wtime + +# Using Nim's standard threadpool +# Compile with "nim c --threads:on -d:release -d:danger --outdir:build benchmarks/fibonacci/stdnim_fib.nim" +# +# Note: it breaks at fib 16. + +proc parfib(n: uint64): uint64 = + if n < 2: # Note: be sure to compare n<2 -> return n + return n # instead of n<=2 -> return 1 + + let x = spawn parfib(n-1) + let y = parfib(n-2) + + return ^x + y + +proc main() = + if paramCount() != 1: + echo "Usage: fib " + quit 0 + + let n = paramStr(1).parseUInt.uint64 + + let start = wtime_msec() + let f = parfib(n) + let stop = wtime_msec() + + echo "Result: ", f + echo &"Elapsed wall time: {stop-start:.2} ms" + +main() diff --git a/constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim b/constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim new file mode 100644 index 0000000..de5ea20 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/fibonacci/threadpool_fib.nim @@ -0,0 +1,79 @@ +import + # STD lib + std/[os, strutils, cpuinfo, strformat, math], + # Library + ../../threadpool + +when not defined(windows): + # bench + import ../wtime, ../resources + +var tp: Threadpool + +proc fib(n: int): int = + # int64 on x86-64 + if n < 2: + return n + + let x = tp.spawn fib(n-1) + let y = fib(n-2) + + result = sync(x) + y + +proc main() = + var n = 40 + var nthreads: int + + if paramCount() == 0: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + echo &"Running with default n = {n}" + elif paramCount() == 1: + n = paramStr(1).parseInt + else: + let exeName = getAppFilename().extractFilename() + 
echo &"Usage: {exeName} " + quit 1 + + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + nthreads = countProcessors() + + tp = Threadpool.new() + + # measure overhead during tasking + when not defined(windows): + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + let start = wtime_msec() + let f = fib(n) + + when not defined(windows): + let stop = wtime_msec() + + tp.shutdown() + + when not defined(windows): + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: Fibonacci" + echo "Threads: ", nthreads + when not defined(windows): + echo "Time(ms) ", round(stop - start, 3) + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + echo "--------------------------------------------------------------------------" + echo "n requested: ", n + echo "result: ", f + +main() diff --git a/constantine/platforms/threadpool/benchmarks/heat/stdnim_heat.nim b/constantine/platforms/threadpool/benchmarks/heat/stdnim_heat.nim new file mode 100644 index 0000000..155d994 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/heat/stdnim_heat.nim @@ -0,0 +1,300 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +# From fibril +# +# Original license +# +# /* +# * Heat diffusion (Jacobi-type iteration) +# * +# * Volker Strumpen, Boston August 1996 +# * +# * Copyright (c) 1996 Massachusetts Institute of Technology +# * +# * This program is free software; you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation; either version 2 of the License, or +# * (at your option) any later version. +# * +# * This program is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with this program; if not, write to the Free Software +# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# */ + +import + # Stdlib + system/ansi_c, std/[strformat, os, strutils, math, cpuinfo], + std/threadpool, + # bench + ../wtime, ../resources + +# This deadlocks :/ + +# Helpers +# ------------------------------------------------------- + +# We need a thin wrapper around raw pointers for matrices, +# we can't pass "var seq[seq[float64]]" to other threads +# nor "var" for that matter +type + Matrix[T] = object + buffer: ptr UncheckedArray[T] + m, n: int + + Row[T] = object + buffer: ptr UncheckedArray[T] + len: int + +func newMatrix[T](m, n: int): Matrix[T] {.inline.} = + result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T))) + result.m = m + result.n = n + +template `[]`[T](mat: Matrix[T], row, col: Natural): T = + # row-major storage + assert row < mat.m + assert col < mat.n + mat.buffer[row * mat.n + col] + +template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) = + assert row < mat.m + assert col < mat.n + mat.buffer[row * mat.n + col] = value + +func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] 
{.inline.} = + # row-major storage, there are n columns in between each rows + assert rowIdx < mat.m + result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr) + # A row holds one element per column: its length is mat.n, not mat.m, + # otherwise the `idx < row.len` bounds asserts are too permissive. + result.len = mat.n + +template `[]`[T](row: Row[T], idx: Natural): T = + assert idx < row.len + row.buffer[idx] + +template `[]=`[T](row: Row[T], idx: Natural, value: T) = + assert idx < row.len + row.buffer[idx] = value + +func delete[T](mat: sink Matrix[T]) = + c_free(mat.buffer) + +# And an auto converter for int32 -> float64 so we don't have to convert +# all i, j indices manually + +converter i32toF64(x: int32): float64 {.inline.} = + float64(x) + +# ------------------------------------------------------- + +template f(x, y: SomeFloat): SomeFloat = + sin(x) * sin(y) + +template randa[T: SomeFloat](x, t: T): T = + T(0.0) + +proc randb(x, t: SomeFloat): SomeFloat {.inline.} = + # proc instead of template to avoid Nim constant folding bug: + # https://github.com/nim-lang/Nim/issues/12783 + exp(-2 * t) * sin(x) + +template randc[T: SomeFloat](y, t: T): T = + T(0.0) + +proc randd(y, t: SomeFloat): SomeFloat {.inline.} = + # proc instead of template to avoid Nim constant folding bug: + # https://github.com/nim-lang/Nim/issues/12783 + exp(-2 * t) * sin(y) + +template solu(x, y, t: SomeFloat): SomeFloat = + exp(-2 * t) * sin(x) * sin(y) + +const n = 4096'i32 + +var + nx, ny, nt: int32 + xu, xo, yu, yo, tu, to: float64 + + dx, dy, dt: float64 + dtdxsq, dtdysq: float64 + + odd: Matrix[float64] + even: Matrix[float64] + +proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}= + # TODO to allow awaiting `heat` we return a dummy bool + # The parallel spawns are updating the same matrix cells otherwise + if iu - il > 1: + let im = (il + iu) div 2 + + let h = spawn heat(m, il, im) + heat(m, im, iu) + discard ^h + return true + # ------------------------ + + let i = il + let row = m.getRow(i) + + if i == 0: + for j in 0 ..< ny: + row[j] = randc(yu + j*dy, 0) + elif i == nx - 1: + for j
in 0 ..< ny: + row[j] = randd(yu + j*dy, 0) + else: + row[0] = randa(xu + i*dx, 0) + for j in 1 ..< ny - 1: + row[j] = f(xu + i*dx, yu + j*dy) + row[ny - 1] = randb(xu + i*dx, 0) + +proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} = + # TODO to allow awaiting `diffuse` we return a dummy bool + # The parallel spawns are updating the same matrix cells otherwise + if iu - il > 1: + let im = (il + iu) div 2 + + let d = spawn diffuse(output, input, il, im, t) + diffuse(output, input, im, iu, t) + discard ^d + return true + # ------------------------ + + let i = il + let row = output.getRow(i) + + if i == 0: + for j in 0 ..< ny: + row[j] = randc(yu + j*dy, t) + elif i == nx - 1: + for j in 0 ..< ny: + row[j] = randd(yu + j*dy, t) + else: + row[0] = randa(xu + i*dx, t) + for j in 1 ..< ny - 1: + row[j] = input[i, j] + # The use of nested sequences here is a bad idea ... + dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) + + dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j]) + row[ny - 1] = randb(xu + i*dx, t) + +proc initTest() = + nx = n + ny = 1024 + nt = 100 + xu = 0.0 + xo = 1.570796326794896558 + yu = 0.0 + yo = 1.570796326794896558 + tu = 0.0 + to = 0.0000001 + + dx = (xo - xu) / float64(nx - 1) + dy = (yo - yu) / float64(ny - 1) + dt = (to - tu) / float64(nt) + + dtdxsq = dt / (dx * dx) + dtdysq = dt / (dy * dy) + + even = newMatrix[float64](nx, ny) + odd = newMatrix[float64](nx, ny) + +proc prep() = + heat(even, 0, nx) + +proc test() = + var t = tu + + for _ in countup(1, nt.int, 2): + # nt included + t += dt + diffuse(odd, even, 0, nx, t) + t += dt + diffuse(even, odd, 0, nx, t) + + if nt mod 2 != 0: + t += dt + diffuse(odd, even, 0, nx, t) + +proc verify() = + var + mat: Matrix[float64] + mae: float64 + mre: float64 + me: float64 + + mat = if nt mod 2 != 0: odd else: even + + for a in 0 ..< nx: + for b in 0 ..< ny: + var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to)) + if tmp 
> 1e-3: + echo "nx: ", nx, " - ny: ", ny + echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to) + quit 1 + + me += tmp + if tmp > mae: mae = tmp + if mat[a, b] != 0.0: tmp /= mat[a, b] + if tmp > mre: mre = tmp + + me /= nx * ny + + if mae > 1e-12: + echo &"Local maximal absolute error {mae:1.3e}" + quit 1 + if mre > 1e-12: + echo &"Local maximal relative error {mre:1.3e}" + quit 1 + if me > 1e-12: + echo &"Global mean absolute error {me:1.3e}" + quit 1 + + echo "Verification successful" + +proc main() = + var nthreads: int + nthreads = countProcessors() + + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + initTest() + + prep() + let start = wtime_usec() + test() + let stop = wtime_usec() + + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + sync() + + verify() + delete(even) + delete(odd) + + echo "Scheduler: Nim threadpool (standard lib)" + echo "Benchmark: heat" + echo "Threads: ", nthreads + echo "Time(us) ", stop - start + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim b/constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim new file mode 100644 index 0000000..ca36067 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/heat/threadpool_heat.nim @@ -0,0 +1,314 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +# From fibril +# +# Original license +# +# /* +# * Heat diffusion (Jacobi-type iteration) +# * +# * Volker Strumpen, Boston August 1996 +# * +# * Copyright (c) 1996 Massachusetts Institute of Technology +# * +# * This program is free software; you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation; either version 2 of the License, or +# * (at your option) any later version. +# * +# * This program is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with this program; if not, write to the Free Software +# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# */ + +import + # Stdlib + system/ansi_c, std/[strformat, os, strutils, math, cpuinfo], + # Library + ../../threadpool +when not defined(windows): + # bench + import ../wtime, ../resources + +# Helpers +# ------------------------------------------------------- + +# We need a thin wrapper around raw pointers for matrices, +# we can't pass "var seq[seq[float64]]" to other threads +# nor "var" for that matter +type + Matrix[T] = object + buffer: ptr UncheckedArray[T] + m, n: int + + Row[T] = object + buffer: ptr UncheckedArray[T] + len: int + +var tp: Threadpool + +func newMatrix[T](m, n: int): Matrix[T] {.inline.} = + result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T))) + result.m = m + result.n = n + +template `[]`[T](mat: Matrix[T], row, col: Natural): T = + # row-major storage + assert row < mat.m + assert col < mat.n + mat.buffer[row * mat.n + col] + +template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) = + assert row < mat.m + assert col < mat.n + mat.buffer[row * mat.n + col] = value + +func 
getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} = + # row-major storage, there are n columns in between each rows + assert rowIdx < mat.m + result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr) + result.len = mat.m + +template `[]`[T](row: Row[T], idx: Natural): T = + assert idx < row.len + row.buffer[idx] + +template `[]=`[T](row: Row[T], idx: Natural, value: T) = + assert idx < row.len + row.buffer[idx] = value + +func delete[T](mat: sink Matrix[T]) = + c_free(mat.buffer) + +# And an auto converter for int32 -> float64 so we don't have to convert +# all i, j indices manually + +converter i32toF64(x: int32): float64 {.inline.} = + float64(x) + +# ------------------------------------------------------- + +template f(x, y: SomeFloat): SomeFloat = + sin(x) * sin(y) + +template randa[T: SomeFloat](x, t: T): T = + T(0.0) + +proc randb(x, t: SomeFloat): SomeFloat {.inline.} = + # proc instead of template to avoid Nim constant folding bug: + # https://github.com/nim-lang/Nim/issues/12783 + exp(-2 * t) * sin(x) + +template randc[T: SomeFloat](y, t: T): T = + T(0.0) + +proc randd(y, t: SomeFloat): SomeFloat {.inline.} = + # proc instead of template to avoid Nim constant folding bug: + # https://github.com/nim-lang/Nim/issues/12783 + exp(-2 * t) * sin(y) + +template solu(x, y, t: SomeFloat): SomeFloat = + exp(-2 * t) * sin(x) * sin(y) + +const n = 4096'i32 + +var + nx, ny, nt: int32 + xu, xo, yu, yo, tu, to: float64 + + dx, dy, dt: float64 + dtdxsq, dtdysq: float64 + + odd: Matrix[float64] + even: Matrix[float64] + +proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable, gcsafe.}= + # TODO to allow awaiting `heat` we return a dummy bool + # The parallel spawns are updating the same matrix cells otherwise + if iu - il > 1: + let im = (il + iu) div 2 + + let h = tp.spawn heat(m, il, im) + heat(m, im, iu) + discard sync(h) + return true + # ------------------------ + + let i = il + let row = m.getRow(i) + + if i == 0: + for j in 0 
..< ny: + row[j] = randc(yu + j*dy, 0) + elif i == nx - 1: + for j in 0 ..< ny: + row[j] = randd(yu + j*dy, 0) + else: + row[0] = randa(xu + i*dx, 0) + for j in 1 ..< ny - 1: + row[j] = f(xu + i*dx, yu + j*dy) + row[ny - 1] = randb(xu + i*dx, 0) + +proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable, gcsafe.} = + # TODO to allow awaiting `diffuse` we return a dummy bool + # The parallel spawns are updating the same matrix cells otherwise + if iu - il > 1: + let im = (il + iu) div 2 + + let d = tp.spawn diffuse(output, input, il, im, t) + diffuse(output, input, im, iu, t) + discard sync(d) + return true + # ------------------------ + + let i = il + let row = output.getRow(i) + + if i == 0: + for j in 0 ..< ny: + row[j] = randc(yu + j*dy, t) + elif i == nx - 1: + for j in 0 ..< ny: + row[j] = randd(yu + j*dy, t) + else: + row[0] = randa(xu + i*dx, t) + for j in 1 ..< ny - 1: + row[j] = input[i, j] + # The use of nested sequences here is a bad idea ... 
+ dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) + + dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j]) + row[ny - 1] = randb(xu + i*dx, t) + +proc initTest() = + nx = n + ny = 1024 + nt = 100 + xu = 0.0 + xo = 1.570796326794896558 + yu = 0.0 + yo = 1.570796326794896558 + tu = 0.0 + to = 0.0000001 + + dx = (xo - xu) / float64(nx - 1) + dy = (yo - yu) / float64(ny - 1) + dt = (to - tu) / float64(nt) + + dtdxsq = dt / (dx * dx) + dtdysq = dt / (dy * dy) + + even = newMatrix[float64](nx, ny) + odd = newMatrix[float64](nx, ny) + +proc prep() = + heat(even, 0, nx) + +proc test() = + var t = tu + + for _ in countup(1, nt.int, 2): + # nt included + t += dt + diffuse(odd, even, 0, nx, t) + t += dt + diffuse(even, odd, 0, nx, t) + + if nt mod 2 != 0: + t += dt + diffuse(odd, even, 0, nx, t) + +proc verify() = + var + mat: Matrix[float64] + mae: float64 + mre: float64 + me: float64 + + mat = if nt mod 2 != 0: odd else: even + + for a in 0 ..< nx: + for b in 0 ..< ny: + var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to)) + if tmp > 1e-3: + echo "nx: ", nx, " - ny: ", ny + echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to) + quit 1 + + me += tmp + if tmp > mae: mae = tmp + if mat[a, b] != 0.0: tmp /= mat[a, b] + if tmp > mre: mre = tmp + + me /= nx * ny + + if mae > 1e-12: + echo &"Local maximal absolute error {mae:1.3e}" + quit 1 + if mre > 1e-12: + echo &"Local maximal relative error {mre:1.3e}" + quit 1 + if me > 1e-12: + echo &"Global mean absolute error {me:1.3e}" + quit 1 + + echo "Heat: Verification successful" + +proc main() = + var nthreads: int + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + nthreads = countProcessors() + + when not defined(windows): + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + initTest() + + # Fibril initializes before benching + tp = Threadpool.new(numThreads = nthreads) + + prep() + 
when not defined(windows): + let start = wtime_usec() + test() + when not defined(windows): + let stop = wtime_usec() + + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + tp.shutdown() + + verify() + delete(even) + delete(odd) + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: heat" + echo "Threads: ", nthreads + when not defined(windows): + echo "Time(us) ", stop - start + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + echo "--------------------------------------------------------------------------" + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/README.md b/constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/README.md new file mode 100644 index 0000000..dae7756 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/README.md @@ -0,0 +1,12 @@ +# Cache-Oblivious Matrix Multiplication + +From Staccato and Cilk + +https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk +See the paper ``Cache-Oblivious Algorithms'', by +Matteo Frigo, Charles E. Leiserson, Harald Prokop, and +Sridhar Ramachandran, FOCS 1999, for an explanation of +why this algorithm is good for caches. + +Note that the benchmarks output incorrect matrix traces +according to the check ... 
diff --git a/constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim b/constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim new file mode 100644 index 0000000..32307e4 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim @@ -0,0 +1,214 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# Rectangular matrix multiplication. +# +# Adapted from Cilk 5.4.3 example +# +# https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; +# See the paper ``Cache-Oblivious Algorithms'', by +# Matteo Frigo, Charles E. Leiserson, Harald Prokop, and +# Sridhar Ramachandran, FOCS 1999, for an explanation of +# why this algorithm is good for caches. 
+ +import + # Stdlib + system/ansi_c, std/[strformat, os, strutils, math, cpuinfo], + # Library + ../../threadpool, + # bench + ../wtime, ../resources + +# Helpers +# ------------------------------------------------------- + +# We need a thin wrapper around raw pointers for matrices, +# we can't pass "var" to other threads +type + Matrix[T: SomeFloat] = object + buffer: ptr UncheckedArray[T] + ld: int + +var tp: Threadpool + +func newMatrixNxN[T](n: int): Matrix[T] {.inline.} = + result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t n*n*sizeof(T))) + result.ld = n + +template `[]`[T](mat: Matrix[T], row, col: Natural): T = + # row-major storage + assert row < mat.ld, $i & " < " & $mat.ld + assert col < mat.ld, $i & " < " & $mat.ld + mat.buffer[row * mat.ld + col] + +template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) = + assert row < mat.ld, $i & " < " & $mat.ld + assert col < mat.ld, $i & " < " & $mat.ld + mat.buffer[row * mat.ld + col] = value + +func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}= + ## Returns a new view offset by the row and column stride + result.buffer = cast[ptr UncheckedArray[T]]( + addr mat.buffer[row*mat.ld + col] + ) + +func delete[T](mat: sink Matrix[T]) = + c_free(mat.buffer) + +# ------------------------------------------------------- + +proc xorshiftRand(): uint32 = + var x {.global.} = uint32(2463534242) + x = x xor (x shr 13) + x = x xor (x shl 17) + x = x xor (x shr 5) + return x + +func zero[T](A: Matrix[T]) = + # zeroing is not timed + zeroMem(A.buffer, A.ld * A.ld * sizeof(T)) + +proc fill[T](A: Matrix[T]) = + for i in 0 ..< A.ld: + for j in 0 ..< A.ld: + A[i, j] = T(xorshiftRand() mod A.ld.uint32) + +func maxError(A, B: Matrix): float64 = + assert A.ld == B.ld + for i in 0 ..< A.ld: + for j in 0 ..< A.ld: + var diff = (A[i, j] - B[i, j]) / A[i, j] + if diff < 0: + diff = -diff + if diff > result: + result = diff + +func check[T](A, B, C: Matrix[T], n: int): bool = + var + tr_C = 0.T + 
tr_AB = 0.T + for i in 0 ..< n: + for j in 0 ..< n: + tr_AB += A[i, j] * B[j, i] + tr_C += C[i, i] + + # Note, all benchmarks return false ‾\_(ツ)_/‾ + return abs(tr_AB - tr_C) < 1e-3 + +proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool = + # The original bench passes around a ``ld`` parameter (leading dimension?), + # we store it in the matrices + # We return a dummy bool to allow waiting on the matmul + + # Threshold + if (m + n + p) <= 64: + if add: + for i in 0 ..< m: + for k in 0 ..< p: + var c = 0.T + for j in 0 ..< n: + c += A[i, j] * B[j, k] + C[i, k] += c + else: + for i in 0 ..< m: + for k in 0 ..< p: + var c = 0.T + for j in 0 ..< n: + c += A[i, j] * B[j, k] + C[i, k] = c + + return + + var h0, h1: FlowVar[bool] + ## Each half of the computation + + # matrix is larger than threshold + if m >= n and n >= p: + let m1 = m shr 1 # divide by 2 + h0 = tp.spawn matmul(A, B, C, m1, n, p, add) + h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add) + elif n >= m and n >= p: + let n1 = n shr 1 # divide by 2 + h0 = tp.spawn matmul(A, B, C, m, n1, p, add) + h1 = tp.spawn matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true) + else: + let p1 = p shr 1 + h0 = tp.spawn matmul(A, B, C, m, n, p1, add) + h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add) + + discard sync(h0) + discard sync(h1) + +proc main() = + echo "Warning the benchmark seems to not be correct." + var + n = 3000 + nthreads: int + + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + nthreads = countProcessors() + + if paramCount() == 0: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + echo &"Running with default config n = {n}" + elif paramCount() == 1: + n = paramStr(1).parseInt() + else: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + echo &"Up to 1 parameter is valid. 
Received {paramCount()}" + quit 1 + + var A = newMatrixNxN[float32](n) + var B = newMatrixNxN[float32](n) + var C = newMatrixNxN[float32](n) + + fill(A) + fill(B) + zero(C) + + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + # Staccato benches runtime init and exit as well + let start = wtime_msec() + + tp = Threadpool.new(numThreads = nthreads) + discard sync tp.spawn matmul(A, B, C, n, n, n, add = false) + tp.shutdown() + + let stop = wtime_msec() + + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: Matrix Multiplication (cache oblivious)" + echo "Threads: ", nthreads + echo "Time(ms) ", stop - start + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + echo "Input: ", n + echo "Error: ", check(A, B, C, n) + echo "--------------------------------------------------------------------------" + + delete A + delete B + delete C + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/nqueens/stdnim_nqueens.nim b/constantine/platforms/threadpool/benchmarks/nqueens/stdnim_nqueens.nim new file mode 100644 index 0000000..e73159e --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/nqueens/stdnim_nqueens.nim @@ -0,0 +1,181 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+# +# Original code licenses +# ------------------------------------------------------------------------------------------------ +# +# /**********************************************************************************************/ +# /* This program is part of the Barcelona OpenMP Tasks Suite */ +# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +# /* */ +# /* This program is free software; you can redistribute it and/or modify */ +# /* it under the terms of the GNU General Public License as published by */ +# /* the Free Software Foundation; either version 2 of the License, or */ +# /* (at your option) any later version. */ +# /* */ +# /* This program is distributed in the hope that it will be useful, */ +# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +# /* GNU General Public License for more details. 
*/ +# /* */ +# /* You should have received a copy of the GNU General Public License */ +# /* along with this program; if not, write to the Free Software */ +# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +# /**********************************************************************************************/ +# +# /* +# * Original code from the Cilk project (by Keith Randall) +# * +# * Copyright (c) 2000 Massachusetts Institute of Technology +# * Copyright (c) 2000 Matteo Frigo +# */ + +import + # Stdlib + system/ansi_c, std/[strformat, os, strutils], + std/threadpool, + # bench + ../wtime + +# This deadlocks :/ + +# Nim helpers +# ------------------------------------------------- + +when defined(windows): + proc alloca(size: csize): pointer {.header: "".} +else: + proc alloca(size: csize): pointer {.header: "".} + +template alloca*(T: typedesc): ptr T = + cast[ptr T](alloca(sizeof(T))) + +template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] = + cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) + +proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} = + cast[type result](c_malloc(csize_t len*sizeof(T))) + +proc tp_free*[T: ptr](p: T) {.inline.} = + c_free(p) + +# We assume that Nim zeroMem vs C memset +# and Nim copyMem vs C memcpy have no difference +# Nim does have extra checks to handle GC-ed types +# but they should be eliminated by the Nim compiler. + +# ------------------------------------------------- + +type CharArray = ptr UncheckedArray[char] + +var example_solution: ptr UncheckedArray[char] + +func isValid(n: int32, a: CharArray): bool = + ## `a` contains an array of `n` queen positions. + ## Returns true if none of the queens conflict and 0 otherwise. 
+ + for i in 0'i32 ..< n: + let p = cast[int32](a[i]) + + for j in i+1 ..< n: + let q = cast[int32](a[j]) + if q == p or q == p - (j-i) or q == p + (j-i): + return false + return true + +proc nqueens_ser(n, j: int32, a: CharArray): int32 = + # Serial nqueens + if n == j: + # Good solution count it + if example_solution.isNil: + example_solution = tp_alloc(char, n) + copyMem(example_solution, a, n * sizeof(char)) + return 1 + + # Try each possible position for queen `j` + for i in 0 ..< n: + a[j] = cast[char](i) + if isValid(j+1, a): + result += nqueens_ser(n, j+1, a) + +proc nqueens_par(n, j: int32, a: CharArray): int32 = + + if n == j: + # Good solution, count it + return 1 + + var localCounts = alloca(Flowvar[int32], n) + zeroMem(localCounts, n * sizeof(Flowvar[int32])) + + # Try each position for queen `j` + for i in 0 ..< n: + var b = alloca(char, j+1) + copyMem(b, a, j * sizeof(char)) + b[j] = cast[char](i) + if isValid(j+1, b): + localCounts[i] = spawn nqueens_par(n, j+1, b) + + for i in 0 ..< n: + if not localCounts[i].isNil(): + result += ^localCounts[i] + +const solutions = [ + 1, + 0, + 0, + 2, + 10, # 5x5 + 4, + 10, + 92, # 8x8 + 352, + 724, # 10x10 + 2680, + 14200, + 73712, + 365596, + 2279184, # 15x15 + 14772512 +] + +proc verifyQueens(n, res: int32) = + if n > solutions.len: + echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]" + return + + if res != solutions[n-1]: + echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}" + +proc main() = + if paramCount() != 1: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + quit 0 + + let n = paramStr(1).parseInt.int32 + + if n notin 1 .. 
solutions.len: + echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]" + quit 1 + + + let start = wtime_msec() + let count = nqueens_par(n, 0, alloca(char, n)) + let stop = wtime_msec() + + verifyQueens(n, count) + + if not example_solution.isNil: + stdout.write("Example solution: ") + for i in 0 ..< n: + c_printf("%2d ", example_solution[i]) + stdout.write('\n') + + echo &"Elapsed wall time: {stop-start:2.4f} ms" + +main() diff --git a/constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim b/constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim new file mode 100644 index 0000000..93fcd0b --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim @@ -0,0 +1,231 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. +# +# Original code licenses +# ------------------------------------------------------------------------------------------------ +# +# /**********************************************************************************************/ +# /* This program is part of the Barcelona OpenMP Tasks Suite */ +# /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +# /* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +# /* */ +# /* This program is free software; you can redistribute it and/or modify */ +# /* it under the terms of the GNU General Public License as published by */ +# /* the Free Software Foundation; either version 2 of the License, or */ +# /* (at your option) any later version. 
*/ +# /* */ +# /* This program is distributed in the hope that it will be useful, */ +# /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +# /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +# /* GNU General Public License for more details. */ +# /* */ +# /* You should have received a copy of the GNU General Public License */ +# /* along with this program; if not, write to the Free Software */ +# /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +# /**********************************************************************************************/ +# +# /* +# * Original code from the Cilk project (by Keith Randall) +# * +# * Copyright (c) 2000 Massachusetts Institute of Technology +# * Copyright (c) 2000 Matteo Frigo +# */ + +import + # Stdlib + system/ansi_c, std/[strformat, os, strutils, cpuinfo], + # library + ../../threadpool + +when not defined(windows): + # bench + import ../wtime, ../resources + +# Nim helpers +# ------------------------------------------------- + +when defined(windows): + proc alloca(size: int): pointer {.header: "".} +else: + proc alloca(size: int): pointer {.header: "".} + +template alloca*(T: typedesc): ptr T = + cast[ptr T](alloca(sizeof(T))) + +template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] = + cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) + +proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} = + when defined(TP_useNimAlloc): + cast[type result](createSharedU(T, len)) + else: + cast[type result](c_malloc(csize_t len*sizeof(T))) + +proc tp_free*[T: ptr](p: T) {.inline.} = + when defined(TP_useNimAlloc): + freeShared(p) + else: + c_free(p) + +# We assume that Nim zeroMem vs C memset +# and Nim copyMem vs C memcpy have no difference +# Nim does have extra checks to handle GC-ed types +# but they should be eliminated by the Nim compiler. 
+ +# ------------------------------------------------- + +type CharArray = ptr UncheckedArray[char] + +var tp: Threadpool +var example_solution: ptr UncheckedArray[char] + +func isValid(n: int32, a: CharArray): bool = + ## `a` contains an array of `n` queen positions. + ## Returns true if none of the queens conflict and 0 otherwise. + + for i in 0'i32 ..< n: + let p = cast[int32](a[i]) + + for j in i+1 ..< n: + let q = cast[int32](a[j]) + if q == p or q == p - (j-i) or q == p + (j-i): + return false + return true + +proc nqueens_ser(n, j: int32, a: CharArray): int32 = + # Serial nqueens + if n == j: + # Good solution count it + if example_solution.isNil: + example_solution = tp_alloc(char, n) + copyMem(example_solution, a, n * sizeof(char)) + return 1 + + # Try each possible position for queen `j` + for i in 0 ..< n: + a[j] = cast[char](i) + if isValid(j+1, a): + result += nqueens_ser(n, j+1, a) + +proc nqueens_par(n, j: int32, a: CharArray): int32 {.gcsafe.} = + + if n == j: + # Good solution, count it + return 1 + + var localCounts = alloca(Flowvar[int32], n) + zeroMem(localCounts, n * sizeof(Flowvar[int32])) + + # Try each position for queen `j` + for i in 0 ..< n: + var b = alloca(char, j+1) + copyMem(b, a, j * sizeof(char)) + b[j] = cast[char](i) + if isValid(j+1, b): + localCounts[i] = tp.spawn nqueens_par(n, j+1, b) + + for i in 0 ..< n: + if localCounts[i].isSpawned(): + result += sync(localCounts[i]) + +const solutions = [ + 1, + 0, + 0, + 2, + 10, # 5x5 + 4, + 10, + 92, # 8x8 + 352, + 724, # 10x10 + 2680, + 14200, + 73712, + 365596, + 2279184, # 15x15 + 14772512 +] + +proc verifyQueens(n, res: int32) = + if n > solutions.len: + echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]" + return + + if res != solutions[n-1]: + echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}" + +proc main() = + var + n = 11'i32 + nthreads: int + + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + 
nthreads = countProcessors() + + if paramCount() == 0: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} " + echo &"Running with default config N = {n}\n" + + if paramCount() >= 1: + n = paramStr(1).parseInt.int32 + + if n notin 1 .. solutions.len: + echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]" + quit 1 + + when not defined(windows): + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + tp = Threadpool.new(numThreads = nthreads) + + when not defined(windows): + let start = wtime_msec() + + let count = nqueens_par(n, 0, alloca(char, n)) + + when not defined(windows): + let stop = wtime_msec() + + when not defined(windows): + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + tp.shutdown() + + verifyQueens(n, count) + + if not example_solution.isNil: + stdout.write("Example solution: ") + for i in 0 ..< n: + c_printf("%2d ", example_solution[i]) + stdout.write('\n') + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: N-queens" + echo "Threads: ", nthreads + when not defined(windows): + echo "Time(us) ", stop - start + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + echo "Problem size: ", n,"x",n, " board with ",n, " queens" + echo "Solutions found: ", count + echo "--------------------------------------------------------------------------" + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/resources.nim b/constantine/platforms/threadpool/benchmarks/resources.nim new file mode 100644 index 0000000..4441e47 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/resources.nim @@ -0,0 +1,24 @@ +type + Timeval {.importc: "timeval", header:"", bycopy.} = object + + Rusage* {.importc: "struct rusage", header:"", bycopy.} = object + ru_utime 
{.importc.}: Timeval + ru_stime {.importc.}: Timeval + ru_maxrss* {.importc.}: int32 # Maximum resident set size + # ... + ru_minflt* {.importc.}: int32 # page reclaims (soft page faults) + + RusageWho* {.size: sizeof(cint).} = enum + RusageChildren = -1 + RusageSelf = 0 + RusageThread = 1 + +when defined(debug): + var H_RUSAGE_SELF{.importc, header:"= stop: + break + + dummy_cpt() + + # if elapsed >= global_poll_elapsed: + # let pollStart = wtime_usec() + # loadBalance(Weave) + # pollElapsed += wtime_usec() - pollStart + # global_poll_elapsed += PollInterval + + # c_printf("Elapsed: %.2lfus\n", elapsed) + +proc spc_consume_nopoll(usec: int32) = + + let start = wtime_usec() + let stop = usec.float64 + + while true: + var elapsed = wtime_usec() - start + if elapsed >= stop: + break + + dummy_cpt() + + # c_printf("Elapsed: %.2lfus\n", elapsed) + +proc spc_produce(n: int32) = + for i in 0 ..< n: + tp.spawn spc_consume(TaskGranularity) + +proc spc_produce_seq(n: int32) = + for i in 0 ..< n: + spc_consume_nopoll(TaskGranularity) + +proc main() = + NumTasksTotal = 1000000 + TaskGranularity = 10 + PollInterval = 10 + + if paramCount() == 0: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " & + &" " & + &"[polling interval (us): task granularity]" + echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}" + if paramCount() >= 1: + NumTasksTotal = paramStr(1).parseInt.int32 + if paramCount() >= 2: + TaskGranularity = paramStr(2). 
parseInt.int32 + if paramCount() == 3: + PollInterval = paramStr(3).parseInt.float64 + else: + PollInterval = TaskGranularity.float64 + if paramCount() > 3: + let exeName = getAppFilename().extractFilename() + echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " & + &" " & + &"[polling interval (us): task granularity]" + quit 1 + + var nthreads: int + if existsEnv"CTT_NUM_THREADS": + nthreads = getEnv"CTT_NUM_THREADS".parseInt() + else: + nthreads = countProcessors() + + tp = Threadpool.new(numThreads = nthreads) + + # measure overhead during tasking + var ru: Rusage + getrusage(RusageSelf, ru) + var + rss = ru.ru_maxrss + flt = ru.ru_minflt + + let start = wtime_msec() + + # spc_produce_seq(NumTasksTotal) + spc_produce(NumTasksTotal) + tp.syncAll() + + let stop = wtime_msec() + + getrusage(RusageSelf, ru) + rss = ru.ru_maxrss - rss + flt = ru.ru_minflt - flt + + tp.shutdown() + + echo "--------------------------------------------------------------------------" + echo "Scheduler: Constantine's Threadpool" + echo "Benchmark: SPC (Single task Producer - multi Consumer)" + echo "Threads: ", nthreads + echo "Time(ms) ", round(stop - start, 3) + echo "Max RSS (KB): ", ru.ru_maxrss + echo "Runtime RSS (KB): ", rss + echo "# of page faults: ", flt + echo "--------------------------------------------------------------------------" + echo "# of tasks: ", NumTasksTotal + echo "Task granularity (us): ", TaskGranularity + echo "Polling / manual load balancing interval (us): ", PollInterval + echo "--------------------------------------------------------------------------" + + quit 0 + +main() diff --git a/constantine/platforms/threadpool/benchmarks/wtime.h b/constantine/platforms/threadpool/benchmarks/wtime.h new file mode 100644 index 0000000..b645135 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/wtime.h @@ -0,0 +1,53 @@ +#ifndef WTIME_H +#define WTIME_H + +#include +#include + +// Number of seconds since the Epoch +static inline double Wtime_sec(void) +{ 
+ struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + tv.tv_usec / 1e6; +} + +// Number of milliseconds since the Epoch +static inline double Wtime_msec(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1e3 + tv.tv_usec / 1e3; +} + +// Number of microseconds since the Epoch +static inline double Wtime_usec(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1e6 + tv.tv_usec; +} + +// Read time stamp counter on x86 +static inline unsigned long long readtsc(void) +{ + unsigned int lo, hi; + // RDTSC copies contents of 64-bit TSC into EDX:EAX + asm volatile ("rdtsc" : "=a" (lo), "=d" (hi)); + return (unsigned long long)hi << 32 | lo; +} + +#define WTIME_unique_var_name_paste(id, n) id ## n +#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n) +#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__) + +// Convenience macro for time measurement +#define WTIME(unit) \ + double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \ + int WTIME_unique_var(_i_) = 0; \ + for (; WTIME_unique_var(_i_) == 0 || \ + (printf("Elapsed wall time: %.2lf "#unit"\n", \ + Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \ + WTIME_unique_var(_i_)++) + +#endif // WTIME_H diff --git a/constantine/platforms/threadpool/benchmarks/wtime.nim b/constantine/platforms/threadpool/benchmarks/wtime.nim new file mode 100644 index 0000000..420a198 --- /dev/null +++ b/constantine/platforms/threadpool/benchmarks/wtime.nim @@ -0,0 +1,10 @@ + +import strutils, os + +const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0] +const cHeader = csourcesPath / "wtime.h" + +{.passC: "-I" & cSourcesPath .} + +proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.} +proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.} diff --git a/constantine/platforms/threadpool/crossthread/backoff.nim b/constantine/platforms/threadpool/crossthread/backoff.nim new file mode 100644 index 
0000000..a1fe984 --- /dev/null +++ b/constantine/platforms/threadpool/crossthread/backoff.nim @@ -0,0 +1,186 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + std/atomics, + ../primitives/futexes + +# We implement 2 datastructures to put threads to sleep: +# 1. An event notifier to put an awaiting thread to sleep when the task they require is worked on by another thread +# 2. An eventcount to put an idle thread to sleep + +{.push raises:[], checks:off.} + +# ############################################################ +# +# Event Notifier +# +# ############################################################ + +# Formal verification at: https://github.com/mratsim/weave/blob/7682784/formal_verification/event_notifiers.tla#L76-L109 + +type + EventNotifier* = object + ## Multi Producers, Single Consumer event notification + ## This is can be seen as a wait-free condition variable for producers + ## that avoids them spending time in expensive kernel land due to mutexes. 
+ # ---- Consumer specific ---- + ticket{.align: 64.}: uint8 # A ticket for the consumer to sleep in a phase + # ---- Contention ---- no real need for padding as cache line should be reloaded in case of contention anyway + futex: Futex # A Futex (atomic int32 that can put thread to sleep) + phase: Atomic[uint8] # A binary timestamp, toggles between 0 and 1 (but there is no atomic "not") + signaled: Atomic[bool] # Signaling condition + +func initialize*(en: var EventNotifier) {.inline.} = + en.futex.initialize() + en.ticket = 0 + en.phase.store(0, moRelaxed) + en.signaled.store(false, moRelaxed) + +func `=destroy`*(en: var EventNotifier) {.inline.} = + en.futex.teardown() + +func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".} +func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".} + +func prepareToPark*(en: var EventNotifier) {.inline.} = + ## The consumer intends to sleep soon. + ## This must be called before the formal notification + ## via a channel. + if not en.signaled.load(moRelaxed): + en.ticket = en.phase.load(moRelaxed) + +proc park*(en: var EventNotifier) {.inline.} = + ## Wait until we are signaled of an event + ## Thread is parked and does not consume CPU resources + ## This may wakeup spuriously. 
+ if not en.signaled.load(moRelaxed): + if en.ticket == en.phase.load(moRelaxed): + en.futex.wait(0) + en.signaled.store(false, moRelaxed) + en.futex.initialize() + +proc notify*(en: var EventNotifier) {.inline.} = + ## Signal a thread that it can be unparked + + if en.signaled.load(moRelaxed): + # Another producer is signaling + return + en.signaled.store(true, moRelease) + discard en.phase.fetchXor(1, moRelaxed) + en.futex.store(1, moRelease) + en.futex.wake() + +# ############################################################ +# +# Eventcount +# +# ############################################################ + +type + Eventcount* = object + ## The lock-free equivalent of a condition variable. + ## + ## Usage, if a thread needs to be parked until a condition is true + ## and signaled by another thread: + ## ```Nim + ## if condition: + ## return + ## + ## while true: + ## ticket = ec.sleepy() + ## if condition: + ## ec.cancelSleep() + ## break + ## else: + ## ec.sleep() + ## ``` + + state: Atomic[uint32] + # State is actually the equivalent of a bitfield + # type State = object + # waiters {.bitsize: 16.}: uint16 + # when sizeof(pointer) == 4: + # epoch {.bitsize: 16.}: uint16 + # else: + # epoch {.bitsize: 48.}: uint48 + # + # of size, the native integer size + # and so can be used for atomic operations on 32-bit or 64-bit platforms. + # but there is no native fetchAdd for bitfield + futex: Futex + # Technically we could use the futex as the state. + # When you wait on a Futex, it waits only if the value of the futex + # matches with a reference value. + # But our reference value will be the epoch of notifications + # and it is non-trivial to zero-out the waiters bits. + # - One way could be to split a 64-bit number in 2 + # and cast the epoch part to Futex but that would only work on 64-bit CPU. 
+ # - Another more hacky way would be to pad with a zero-out uint16 before and after the Futex + # and depending on big or little endian provide a shifted address as Futex. + + ParkingTicket* = object + epoch: uint32 + +const # bitfield + # Low 16 bits are waiters, up to 2¹⁶ = 65536 threads are supported + # High 16 or 48 bits are epochs. + # We can deal with the ABA problem o: + # - up to 65536 wake requests on 32-bit + # - up to 281 474 976 710 656 wake requests on 64-bit + # Epoch rolling over to 0 are not a problem, they won't change the low 16 bits + kEpochShift = 16 + kAddEpoch = 1 shl kEpochShift + kWaiterMask = kAddEpoch - 1 + kEpochMask {.used.} = not kWaiterMask + kAddWaiter = 1 + kSubWaiter = 1 + +func initialize*(ec: var EventCount) {.inline.} = + ec.state.store(0, moRelaxed) + ec.futex.initialize() + +func `=destroy`*(ec: var EventCount) {.inline.} = + ec.futex.teardown() + +proc sleepy*(ec: var Eventcount): ParkingTicket {.inline.} = + ## To be called before checking if the condition to not sleep is met. + ## Returns a ticket to be used when committing to sleep + let prevState = ec.state.fetchAdd(kAddWaiter, moAcquireRelease) + result.epoch = prevState shr kEpochShift + +proc sleep*(ec: var Eventcount, ticket: ParkingTicket) {.inline.} = + ## Put a thread to sleep until notified. + ## If the ticket becomes invalid (a notfication has been received) + ## by the time sleep is called, the thread won't enter sleep + while ec.state.load(moAcquire) shr kEpochShift == ticket.epoch: + ec.futex.wait(ticket.epoch) # We don't use the futex internal value + + let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed) + +proc cancelSleep*(ec: var Eventcount) {.inline.} = + ## Cancel a sleep that was scheduled. 
+ let prev {.used.} = ec.state.fetchSub(kSubWaiter, moRelaxed) + +proc wake*(ec: var EventCount) {.inline.} = + ## Wake a thread if at least 1 is parked + let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease) + if (prev and kWaiterMask) != 0: + ec.futex.wake() + +proc wakeAll*(ec: var EventCount) {.inline.} = + ## Wake all threads if at least 1 is parked + let prev = ec.state.fetchAdd(kAddEpoch, moAcquireRelease) + if (prev and kWaiterMask) != 0: + ec.futex.wakeAll() + +proc getNumWaiters*(ec: var EventCount): uint32 {.inline.} = + ## Get the number of parked threads + ec.state.load(moRelaxed) and kWaiterMask + +{.pop.} # {.push raises:[], checks:off.} \ No newline at end of file diff --git a/constantine/platforms/threadpool/crossthread/taskqueues.nim b/constantine/platforms/threadpool/crossthread/taskqueues.nim new file mode 100644 index 0000000..f10a3eb --- /dev/null +++ b/constantine/platforms/threadpool/crossthread/taskqueues.nim @@ -0,0 +1,286 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# This file implements a single-producer multi-consumer +# task queue for work-stealing schedulers. 
+# +# Papers: +# - Dynamic Circular Work-Stealing Deque +# David Chase, Yossi Lev, 1993 +# https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf +# +# - Non-Blocking Steal-Half Work Queues +# Danny Hendler, Nir Shavit, 2002 +# https://www.cs.bgu.ac.il/~hendlerd/papers/p280-hendler.pdf +# +# - Correct and Efficient Work-Stealing for Weak Memory Models +# Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013 +# https://fzn.fr/readings/ppopp13.pdf +# +# The task queue implements the following push, pop, steal, stealHalf +# +# front back +# --------------------------------- +# steal() <- | | | | <- push() +# stealHalf() <- | Task 0 | Task 1 | Task 2 | -> pop() +# any thread | | | | owner-only +# --------------------------------- +# +# To reduce contention, stealing is done on the opposite end from push/pop +# so that there is a race only for the very last task(s). + +{.push raises: [], checks: off.} # No exceptions in a multithreading datastructure + +import + std/atomics, + ../instrumentation, + ../../allocs, + ./tasks_flowvars + +type + Buf = object + ## Backend buffer of a Taskqueue + ## `capacity` MUST be a power of 2 + prevRetired: ptr Buf # intrusive linked list. Used for garbage collection + + capacity: int + rawBuffer: UncheckedArray[Atomic[ptr Task]] + + Taskqueue* = object + ## This implements a lock-free, growable, work-stealing task queue. + ## The owning thread enqueues and dequeues at the back + ## Foreign threads steal at the front. + ## + ## There is no memory reclamation scheme for simplicity + front {.align: 64.}: Atomic[int] # Consumers - steal/stealHalf + back: Atomic[int] # Producer - push/pop + buf: Atomic[ptr Buf] + garbage: ptr Buf + +proc peek*(tq: var Taskqueue): int = + ## Estimates the number of items pending in the channel + ## In a SPMC setting + ## - If called by the producer the true number might be less + ## due to consumers removing items concurrently. 
+ ## - If called by a consumer the true number is undefined + ## as other consumers also remove items concurrently and + ## the producer removes them concurrently. + ## + ## If the producer peeks and the Chase-Lev Deque returns 0, + ## the queue is empty. + ## + ## This is a non-locking operation. + let # Handle race conditions + b = tq.back.load(moRelaxed) + f = tq.front.load(moRelaxed) + + if b >= f: + return b-f + else: + return 0 + +func isPowerOfTwo(n: int): bool {.used, inline.} = + (n and (n - 1)) == 0 and (n != 0) + +proc newBuf(capacity: int): ptr Buf = + # Tasks have a destructor + # static: + # doAssert supportsCopyMem(T), $T & " must be a (POD) plain-old-data type: no seq, string, ref." + + preCondition: capacity.isPowerOfTwo() + + result = allocHeapUnchecked(Buf, 1*sizeof(pointer) + 2*sizeof(int) + sizeof(pointer)*capacity) + + result.prevRetired = nil + result.capacity = capacity + result.rawBuffer.addr.zeroMem(sizeof(pointer)*capacity) + +proc `[]=`(buf: var Buf, index: int, item: ptr Task) {.inline.} = + buf.rawBuffer[index and (buf.capacity-1)].store(item, moRelaxed) + +proc `[]`(buf: var Buf, index: int): ptr Task {.inline.} = + result = buf.rawBuffer[index and (buf.capacity-1)].load(moRelaxed) + +proc grow(tq: var Taskqueue, buf: var ptr Buf, newCapacity, front, back: int) {.inline.} = + ## Double the buffer size + ## back is the last item index + ## + ## To handle race-conditions the current "front", "back" and "buf" + ## have to be saved before calling this procedure. 
+ ## It reads and writes the "tq.buf" and "tq.garbage" + + # Read -> Copy -> Update + var tmp = newBuf(newCapacity) + for i in front ..< back: + tmp[][i] = buf[][i] + + buf.prevRetired = tq.garbage + tq.garbage = buf + # publish globally + tq.buf.store(tmp, moRelaxed) + # publish locally + swap(buf, tmp) + +proc garbageCollect(tq: var Taskqueue) {.inline.} = + var node = tq.garbage + while node != nil: + let tmp = node.prevRetired + freeHeap(node) + node = tmp + tq.garbage = nil + +# Public API +# --------------------------------------------------- + +proc init*(tq: var Taskqueue, initialCapacity: int) = + zeroMem(tq.addr, tq.sizeof()) + tq.buf.store(newBuf(initialCapacity), moRelaxed) + +proc teardown*(tq: var Taskqueue) = + tq.garbageCollect() + freeHeap(tq.buf.load(moRelaxed)) + +proc push*(tq: var Taskqueue, item: ptr Task) = + ## Enqueue an item at the back + ## As the task queue takes ownership of it. The item must not be used afterwards. + ## This is intended for the producer only. + + let # Handle race conditions + b = tq.back.load(moRelaxed) + f = tq.front.load(moAcquire) + var buf = tq.buf.load(moRelaxed) + + if b-f > buf.capacity - 1: + # Full queue + tq.grow(buf, buf.capacity*2, f, b) + + if not tq.garbage.isNil and f == b: + # Empty queue, no thieves can have a pointer to an old retired buffer + tq.garbageCollect() + + buf[][b] = item + fence(moRelease) + tq.back.store(b+1, moRelaxed) + +proc pop*(tq: var Taskqueue): ptr Task = + ## Dequeue an item at the back. Takes ownership of the item + ## This is intended for the producer only. + + let # Handle race conditions + b = tq.back.load(moRelaxed) - 1 + buf = tq.buf.load(moRelaxed) + + tq.back.store(b, moRelaxed) + fence(moSequentiallyConsistent) + var f = tq.front.load(moRelaxed) + + if f <= b: + # Non-empty queue. + result = buf[][b] + if f == b: + # Single last element in queue. + if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed): + # Failed race. 
+ result = nil + tq.back.store(b+1, moRelaxed) + if not tq.garbage.isNil: + # Empty queue, no thieves can have a pointer to an old retired buffer + tq.garbageCollect() + else: + # Empty queue. + result = nil + tq.back.store(b+1, moRelaxed) + if not tq.garbage.isNil: + # Empty queue, no thieves can have a pointer to an old retired buffer + tq.garbageCollect() + +proc steal*(thiefID: uint32, tq: var Taskqueue): ptr Task = + ## Dequeue an item at the front. Takes ownership of the item + ## This is intended for consumers. + var f = tq.front.load(moAcquire) + fence(moSequentiallyConsistent) + let b = tq.back.load(moAcquire) + result = nil + + if f < b: + # Non-empty queue. + let a = tq.buf.load(moConsume) + result = a[][f] + if not compareExchange(tq.front, f, f+1, moSequentiallyConsistent, moRelaxed): + # Failed race. + return nil + result.thiefID.store(thiefID, moRelease) + +proc stealHalfImpl(dst: var Buf, dstBack: int, src: var Taskqueue): int = + ## Theft part of stealHalf: + ## - updates the victim metadata if successful + ## - uncommitted updates to the thief tq whether successful or not + ## Returns -1 if dst buffer is too small + ## Assumes `dst` buffer is empty (i.e. not ahead-of-time thefts) + + while true: + # Try as long as there are something to steal, we are idling anyway. + + var f = src.front.load(moAcquire) + fence(moSequentiallyConsistent) + let b = src.back.load(moAcquire) + var n = b-f + n = n - (n shr 1) # Division by 2 rounded up, so if only one task is left, we still get it. + + if n <= 0: + return 0 + if n > dst.capacity: + return -1 + + # Non-empty queue. + let sBuf = src.buf.load(moConsume) + for i in 0 ..< n: # Copy LIFO or FIFO? + dst[dstBack+i] = sBuf[][f+i] + if compareExchange(src.front, f, f+n, moSequentiallyConsistent, moRelaxed): + return n + +proc stealHalf*(thiefID: uint32, dst: var Taskqueue, src: var Taskqueue): ptr Task = + ## Dequeue up to half of the items in the `src` tq, fom the front. 
+ ## Return the last of those, or nil if none found + + while true: + # Prepare for batch steal + let + bDst = dst.back.load(moRelaxed) + fDst = dst.front.load(moAcquire) + var dBuf = dst.buf.load(moAcquire) + let sBuf = src.buf.load(moAcquire) + + if dBuf.capacity < sBuf.capacity: + # We could grow to sBuf/2 since we steal half, but we want to minimize + # churn if we are actually in the process of stealing and the buffers grows. + dst.grow(dBuf, sBuf.capacity, fDst, bDst) + + # Steal + let n = dBuf[].stealHalfImpl(bDst, src) + + if n == 0: + return nil + if n == -1: + # Oops, victim buffer grew bigger than ours, restart the whole process + continue + + # Update metadata + for i in 0 ..< n: + dBuf[][bDst+i].thiefID.store(thiefID, moRelease) + + # Commit/publish theft, return the first item for processing + let last = dBuf[][bDst+n-1] + fence(moSequentiallyConsistent) + if n == 1: + return last + + # We have more than one item, so some must go in our queue + # they are already here but inaccessible for other thieves + dst.back.store(bDst+n-1, moRelease) # We assume that queue was empty and so dst.front didn't change + return last diff --git a/constantine/platforms/threadpool/crossthread/tasks_flowvars.nim b/constantine/platforms/threadpool/crossthread/tasks_flowvars.nim new file mode 100644 index 0000000..468ed56 --- /dev/null +++ b/constantine/platforms/threadpool/crossthread/tasks_flowvars.nim @@ -0,0 +1,127 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + std/atomics, + ../instrumentation, + ../../allocs, + ./backoff + +# Tasks have an efficient design so that a single heap allocation +# is required per `spawn`. +# This greatly reduce overhead and potential memory fragmentation for long-running applications. +# +# This is done by tasks: +# - being an intrusive linked lists +# - integrating the channel to send results +# +# Flowvar is the public type created when spawning a task. +# and can be synced to receive the task result. +# Flowvars are also called future interchangeably. +# (The name future is already used for IO scheduling) + +type + Task* = object + # Intrusive metadata + # ------------------ + parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task + + thiefID*: Atomic[uint32] # ID of the worker that stole and run the task. For leapfrogging. + + # Result sync + # ------------------ + hasFuture*: bool # Ownership: if a task has a future, the future deallocates it. Otherwise the worker thread does. 
+ completed*: Atomic[bool] + waiter*: Atomic[ptr EventNotifier] + + # Execution + # ------------------ + fn*: proc (param: pointer) {.nimcall, gcsafe.} + # destroy*: proc (param: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data + data*{.align:sizeof(int).}: UncheckedArray[byte] + + Flowvar*[T] = object + task: ptr Task + +const SentinelThief* = 0xFACADE'u32 + +proc new*( + T: typedesc[Task], + parent: ptr Task, + fn: proc (param: pointer) {.nimcall, gcsafe.}): ptr Task {.inline.} = + + const size = sizeof(T) + + result = allocHeapUnchecked(T, size) + result.parent = parent + result.thiefID.store(SentinelThief, moRelaxed) + result.completed.store(false, moRelaxed) + result.waiter.store(nil, moRelaxed) + result.fn = fn + +proc new*( + T: typedesc[Task], + parent: ptr Task, + fn: proc (param: pointer) {.nimcall, gcsafe.}, + params: auto): ptr Task {.inline.} = + + const size = sizeof(T) + # size without Unchecked + sizeof(params) + + result = allocHeapUnchecked(T, size) + result.parent = parent + result.thiefID.store(SentinelThief, moRelaxed) + result.completed.store(false, moRelaxed) + result.waiter.store(nil, moRelaxed) + result.fn = fn + cast[ptr[type params]](result.data)[] = params + +# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".} + +proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} = + result.task = task + result.task.hasFuture = true + + # Task with future references themselves so that readyWith can be called + # within the constructed + # proc async_fn(param: pointer) {.nimcall.} + # that can only access data + cast[ptr ptr Task](task.data.addr)[] = task + +proc cleanup*(fv: var Flowvar) {.inline.} = + fv.task.freeHeap() + fv.task = nil + +func isSpawned*(fv: Flowvar): bool {.inline.} = + ## Returns true if a flowvar is spawned + ## This may be useful for recursive algorithms that + ## may or may not spawn a flowvar depending on a condition. 
+ ## This is similar to Option or Maybe types + return not fv.task.isNil + +func isReady*[T](fv: Flowvar[T]): bool {.inline.} = + ## Returns true if the result of a Flowvar is ready. + ## In that case `sync` will not block. + ## Otherwise the current will block to help on all the pending tasks + ## until the Flowvar is ready. + fv.task.completed.load(moAcquire) + +func readyWith*[T](task: ptr Task, childResult: T) {.inline.} = + ## Send the Flowvar result from the child thread processing the task + ## to its parent thread. + precondition: not task.completed.load(moAcquire) + cast[ptr (ptr Task, T)](task.data.addr)[1] = childResult + task.completed.store(true, moRelease) + +proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} = + ## Blocks the current thread until the flowvar is available + ## and returned. + ## The thread is not idle and will complete pending tasks. + mixin completeFuture + completeFuture(fv, result) + cleanup(fv) diff --git a/constantine/platforms/threadpool/docs/partitioners.md b/constantine/platforms/threadpool/docs/partitioners.md new file mode 100644 index 0000000..f31efc9 --- /dev/null +++ b/constantine/platforms/threadpool/docs/partitioners.md @@ -0,0 +1,166 @@ +# Partitioners + +For data parallelism (parallel for loops) there are 2 main scheduling strategies: +- static scheduling, when work is regular (for example adding 2 matrices). + In that case, splitting the loop + into same-sized chunk provides perfect speedup, with no synchronization overhead. + (Assuming threads have the same performance and no parasite load) +- dynamic scheduling, when work is irregular (for example zero-ing buffers of different length). + +Partitioners help implementing static scheduling. 
+ +Static scheduling +================= + +Usually static scheduling is problematic because the threshold below which running in parallel +is both hardware dependent, data layout and function dependent, see https://github.com/zy97140/omp-benchmark-for-pytorch + +| CPU Model | Sockets | Cores/Socket | Frequency | +|------------------------------------|---------|--------------|-----------| +| Intel(R) Xeon(R) CPU E5-2699 v4 | 2 | 22 | 2.20GHz | +| Intel(R) Xeon(R) Platinum 8180 CPU | 2 | 28 | 2.50GHz | +| Intel(R) Core(TM) i7-5960X CPU | 1 | 8 | 3.00GHz | + +| contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU | +|---------------|---------------------------|------------------------|--------------| +| copy | 80k | 20k | 8k | +| add | 80k | 20k | 8k | +| div | 50k | 10k | 2k | +| exp | 1k | 1k | 1k | +| sin | 1k | 1k | 1k | +| sum | 1k | 1k | 1k | +| prod | 1k | 1k | 1k | + +| non-contiguous op | Xeon(R) Platinum 8180 CPU | Xeon(R) CPU E5-2699 v4 | i7-5960X CPU | +|-------------------|---------------------------|------------------------|--------------| +| copy | 20k | 8k | 2k | +| add | 20k | 8k | 2k | +| div | 10k | 8k | 1k | +| exp | 1k | 1k | 1k | +| sin | 2k | 2k | 1k | +| sum | 1k | 1k | 1k | +| prod | 1k | 1k | 1k | + + +# Static partitioner +==================== + +```Nim +iterator balancedChunks*(start, stopEx, numChunks: int): tuple[chunkID, start, stopEx: int] = + ## Balanced chunking algorithm for a range [start, stopEx) + ## This splits a range into min(stopEx-start, numChunks) balanced regions + ## and returns a tuple (chunkID, offset, length) + + # Rationale + # The following simple chunking scheme can lead to severe load imbalance + # + # let chunk_offset = chunk_size * thread_id + # let chunk_size = if thread_id < nb_chunks - 1: chunk_size + # else: omp_size - chunk_offset + # + # For example dividing 40 items on 12 threads will lead to + # a base_chunk_size of 40/12 = 3 so work on the first 11 threads + # will be 3 * 11 = 
33, and the remainder 7 on the last thread. + # + # Instead of dividing 40 work items on 12 cores into: + # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40 + # the following scheme will divide into + # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40 + # + # This is compliant with OpenMP spec (page 60) + # http://www.openmp.org/mp-documents/openmp-4.5.pdf + # "When no chunk_size is specified, the iteration space is divided into chunks + # that are approximately equal in size, and at most one chunk is distributed to + # each thread. The size of the chunks is unspecified in this case." + # ---> chunks are the same ±1 + + let + totalIters = stopEx - start + baseChunkSize = totalIters div numChunks + cutoff = totalIters mod numChunks + + for chunkID in 0 ..< min(numChunks, totalIters): + if chunkID < cutoff: + let offset = start + ((baseChunkSize + 1) * chunkID) + let chunkSize = baseChunkSize + 1 + yield (chunkID, offset, offset+chunkSize) + else: + let offset = start + (baseChunkSize * chunkID) + cutoff + let chunkSize = baseChunkSize + yield (chunkID, offset, chunkSize) + +when isMainModule: + for chunk in balancedChunks(start = 0, stopEx = 40, numChunks = 12): + echo chunk +``` + +Dynamic scheduling +================== + +Dynamic schedulers decide at runtime whether to split a range for processing by multiple threads at runtime. +Unfortunately most (all?) of those in production do not or cannot take into account +the actual functions being called within a `parallel_for` section, +and might split into too fine-grained chunks or into too coarse-grained chunks. +Alternatively they might ask the programmer for a threshold below which not to split. +As the programmer has no way to know if their code will run on a Raspberry Pi or a powerful workstation, that choice cannot be optimal. 
+ +Recent advances in research like "Lazy Binary Splitting" and "Lazy Tree Splitting" +allows dynamic scheduling to fully adapt to the system current load and +the parallel section computational needs without programmer input (grain size or minimal split threshold) +by using backpressure. + + +Why do we need partitioners for cryptographic workload then? +============================================================ + + +Here are 3 basic cryptographic primitives that are non-trivial to parallelize: + +1. batch elliptic-curve addition +2. multi-scalar multiplication (MSM) +3. batch signature verification via multi-pairing + +On the other hand, merkle tree hashing for example is a primitive where both static or dynamic scheduling should give perfect speedup. + +Let's take the example of batch EC addition. +-------------------------------------------- + +There is a naive way, via doing a parallel EC sum reduction, +for example for 1M points, +using projective coordinates, each sum costs 12M (field multiplication) +At first level we have 500k sums, then 250k sums, then 125k, ... +The number of sums is ~1M, for a total cost of 12e⁶ + +``` + 0 1 2 3 4 5 6 7 + + + + + + + + + + +``` + +The fast way uses affine sum for an asymptotic cost of 6M, so 2x faster. +but this requires an inversion, a fixed cost of ~131M (256-bit) to ~96M (384-bit) +whatever number of additions we accumulate +That inversion needs to be amortized into at least 20-25 additions. + +Let's take a look at multi-pairing +---------------------------------- + +Multi-pairing is split into 2 phases: +- 1. the Miller-Loop which is embarassing parallel, each thread can compute it on their own. +- 2. reducing the n Miller-Loops into a single 𝔽pₖ point using parallel 𝔽pₖ sum reduction. +- 3. Computing the final exponentiation, a fixed cost whatever the number of Miller Loops we did. + 3alt. 
Alternatively, computing n final exponentiations, and merging them with a 𝔽pₖ product reduction + in that case, step 2 is not needed. + +Conclusion +---------- + +Dynamic scheduling for reduction with variable + fixed costs (independent of chunk size) is tricky. +Furthermore the computations are regular, same workload per range and static scheduling seems like a great fit. + +The downside is if a core has a parasite workload or on architecture like ARM big.Little or Alder Lake +with performance and power-saving cores. + +Alternatively, we can add dynamic scheduling hints about min and max chunk size so that +chunk size is kept within the optimal range whatever the number of idle threads. \ No newline at end of file diff --git a/constantine/platforms/threadpool/docs/random_permutations.md b/constantine/platforms/threadpool/docs/random_permutations.md new file mode 100644 index 0000000..ea40e85 --- /dev/null +++ b/constantine/platforms/threadpool/docs/random_permutations.md @@ -0,0 +1,42 @@ +# Random permutations + +Work-stealing is more efficient when the thread we steal from is randomized. +If all threads steal in the same order, we increase contention +on the start victims task queues. + +The randomness quality is not important besides distributing potential contention, +i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough. + +Hence for efficiency, so that a thread can go to sleep faster, we want to +reduce calls to to the RNG as: +- Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement) +- a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls. +- If we want unbiaised thread ID generation in a range, rejection sampling is costly (not a requirement). 
+ +Instead of using Fisher-Yates + - generates the victim set eagerly, inefficient if the first steal attempts are successful + - needs a RNG call when sampling a victim + - memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32 + +or a sparseset + - 1 RNG call when sampling a victim + - memory usage: 2\*numThreads per thread so 2\*numthreads² uint8 (255 threads max) or uint32 + +we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m) +If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m) +with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m + +References: +- Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle +- Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484 + https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim + https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim +- Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator + +And if we want cryptographic strength: +- Ciphers with Arbitrary Finite Domains + John Black and Phillip Rogaway, 2001 + https://eprint.iacr.org/2001/012 +- An Enciphering Scheme Based on a Card Shuffle + Viet Tung Hoang, Ben Morris, Phillip Rogaway + https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf \ No newline at end of file diff --git a/constantine/platforms/threadpool/examples/e01_simple_tasks.nim b/constantine/platforms/threadpool/examples/e01_simple_tasks.nim new file mode 100644 index 0000000..fa91933 --- /dev/null +++ b/constantine/platforms/threadpool/examples/e01_simple_tasks.nim @@ -0,0 +1,48 @@ +import ../threadpool + +block: # Async without result + + proc displayInt(x: int) = + stdout.write(x) + stdout.write(" - SUCCESS\n") + + proc main() = + echo 
"\n==============================================================================================" + echo "Running 'threadpool/examples/e01_simple_tasks.nim'" + echo "==============================================================================================" + + echo "\nSanity check 1: Printing 123456 654321 in parallel" + + var tp = Threadpool.new(numThreads = 4) + tp.spawn displayInt(123456) + tp.spawn displayInt(654321) + tp.shutdown() + + main() + +block: # Async/Await + var tp: Threadpool + + proc asyncFib(n: int): int = + if n < 2: + return n + + let x = tp.spawn asyncFib(n-1) + let y = asyncFib(n-2) + + result = sync(x) + y + + proc main2() = + echo "\n==============================================================================================" + echo "Running 'threadpool/examples/e01_simple_tasks.nim'" + echo "==============================================================================================" + + echo "\nSanity check 2: fib(20)" + + tp = Threadpool.new() + let f = asyncFib(20) + tp.shutdown() + + doAssert f == 6765, "f was " & $f + + main2() diff --git a/constantine/platforms/threadpool/examples/e02_parallel_pi.nim b/constantine/platforms/threadpool/examples/e02_parallel_pi.nim new file mode 100644 index 0000000..3407652 --- /dev/null +++ b/constantine/platforms/threadpool/examples/e02_parallel_pi.nim @@ -0,0 +1,38 @@ +# Demo of API using a very inefficient π approcimation algorithm. + +import + std/[strutils, math, cpuinfo], + ../threadpool + +# From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim +# Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80 +proc term(k: int): float = + if k mod 2 == 1: + -4'f / float(2*k + 1) + else: + 4'f / float(2*k + 1) + +proc piApprox(tp: Threadpool, n: int): float = + var pendingFuts = newSeq[Flowvar[float]](n) + for k in 0 ..< pendingFuts.len: + pendingFuts[k] = tp.spawn term(k) # Schedule a task on the threadpool a return a handle to retrieve the result. 
+ for k in 0 ..< pendingFuts.len: + result += sync pendingFuts[k] # Block until the result is available. + +proc main() = + + echo "\n==============================================================================================" + echo "Running 'threadpool/examples/e02_parallel_pi.nim'" + echo "==============================================================================================" + + var n = 1_000_000 + var nthreads = countProcessors() + + var tp = Threadpool.new(num_threads = nthreads) # Default to the number of hardware threads. + + echo formatFloat(tp.piApprox(n)) + + tp.shutdown() + +# Compile with nim c -r -d:release --threads:on --outdir:build example.nim +main() diff --git a/constantine/platforms/threadpool/instrumentation.nim b/constantine/platforms/threadpool/instrumentation.nim new file mode 100644 index 0000000..82c03d7 --- /dev/null +++ b/constantine/platforms/threadpool/instrumentation.nim @@ -0,0 +1,142 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import system/ansi_c + +# Loggers +# -------------------------------------------------------- + +template log*(args: varargs[untyped]): untyped = + c_printf(args) + flushFile(stdout) + +template debugTermination*(body: untyped): untyped = + when defined(TP_DebugTermination) or defined(TP_Debug): + {.noSideEffect, gcsafe.}: body + +template debug*(body: untyped): untyped = + when defined(TP_Debug): + {.noSideEffect, gcsafe.}: body + +# -------------------------------------------------------- + +import std/macros + +# A simple design-by-contract API +# -------------------------------------------------------- + +# Everything should be a template that doesn't produce any code +# when debugConstantine is not defined. +# Those checks are controlled by a custom flag instead of +# "--boundsChecks" or "--nilChecks" to decouple them from user code checks. +# Furthermore, we want them to be very lightweight on performance + +func toHex*(a: SomeInteger): string = + const hexChars = "0123456789abcdef" + const L = 2*sizeof(a) + result = newString(2 + L) + result[0] = '0' + result[1] = 'x' + var a = a + for j in countdown(result.len-1, 0): + result[j] = hexChars[a and 0xF] + a = a shr 4 + +proc inspectInfix(node: NimNode): NimNode = + ## Inspect an expression, + ## Returns the AST as string with runtime values inlined + ## from infix operators inlined. 
+ # TODO: pointer and custom type need a default repr + # otherwise we can only resulve simple expressions + proc inspect(node: NimNode): NimNode = + case node.kind: + of nnkInfix: + return newCall( + bindSym"&", + newCall( + bindSym"&", + newCall(ident"$", inspect(node[1])), + newLit(" " & $node[0] & " ") + ), + newCall(ident"$", inspect(node[2])) + ) + of {nnkIdent, nnkSym}: + return node + of nnkDotExpr: + return quote do: + when `node` is pointer or + `node` is ptr or + `node` is (proc): + toHex(cast[ByteAddress](`node`) and 0xffff_ffff) + else: + $(`node`) + of nnkPar: + result = nnkPar.newTree() + for sub in node: + result.add inspect(sub) + else: + return node.toStrLit() + return inspect(node) + +macro assertContract( + checkName: static string, + predicate: untyped) = + let lineinfo = lineInfoObj(predicate) + + var strippedPredicate: NimNode + if predicate.kind == nnkStmtList: + assert predicate.len == 1, "Only one-liner conditions are supported" + strippedPredicate = predicate[0] + else: + strippedPredicate = predicate + + let debug = "\n Contract violated for " & checkName & " at " & $lineinfo & + "\n " & $strippedPredicate.toStrLit & + "\n The following values are contrary to expectations:" & + "\n " + let values = inspectInfix(strippedPredicate) + let workerID = quote do: + when declared(workerContext): + $workerContext.id + else: + "N/A" + let threadpoolID = quote do: + when declared(workerContext): + cast[ByteAddress](workerContext.threadpool).toHex() + else: + "N/A" + + result = quote do: + {.noSideEffect.}: + when compileOption("assertions"): + assert(`predicate`, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n") + elif defined(TP_Asserts): + if unlikely(not(`predicate`)): + raise newException(AssertionError, `debug` & $`values` & " [Worker " & `workerID` & " on threadpool " & `threadpoolID` & "]\n") + + +template preCondition*(require: untyped) = + ## Optional runtime check before returning from a function 
+ assertContract("pre-condition", require) + +template postCondition*(ensure: untyped) = + ## Optional runtime check at the start of a function + assertContract("post-condition", ensure) + +template ascertain*(check: untyped) = + ## Optional runtime check in the middle of processing + assertContract("transient condition", check) + +# Sanity checks +# ---------------------------------------------------------------------------------- + +when isMainModule: + proc assertGreater(x, y: int) = + postcondition(x > y) + + # We should get a nicely formatted exception + assertGreater(10, 12) diff --git a/constantine/platforms/threadpool/parallel_offloading.nim b/constantine/platforms/threadpool/parallel_offloading.nim new file mode 100644 index 0000000..75b5ddb --- /dev/null +++ b/constantine/platforms/threadpool/parallel_offloading.nim @@ -0,0 +1,160 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + std/macros, + ./instrumentation, + ./crossthread/tasks_flowvars + +# Task parallelism - spawn +# --------------------------------------------- + +proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode = + # Create the async function + let fn = funcCall[0] + let fnName = $fn + let withArgs = args.len > 0 + let async_fn = ident("async_" & fnName) + var fnCall = newCall(fn) + let data = ident("data") # typed pointer to data + + # Schedule + let task = ident"task" + let scheduleBlock = newCall(schedule, workerContext, task) + + result = newStmtList() + + if funcCall.len == 2: + # With only 1 arg, the tuple syntax doesn't construct a tuple + # let data = (123) # is an int + fnCall.add nnkDerefExpr.newTree(data) + else: # This handles the 0 arg case as well + for i in 1 ..< funcCall.len: + fnCall.add nnkBracketExpr.newTree( + data, + newLit i-1 + ) + + # Create the async call + result.add quote do: + proc `async_fn`(param: pointer) {.nimcall.} = + # preCondition: not isRootTask(`workerContext`.currentTask) + + when bool(`withArgs`): + let `data` = cast[ptr `argsTy`](param) + `fnCall` + + # Create the task + result.add quote do: + block enq_deq_task: + when bool(`withArgs`): + let `task` = Task.new( + parent = `workerContext`.currentTask, + fn = `async_fn`, + params = `args`) + else: + let `task` = Task.new( + parent = `workerContext`.currentTask, + fn = `async_fn`) + `scheduleBlock` + +proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode = + # Create the async function + let fn = funcCall[0] + let fnName = $fn + let async_fn = ident("async_" & fnName) + var fnCall = newCall(fn) + let data = ident("data") # typed pointer to data + + # Schedule + let task = ident"task" + let scheduleBlock = newCall(schedule, workerContext, task) + + result = newStmtList() + + # tasks have no return value. + # 1. 
The start of the task `data` buffer will store the return value for the flowvar and awaiter/sync + # 2. We create a wrapper async_fn without return value that send the return value in the channel + # 3. We package that wrapper function in a task + + # We store the following in task.data: + # + # | ptr Task | result | arg₀ | arg₁ | ... | argₙ + let fut = ident"fut" + let taskSelfReference = ident"taskSelfReference" + let retVal = ident"retVal" + + var futArgs = nnkPar.newTree + var futArgsTy = nnkPar.newTree + futArgs.add taskSelfReference + futArgsTy.add nnkPtrTy.newTree(bindSym"Task") + futArgs.add retVal + futArgsTy.add retTy + + for i in 1 ..< funcCall.len: + futArgsTy.add getTypeInst(funcCall[i]) + futArgs.add funcCall[i] + + # data stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ + # so arguments starts at data[2] in the wrapping funcCall functions + for i in 1 ..< funcCall.len: + fnCall.add nnkBracketExpr.newTree( + data, + newLit i+1 + ) + + result.add quote do: + proc `async_fn`(param: pointer) {.nimcall.} = + # preCondition: not isRootTask(`workerContext`.currentTask) + + let `data` = cast[ptr `futArgsTy`](param) + let res = `fnCall` + readyWith(`data`[0], res) + + # Regenerate fresh ident, retTy has been tagged as a function call param + let retTy = ident($retTy) + + # Create the task + result.add quote do: + block enq_deq_task: + let `taskSelfReference` = cast[ptr Task](0xDEADBEEF) + let `retVal` = default(`retTy`) + + let `task` = Task.new( + parent = `workerContext`.currentTask, + fn = `async_fn`, + params = `futArgs`) + let `fut` = newFlowvar(`retTy`, `task`) + `scheduleBlock` + # Return the future + `fut` + +proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule: NimNode): NimNode = + funcCall.expectKind(nnkCall) + + # Get the return type if any + let retType = funcCall[0].getImpl[3][0] + let needFuture = retType.kind != nnkEmpty + + # Get a serialized type and data for all function arguments + # We use adhoc tuple + var 
argsTy = nnkPar.newTree() + var args = nnkPar.newTree() + for i in 1 ..< funcCall.len: + argsTy.add getTypeInst(funcCall[i]) + args.add funcCall[i] + + # Package in a task + if not needFuture: + result = spawnVoid(funcCall, args, argsTy, workerContext, schedule) + else: + result = spawnRet(funcCall, retType, args, argsTy, workerContext, schedule) + + # Wrap in a block for namespacing + result = nnkBlockStmt.newTree(newEmptyNode(), result) + # echo result.toStrLit \ No newline at end of file diff --git a/constantine/platforms/threadpool/primitives/barriers.md b/constantine/platforms/threadpool/primitives/barriers.md new file mode 100644 index 0000000..1cf679f --- /dev/null +++ b/constantine/platforms/threadpool/primitives/barriers.md @@ -0,0 +1,53 @@ +# Synchronization Barriers + +OSX does not implement pthread_barrier as its an optional part +of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch. + +So we need to roll our own with a POSIX compatible API. + +## Glibc barriers, design bug and implementation + +> Note: due to GPL licensing, do not lift the code. +> Not that we can as it is heavily dependent on futexes +> which are not available on OSX + +We need to make sure that we don't hit the same bug +as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065 +which seems to be an issue in some of the barrier implementations +in the wild. 
+ +The design of Glibc barriers is here: +https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD + +And implementation: +- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD +- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD` +- https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD + +## Synchronization barrier techniques + +This article goes over the techniques of +"pool barrier" and "ticket barrier" +https://locklessinc.com/articles/barriers/ +to reach 2x to 20x the speed of pthreads barrier + +This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf +goes over +- centralized barrier with sense reversal +- combining tree barrier +- dissemination barrier +- tournament barrier +- scalable tree barrier +More courses: +- http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf + +It however requires lightweight mutexes like Linux futexes +that OSX lacks. 
+ +This post goes over lightweight mutexes like Benaphores (from BeOS) +https://preshing.com/20120226/roll-your-own-lightweight-mutex/ + +This gives a few barrier implementations +http://gallium.inria.fr/~maranget/MPRI/02.pdf +and refers to Cubible paper for formally verifying synchronization barriers +http://cubicle.lri.fr/papers/jfla2014.pdf (in French) diff --git a/constantine/platforms/threadpool/primitives/barriers.nim b/constantine/platforms/threadpool/primitives/barriers.nim new file mode 100644 index 0000000..bf5663e --- /dev/null +++ b/constantine/platforms/threadpool/primitives/barriers.nim @@ -0,0 +1,71 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +{.push raises: [], checks: off.} # No exceptions or overflow/conversion checks + +when defined(windows): + import ./barriers_windows + when compileOption("assertions"): + import os + + type SyncBarrier* = SynchronizationBarrier + + proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} = + ## Initialize a synchronization barrier that will block ``threadCount`` threads + ## before release. + let err {.used.} = InitializeSynchronizationBarrier(syncBarrier, cast[int32](threadCount), -1) + when compileOption("assertions"): + if err != 1: + assert err == 0 + raiseOSError(osLastError()) + + proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} = + ## Blocks thread at a synchronization barrier. + ## Returns true for one of the threads (the last one on Windows, undefined on Posix) + ## and false for the others. 
+ result = bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE) + + proc delete*(syncBarrier: sink SyncBarrier) {.inline.} = + ## Deletes a synchronization barrier. + ## This assumes no race between waiting at a barrier and deleting it, + ## and reuse of the barrier requires initialization. + DeleteSynchronizationBarrier(syncBarrier.addr) + +else: + import ./barriers_posix + when compileOption("assertions"): + import os + + type SyncBarrier* = PthreadBarrier + + proc init*(syncBarrier: var SyncBarrier, threadCount: uint32) {.inline.} = + ## Initialize a synchronization barrier that will block ``threadCount`` threads + ## before release. + let err {.used.} = pthread_barrier_init(syncBarrier, nil, threadCount) + when compileOption("assertions"): + if err != 0: + raiseOSError(OSErrorCode(err)) + + proc wait*(syncBarrier: var SyncBarrier): bool {.inline.} = + ## Blocks thread at a synchronization barrier. + ## Returns true for one of the threads (the last one on Windows, undefined on Posix) + ## and false for the others. + let err {.used.} = pthread_barrier_wait(syncBarrier) + when compileOption("assertions"): + if err != PTHREAD_BARRIER_SERIAL_THREAD and err < 0: + raiseOSError(OSErrorCode(err)) + result = if err == PTHREAD_BARRIER_SERIAL_THREAD: true + else: false + + proc delete*(syncBarrier: sink SyncBarrier) {.inline.} = + ## Deletes a synchronization barrier. + ## This assumes no race between waiting at a barrier and deleting it, + ## and reuse of the barrier requires initialization. 
+ let err {.used.} = pthread_barrier_destroy(syncBarrier) + when compileOption("assertions"): + if err < 0: + raiseOSError(OSErrorCode(err)) \ No newline at end of file diff --git a/constantine/platforms/threadpool/primitives/barriers_macos.nim b/constantine/platforms/threadpool/primitives/barriers_macos.nim new file mode 100644 index 0000000..2b9e763 --- /dev/null +++ b/constantine/platforms/threadpool/primitives/barriers_macos.nim @@ -0,0 +1,84 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# OSX doesn't implement pthread_barrier_t +# It's an optional part of the POSIX standard +# +# This is a manual implementation of a sense reversing barrier + +import std/locks + +type + Errno* = cint + + PthreadBarrierAttr* = object + ## Dummy + PthreadBarrier* = object + ## Implementation of a sense reversing barrier + ## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit) + + lock: Lock # Alternatively spinlock on Atomic + cond {.guard: lock.}: Cond + sense {.guard: lock.}: bool # Choose int32 to avoid zero-expansion cost in registers? 
+ left {.guard: lock.}: cuint # Number of threads missing at the barrier before opening + count: cuint # Total number of threads that need to arrive before opening the barrier + +const + PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1) + +proc pthread_cond_broadcast(cond: var Cond): Errno {.header:"".} + ## Nim only signal one thread in locks + ## We need to unblock all + +proc broadcast(cond: var Cond) {.inline.}= + discard pthread_cond_broadcast(cond) + +func pthread_barrier_init*( + barrier: var PthreadBarrier, + attr: ptr PthreadBarrierAttr, + count: cuint + ): Errno = + barrier.lock.initLock() + {.locks: [barrier.lock].}: + barrier.cond.initCond() + barrier.left = count + barrier.count = count + # barrier.sense = false + +proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno = + ## Wait on `barrier` + ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread + ## Returns 0 for the other + ## Returns Errno if there is an error + barrier.lock.acquire() + {.locks: [barrier.lock].}: + var local_sense = barrier.sense # Thread local sense + dec barrier.left + + if barrier.left == 0: + # Last thread to arrive at the barrier + # Reverse phase and release it + barrier.left = barrier.count + barrier.sense = not barrier.sense + barrier.cond.broadcast() + barrier.lock.release() + return PTHREAD_BARRIER_SERIAL_THREAD + + while barrier.sense == local_sense: + # We are waiting for threads + # Wait for the sense to reverse + # while loop because we might have spurious wakeups + barrier.cond.wait(barrier.lock) + + # Reversed, we can leave the barrier + barrier.lock.release() + return Errno(0) + +proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno = + {.locks: [barrier.lock].}: + barrier.cond.deinitCond() + barrier.lock.deinitLock() diff --git a/constantine/platforms/threadpool/primitives/barriers_posix.nim b/constantine/platforms/threadpool/primitives/barriers_posix.nim new file mode 100644 index 0000000..fd2454a --- /dev/null +++ 
b/constantine/platforms/threadpool/primitives/barriers_posix.nim @@ -0,0 +1,56 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# Abstractions over POSIX barriers (non-)implementations + +when not compileOption("threads"): + {.error: "This requires --threads:on compilation flag".} + +# Types +# ------------------------------------------------------- + +when defined(osx): + import ./barriers_macos + export PthreadBarrierAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD +else: + type + PthreadBarrierAttr* {.importc: "pthread_barrierattr_t", header: "", byref.} = object + when (defined(linux) and not defined(android)) and defined(amd64): + abi: array[4 div sizeof(cint), cint] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l45 + PthreadBarrier* {.importc: "pthread_barrier_t", header: "", byref.} = object + when (defined(linux) and not defined(android)) and defined(amd64): + abi: array[32 div sizeof(clong), clong] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l28 + + + Errno* = cint + + var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"".}: Errno + +# Pthread +# ------------------------------------------------------- +when defined(osx): + export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy +else: + proc pthread_barrier_init*( + barrier: PthreadBarrier, + attr: ptr PthreadBarrierAttr, + count: cuint + ): Errno {.header: "".} + ## Initialize `barrier` with the attributes `attr`. 
+ ## The barrier is opened when `count` waiters arrived. + + proc pthread_barrier_destroy*( + barrier: sink PthreadBarrier): Errno {.header: "".} + ## Destroy a previously dynamically initialized `barrier`. + + proc pthread_barrier_wait*( + barrier: var PthreadBarrier + ): Errno {.header: "".} + ## Wait on `barrier` + ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread + ## Returns 0 for the other + ## Returns Errno if there is an error diff --git a/constantine/platforms/threadpool/primitives/barriers_windows.nim b/constantine/platforms/threadpool/primitives/barriers_windows.nim new file mode 100644 index 0000000..ecbf9ab --- /dev/null +++ b/constantine/platforms/threadpool/primitives/barriers_windows.nim @@ -0,0 +1,31 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import winlean + +# Technically in but MSVC complains with +# @m..@s..@sweave@sscheduler.nim.cpp +# C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture" + +type + SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"".} = object + +var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "".}: DWORD + ## Skip expensive checks on barrier enter if a barrier is never deleted. 
+ +proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "".} +proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "".} +proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "".} + +when isMainModule: + import os + + var x{.noinit.}: SynchronizationBarrier + let err = InitializeSynchronizationBarrier(x, 2, -1) + if err != 1: + assert err == 0 + raiseOSError(osLastError()) \ No newline at end of file diff --git a/constantine/platforms/threadpool/primitives/futexes.nim b/constantine/platforms/threadpool/primitives/futexes.nim new file mode 100644 index 0000000..44fadc1 --- /dev/null +++ b/constantine/platforms/threadpool/primitives/futexes.nim @@ -0,0 +1,18 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +when defined(linux): + import ./futexes_linux + export futexes_linux +elif defined(windows): + import ./futexes_windows + export futexes_windows +elif defined(osx): + import ./futexes_macos + export futexes_macos +else: + {.error: "Futexes are not implemented for your OS".} diff --git a/constantine/platforms/threadpool/primitives/futexes_linux.nim b/constantine/platforms/threadpool/primitives/futexes_linux.nim new file mode 100644 index 0000000..8fd5a25 --- /dev/null +++ b/constantine/platforms/threadpool/primitives/futexes_linux.nim @@ -0,0 +1,68 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# A wrapper for linux futex. +# Condition variables do not always wake on signal which can deadlock the runtime +# so we need to roll up our sleeves and use the low-level futex API. 
+ +import std/atomics +export MemoryOrder + +type + Futex* = object + value: Atomic[uint32] + FutexOp = distinct cint + +var NR_Futex {.importc: "__NR_futex", header: "".}: cint +var FutexWaitPrivate {.importc:"FUTEX_WAIT_PRIVATE", header: "".}: FutexOp +var FutexWakePrivate {.importc:"FUTEX_WAKE_PRIVATE", header: "".}: FutexOp + +proc syscall(sysno: clong): cint {.header:"", varargs.} + +proc sysFutex( + futex: var Futex, op: FutexOp, val1: cuint or cint, + timeout: pointer = nil, val2: pointer = nil, val3: cint = 0): cint {.inline.} = + syscall(NR_Futex, futex.value.addr, op, val1, timeout, val2, val3) + +proc initialize*(futex: var Futex) {.inline.} = + futex.value.store(0, moRelaxed) + +proc teardown*(futex: var Futex) {.inline.} = + futex.value.store(0, moRelaxed) + +proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} = + futex.value.load(order) + +proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} = + futex.value + +proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} = + futex.value.store(value, order) + +proc wait*(futex: var Futex, refVal: uint32) {.inline.} = + ## Suspend a thread if the value of the futex is the same as refVal. 
+ + # Returns 0 in case of a successful suspend + # If value are different, it returns EWOULDBLOCK + # We discard as this is not needed and simplifies compat with Windows futex + discard sysFutex(futex, FutexWaitPrivate, refVal) + +proc wake*(futex: var Futex) {.inline.} = + ## Wake one thread (from the same process) + + # Returns the number of actually woken threads + # or a Posix error code (if negative) + # We discard as this is not needed and simplifies compat with Windows futex + discard sysFutex(futex, FutexWakePrivate, 1) + +proc wakeAll*(futex: var Futex) {.inline.} = + ## Wake all threads (from the same process) + + # Returns the number of actually woken threads + # or a Posix error code (if negative) + # We discard as this is not needed and simplifies compat with Windows futex + discard sysFutex(futex, FutexWakePrivate, high(int32)) \ No newline at end of file diff --git a/constantine/platforms/threadpool/primitives/futexes_macos.nim b/constantine/platforms/threadpool/primitives/futexes_macos.nim new file mode 100644 index 0000000..f6141de --- /dev/null +++ b/constantine/platforms/threadpool/primitives/futexes_macos.nim @@ -0,0 +1,109 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import std/atomics + +# A wrapper for Darwin futex. +# They are used in libc++ so likely to be very stable. 
+# A new API appeared in OSX Big Sur (Jan 2021) ulock_wait2 and macOS pthread_cond_t has been migrated to it +# - https://github.com/apple/darwin-xnu/commit/d4061fb0260b3ed486147341b72468f836ed6c8f#diff-08f993cc40af475663274687b7c326cc6c3031e0db3ac8de7b24624610616be6 +# +# The old API is ulock_wait +# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/kern/sys_ulock.c.auto.html +# - https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/sys/ulock.h.auto.html + +{.push hint[XDeclaredButNotUsed]: off.} + +const UL_COMPARE_AND_WAIT = 1 +const UL_UNFAIR_LOCK = 2 +const UL_COMPARE_AND_WAIT_SHARED = 3 +const UL_UNFAIR_LOCK64_SHARED = 4 +const UL_COMPARE_AND_WAIT64 = 5 +const UL_COMPARE_AND_WAIT64_SHARED = 6 +# obsolete names +const UL_OSSPINLOCK = UL_COMPARE_AND_WAIT +const UL_HANDOFFLOCK = UL_UNFAIR_LOCK +# These operation code are only implemented in (DEVELOPMENT || DEBUG) kernels +const UL_DEBUG_SIMULATE_COPYIN_FAULT = 253 +const UL_DEBUG_HASH_DUMP_ALL = 254 +const UL_DEBUG_HASH_DUMP_PID = 255 + +# operation bits [15, 8] contain the flags for __ulock_wake +# +const ULF_WAKE_ALL = 0x00000100 +const ULF_WAKE_THREAD = 0x00000200 +const ULF_WAKE_ALLOW_NON_OWNER = 0x00000400 + +# operation bits [23, 16] contain the flags for __ulock_wait +# +# @const ULF_WAIT_WORKQ_DATA_CONTENTION +# The waiter is contending on this lock for synchronization around global data. +# This causes the workqueue subsystem to not create new threads to offset for +# waiters on this lock. +# +# @const ULF_WAIT_CANCEL_POINT +# This wait is a cancelation point +# +# @const ULF_WAIT_ADAPTIVE_SPIN +# Use adaptive spinning when the thread that currently holds the unfair lock +# is on core. 
+const ULF_WAIT_WORKQ_DATA_CONTENTION = 0x00010000 +const ULF_WAIT_CANCEL_POINT = 0x00020000 +const ULF_WAIT_ADAPTIVE_SPIN = 0x00040000 + +# operation bits [31, 24] contain the generic flags +const ULF_NO_ERRNO = 0x01000000 + +# masks +const UL_OPCODE_MASK = 0x000000FF +const UL_FLAGS_MASK = 0xFFFFFF00 +const ULF_GENERIC_MASK = 0xFFFF0000 + +const ULF_WAIT_MASK = ULF_NO_ERRNO or + ULF_WAIT_WORKQ_DATA_CONTENTION or + ULF_WAIT_CANCEL_POINT or + ULF_WAIT_ADAPTIVE_SPIN + +const ULF_WAKE_MASK = ULF_NO_ERRNO or + ULF_WAKE_ALL or + ULF_WAKE_THREAD or + ULF_WAKE_ALLOW_NON_OWNER + +proc ulock_wait(operation: uint32, address: pointer, value: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.} +proc ulock_wait2(operation: uint32, address: pointer, value: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.} +proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.} + +type + Futex* = object + value: Atomic[uint32] + +proc initialize*(futex: var Futex) {.inline.} = + futex.value.store(0, moRelaxed) + +proc teardown*(futex: var Futex) {.inline.} = + futex.value.store(0, moRelaxed) + +proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} = + futex.value.load(order) + +proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} = + futex.value + +proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} = + futex.value.store(value, order) + +proc wait*(futex: var Futex, refVal: uint32) {.inline.} = + ## Suspend a thread if the value of the futex is the same as refVal. 
+ discard ulock_wait(UL_UNFAIR_LOCK64_SHARED or ULF_NO_ERRNO, futex.value.addr, uint64 refVal, 0) + +proc wake*(futex: var Futex) {.inline.} = + ## Wake one thread (from the same process) + discard ulock_wake(ULF_WAKE_THREAD or ULF_NO_ERRNO, futex.value.addr, 0) + +proc wakeAll*(futex: var Futex) {.inline.} = + ## Wake all threads (from the same process) + discard ulock_wake(ULF_WAKE_ALL or ULF_NO_ERRNO, futex.value.addr, 0) \ No newline at end of file diff --git a/constantine/platforms/threadpool/primitives/futexes_windows.nim b/constantine/platforms/threadpool/primitives/futexes_windows.nim new file mode 100644 index 0000000..0b0fca0 --- /dev/null +++ b/constantine/platforms/threadpool/primitives/futexes_windows.nim @@ -0,0 +1,59 @@ +# Weave +# Copyright (c) 2019 Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +# An implementation of futex using Windows primitives + +import std/atomics, winlean +export MemoryOrder + +type + Futex* = object + value: Atomic[uint32] + +# Contrary to the documentation, the futex related primitives are NOT in kernel32.dll +# but in API-MS-Win-Core-Synch-l1-2-0.dll ¯\_(ツ)_/¯ + +proc initialize*(futex: var Futex) {.inline.} = + futex.value.store(0, moRelaxed) + +proc teardown*(futex: var Futex) {.inline.} = + futex.value.store(0, moRelaxed) + +proc WaitOnAddress( + Address: pointer, CompareAddress: pointer, + AddressSize: csize_t, dwMilliseconds: DWORD + ): WINBOOL {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".} + # The Address should be volatile + +proc WakeByAddressSingle(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".} +proc WakeByAddressAll(Address: pointer) {.importc, stdcall, dynlib: "API-MS-Win-Core-Synch-l1-2-0.dll".} + +proc load*(futex: var Futex, order: MemoryOrder): uint32 {.inline.} = + futex.value.load(order) + +proc loadMut*(futex: var Futex): var Atomic[uint32] {.inline.} = + futex.value + +proc store*(futex: var Futex, value: uint32, order: MemoryOrder) {.inline.} = + futex.value.store(value, order) + +proc wait*(futex: var Futex, refVal: uint32) {.inline.} = + ## Suspend a thread if the value of the futex is the same as refVal. + + # Returns TRUE if the wait succeeds or FALSE if not. + # getLastError() will contain the error information, for example + # if it failed due to a timeout. 
+ # We discard as this is not needed and simplifies compat with Linux futex + discard WaitOnAddress(futex.value.addr, refVal.unsafeAddr, csize_t sizeof(refVal), INFINITE) + +proc wake*(futex: var Futex) {.inline.} = + ## Wake one thread (from the same process) + WakeByAddressSingle(futex.value.addr) + +proc wakeAll*(futex: var Futex) {.inline.} = + ## Wake all threads (from the same process) + WakeByAddressAll(futex.value.addr) diff --git a/constantine/platforms/threadpool/threadpool.nim b/constantine/platforms/threadpool/threadpool.nim new file mode 100644 index 0000000..b98c860 --- /dev/null +++ b/constantine/platforms/threadpool/threadpool.nim @@ -0,0 +1,555 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +when not compileOption("threads"): + {.error: "This requires --threads:on compilation flag".} + +{.push raises: [].} + +import + std/[cpuinfo, atomics, macros], + ./crossthread/[ + taskqueues, + backoff, + tasks_flowvars], + ./instrumentation, + ./primitives/barriers, + ./parallel_offloading, + ../allocs, ../bithacks, + ../../../helpers/prng_unsafe + +export + # flowvars + Flowvar, isSpawned, isReady, sync + +type + WorkerID = uint32 + Signal = object + terminate {.align: 64.}: Atomic[bool] + + WorkerContext = object + ## Thread-local worker context + + # Params + id: WorkerID + threadpool: Threadpool + + # Tasks + taskqueue: ptr Taskqueue # owned task queue + currentTask: ptr Task + + # Synchronization + localBackoff: EventNotifier # Multi-Producer Single-Consumer backoff + signal: ptr Signal # owned signal + + # Thefts + rng: RngState # RNG state to select victims + + # Adaptative theft policy + stealHalf: bool + recentTasks: uint32 + recentThefts: uint32 + recentTheftsAdaptative: uint32 + recentLeaps: uint32 + + Threadpool* = ptr object + barrier: SyncBarrier # Barrier for initialization and teardown + # -- align: 64 + globalBackoff: EventCount # Multi-Producer Multi-Consumer backoff + # -- align: 64 + numThreads*{.align: 64.}: uint32 + workerQueues: ptr UncheckedArray[Taskqueue] + workers: ptr UncheckedArray[Thread[(Threadpool, WorkerID)]] + workerSignals: ptr UncheckedArray[Signal] + +# Thread-local config +# --------------------------------------------- + +var workerContext {.threadvar.}: WorkerContext + ## Thread-local Worker context + +proc setupWorker() = + ## Initialize the thread-local context of a worker + ## Requires the ID and threadpool fields to be initialized + template ctx: untyped = workerContext + + preCondition: not ctx.threadpool.isNil() + preCondition: 0 <= ctx.id and ctx.id < ctx.threadpool.numThreads.uint32 + preCondition: not ctx.threadpool.workerQueues.isNil() + preCondition: not ctx.threadpool.workerSignals.isNil() + + # Thefts + 
ctx.rng.seed(0xEFFACED + ctx.id) + + # Synchronization + ctx.localBackoff.initialize() + ctx.signal = addr ctx.threadpool.workerSignals[ctx.id] + ctx.signal.terminate.store(false, moRelaxed) + + # Tasks + ctx.taskqueue = addr ctx.threadpool.workerQueues[ctx.id] + ctx.currentTask = nil + + # Init + ctx.taskqueue[].init(initialCapacity = 32) + + # Adaptative theft policy + ctx.recentTasks = 0 + ctx.recentThefts = 0 + ctx.recentTheftsAdaptative = 0 + ctx.recentLeaps = 0 + +proc teardownWorker() = + ## Cleanup the thread-local context of a worker + workerContext.localBackoff.`=destroy`() + workerContext.taskqueue[].teardown() + +proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} + +proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises: [Exception].} = + ## On the start of the threadpool workers will execute this + ## until they receive a termination signal + # We assume that thread_local variables start all at their binary zero value + preCondition: workerContext == default(WorkerContext) + + template ctx: untyped = workerContext + + # If the following crashes, you need --tlsEmulation:off + ctx.id = params.id + ctx.threadpool = params.threadpool + + setupWorker() + + # 1 matching barrier in Threadpool.new() for root thread + discard params.threadpool.barrier.wait() + + {.cast(gcsafe).}: # Compiler does not consider that GC-safe by default when multi-threaded due to thread-local variables + ctx.eventLoop() + + debugTermination: + log(">>> Worker %2d shutting down <<<\n", ctx.id) + + # 1 matching barrier in threadpool.shutdown() for root thread + discard params.threadpool.barrier.wait() + + teardownWorker() + +# Tasks +# --------------------------------------------- + +# Sentinel values +const ReadyFuture = cast[ptr EventNotifier](0xCA11AB1E) +const RootTask = cast[ptr Task](0xEFFACED0) + +proc run*(ctx: var WorkerContext, task: ptr Task) {.raises:[Exception].} = + ## Run a task, frees it if it is not owned by a Flowvar + let 
suspendedTask = workerContext.currentTask + ctx.currentTask = task + debug: log("Worker %2d: running task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek()) + task.fn(task.data.addr) + debug: log("Worker %2d: completed task.fn 0x%.08x (%d pending)\n", ctx.id, task.fn, ctx.taskqueue[].peek()) + ctx.recentTasks += 1 + ctx.currentTask = suspendedTask + if not task.hasFuture: + freeHeap(task) + return + + # Sync with an awaiting thread without work in completeFuture + var expected = (ptr EventNotifier)(nil) + if not compareExchange(task.waiter, expected, desired = ReadyFuture, moAcquireRelease): + debug: log("Worker %2d: completed task 0x%.08x, notifying waiter 0x%.08x\n", ctx.id, task, expected) + expected[].notify() + +proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.} = + ## Schedule a task in the threadpool + ## This wakes a sibling thread if our local queue is empty + ## or forceWake is true. + debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask) + + # Instead of notifying every time a task is scheduled, we notify + # only when the worker queue is empty. This is a good approximation + # of starvation in work-stealing. + # - Tzannes, A., G. C. Caragea, R. Barua, and U. Vishkin. + # Lazy binary-splitting: a run-time adaptive work-stealing scheduler. + # In PPoPP ’10, Bangalore, India, January 2010. ACM, pp. 179–190. 
+ # https://user.eng.umd.edu/~barua/ppopp164.pdf
+
+ let wasEmpty = ctx.taskqueue[].peek() == 0
+ ctx.taskqueue[].push(tn)
+ if forceWake or wasEmpty:
+ ctx.threadpool.globalBackoff.wake()
+
+# Scheduler
+# ---------------------------------------------
+
+iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
+ ## Create a (low-quality) pseudo-random permutation from [0, max)
+ # Design considerations and randomness constraint for work-stealing, see docs/random_permutations.md
+ #
+ # Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
+ #
+ # Xₙ₊₁ = aXₙ+c (mod m) generates all random numbers mod m without repetition
+ # if and only if (Hull-Dobell theorem):
+ # 1. c and m are coprime
+ # 2. a-1 is divisible by all prime factors of m
+ # 3. a-1 is divisible by 4 if m is divisible by 4
+ #
+ # Alternative 1. By choosing a=1, all conditions are easy to reach.
+ #
+ # The randomness quality is not important besides distributing potential contention,
+ # i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
+ #
+ # Assuming 6 threads, co-primes are [1, 5], which means the following permutations
+ # assuming we start with victim 0:
+ # - [0, 1, 2, 3, 4, 5]
+ # - [0, 5, 4, 3, 2, 1]
+ # While we don't care much about randomness quality, it's a bit disappointing.
+ #
+ # Alternative 2. We can choose m to be the next power of 2, meaning all odd integers are co-primes,
+ # consequently:
+ # - we don't need a GCD to find the coprimes
+ # - we don't need to cache coprimes, removing a cache-miss potential
+ # - a != 1, so we now have a multiplicative factor, which makes output more "random looking".
+ + # n and (m-1) <=> n mod m, if m is a power of 2 + let M = maxExclusive.nextPowerOfTwo_vartime() + let c = (randomSeed and ((M shr 1) - 1)) * 2 + 1 # c odd and c ∈ [0, M) + let a = (randomSeed and ((M shr 2) - 1)) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4 + + let mask = M-1 # for mod M + let start = randomSeed and mask + + var x = start + while true: + if x < maxExclusive: + yield x + x = (a*x + c) and mask # ax + c (mod M), with M power of 2 + if x == start: + break + +proc tryStealOne(ctx: var WorkerContext): ptr Task = + ## Try to steal a task. + let seed = ctx.rng.next().uint32 + for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads): + if targetId == ctx.id: + continue + + let stolenTask = ctx.id.steal(ctx.threadpool.workerQueues[targetId]) + + if not stolenTask.isNil(): + ctx.recentThefts += 1 + # Theft successful, there might be more work for idle threads, wake one + ctx.threadpool.globalBackoff.wake() + return stolenTask + return nil + +proc updateStealStrategy(ctx: var WorkerContext) = + ## Estimate work-stealing efficiency during the last interval + ## If the value is below a threshold, switch strategies + const StealAdaptativeInterval = 25 + if ctx.recentTheftsAdaptative == StealAdaptativeInterval: + let recentTheftsNonAdaptative = ctx.recentThefts - ctx.recentTheftsAdaptative + let adaptativeTasks = ctx.recentTasks - ctx.recentLeaps - recentTheftsNonAdaptative + + let ratio = adaptativeTasks.float32 / StealAdaptativeInterval.float32 + if ctx.stealHalf and ratio < 2.0f: + # Tasks stolen are coarse-grained, steal only one to reduce re-steal + ctx.stealHalf = false + elif not ctx.stealHalf and ratio == 1.0f: + # All tasks processed were stolen tasks, we need to steal many at a time + ctx.stealHalf = true + + # Reset interval + ctx.recentTasks = 0 + ctx.recentThefts = 0 + ctx.recentTheftsAdaptative = 0 + ctx.recentLeaps = 0 + +proc tryStealAdaptative(ctx: var WorkerContext): ptr Task = + ## Try to 
steal one or many tasks, depending on load + + # TODO: while running 'threadpool/examples/e02_parallel_pi.nim' + # stealHalf can error out in tasks_flowvars.nim with: + # "precondition not task.completed.load(moAcquire)" + ctx.stealHalf = false + # ctx.updateStealStrategy() + + let seed = ctx.rng.next().uint32 + for targetId in seed.pseudoRandomPermutation(ctx.threadpool.numThreads): + if targetId == ctx.id: + continue + + let stolenTask = + if ctx.stealHalf: ctx.id.stealHalf(ctx.taskqueue[], ctx.threadpool.workerQueues[targetId]) + else: ctx.id.steal(ctx.threadpool.workerQueues[targetId]) + + if not stolenTask.isNil(): + ctx.recentThefts += 1 + ctx.recentTheftsAdaptative += 1 + # Theft successful, there might be more work for idle threads, wake one + ctx.threadpool.globalBackoff.wake() + return stolenTask + return nil + +proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task = + ## Leapfrogging: + ## + ## - Leapfrogging: a portable technique for implementing efficient futures, + ## David B. Wagner, Bradley G. Calder, 1993 + ## https://dl.acm.org/doi/10.1145/173284.155354 + ## + ## When awaiting a future, we can look in the thief queue first. They steal when they run out of tasks. + ## If they have tasks in their queue, it's the task we are awaiting that created them and it will likely be stuck + ## on those tasks as well, so we need to help them help us. + + var thiefID = SentinelThief + while true: + debug: log("Worker %2d: waiting for thief to publish their ID\n", ctx.id) + thiefID = awaitedTask.thiefID.load(moAcquire) + if thiefID != SentinelThief: + break + cpuRelax() + ascertain: 0 <= thiefID and thiefID < ctx.threadpool.numThreads + + # Leapfrogging is used when completing a future, steal only one + # and don't leave tasks stranded in our queue. 
+ let leapTask = ctx.id.steal(ctx.threadpool.workerQueues[thiefID]) + if not leapTask.isNil(): + ctx.recentLeaps += 1 + # Theft successful, there might be more work for idle threads, wake one + ctx.threadpool.globalBackoff.wake() + return leapTask + return nil + +proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} = + ## Each worker thread executes this loop over and over. + while not ctx.signal.terminate.load(moRelaxed): + # 1. Pick from local queue + debug: log("Worker %2d: eventLoop 1 - searching task from local queue\n", ctx.id) + while (var task = ctx.taskqueue[].pop(); not task.isNil): + debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask) + ctx.run(task) + + # 2. Run out of tasks, become a thief + debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id) + let ticket = ctx.threadpool.globalBackoff.sleepy() + var stolenTask = ctx.tryStealAdaptative() + if not stolenTask.isNil: + # We manage to steal a task, cancel sleep + ctx.threadpool.globalBackoff.cancelSleep() + # 2.a Run task + debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask) + ctx.run(stolenTask) + else: + # 2.b Park the thread until a new task enters the threadpool + debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id) + ctx.threadpool.globalBackoff.sleep(ticket) + debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id) + +# Sync +# --------------------------------------------- + +template isRootTask(task: ptr Task): bool = + task == RootTask + +proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} = + ## Eagerly complete an awaited FlowVar + + template ctx: untyped = workerContext + + template isFutReady(): untyped = + let isReady = fv.task.completed.load(moAcquire) + if isReady: + parentResult = cast[ptr (ptr Task, T)](fv.task.data.addr)[1] + isReady + 
+ if isFutReady(): + return + + ## 1. Process all the children of the current tasks. + ## This ensures that we can give control back ASAP. + debug: log("Worker %2d: sync 1 - searching task from local queue\n", ctx.id) + while (let task = ctx.taskqueue[].pop(); not task.isNil): + if task.parent != ctx.currentTask: + debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask) + ctx.schedule(task, forceWake = true) # reschedule task and wake a sibling to take it over. + break + debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask) + ctx.run(task) + if isFutReady(): + debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id) + return + + ## 2. We run out-of-tasks or out-of-direct-child of our current awaited task + ## So the task is bottlenecked by dependencies in other threads, + ## hence we abandon our enqueued work and steal in the others' queues + ## in hope it advances our awaited task. This prioritizes latency over throughput. + ## + ## See also + ## - Proactive work-stealing for futures + ## Kyle Singer, Yifan Xu, I-Ting Angelina Lee, 2019 + ## https://dl.acm.org/doi/10.1145/3293883.3295735 + debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask) + while not isFutReady(): + if (let leapTask = ctx.tryLeapfrog(fv.task); not leapTask.isNil): + # We stole a task generated by the task we are awaiting. + debug: log("Worker %2d: sync 2.1 - leapfrog task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, leapTask, leapTask.parent, ctx.currentTask) + ctx.run(leapTask) + elif (let stolenTask = ctx.tryStealOne(); not stolenTask.isNil): + # We stole a task, we hope we advance our awaited task. 
+ debug: log("Worker %2d: sync 2.2 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask) + ctx.run(stolenTask) + elif (let ownTask = ctx.taskqueue[].pop(); not ownTask.isNil): + # We advance our own queue, this increases global throughput but may impact latency on the awaited task. + # + # Note: for a scheduler to be optimal (i.e. within 2x than ideal) it should be greedy + # so all workers should be working. This is a difficult tradeoff. + debug: log("Worker %2d: sync 2.3 - couldn't steal, running own task\n", ctx.id) + ctx.run(ownTask) + else: + # Nothing to do, we park. + # Note: On today's hyperthreaded systems, it might be more efficient to always park + # instead of working on unrelated tasks in our task queue, despite making the scheduler non-greedy. + # The actual hardware resources are 2x less than the actual number of cores + ctx.localBackoff.prepareToPark() + + var expected = (ptr EventNotifier)(nil) + if compareExchange(fv.task.waiter, expected, desired = ctx.localBackoff.addr, moAcquireRelease): + ctx.localBackoff.park() + +proc syncAll*(tp: Threadpool) {.raises: [Exception].} = + ## Blocks until all pending tasks are completed + ## This MUST only be called from + ## the root scope that created the threadpool + template ctx: untyped = workerContext + + debugTermination: + log(">>> Worker %2d enters barrier <<<\n", ctx.id) + + preCondition: ctx.id == 0 + preCondition: ctx.currentTask.isRootTask() + + # Empty all tasks + var foreignThreadsParked = false + while not foreignThreadsParked: + # 1. Empty local tasks + debug: log("Worker %2d: syncAll 1 - searching task from local queue\n", ctx.id) + while (let task = ctx.taskqueue[].pop(); not task.isNil): + debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask) + ctx.run(task) + + if tp.numThreads == 1 or foreignThreadsParked: + break + + # 2. 
Help other threads + debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id) + let stolenTask = ctx.tryStealAdaptative() + + if not stolenTask.isNil: + # 2.1 We stole some task + debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask) + ctx.run(stolenTask) + else: + # 2.2 No task to steal + if tp.globalBackoff.getNumWaiters() == tp.numThreads - 1: + # 2.2.1 all threads besides the current are parked + debugTermination: + log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id) + foreignThreadsParked = true + else: + # 2.2.2 We don't park as there is no notif for task completion + cpuRelax() + + debugTermination: + log(">>> Worker %2d leaves barrier <<<\n", ctx.id) + +# Runtime +# --------------------------------------------- + +proc new*(T: type Threadpool, numThreads = countProcessors()): T {.raises: [Exception].} = + ## Initialize a threadpool that manages `numThreads` threads. + ## Default to the number of logical processors available. + + type TpObj = typeof(default(Threadpool)[]) # due to C import, we need a dynamic sizeof + var tp = allocHeapUncheckedAlignedPtr(Threadpool, sizeof(TpObj), alignment = 64) + + tp.barrier.init(numThreads.uint32) + tp.globalBackoff.initialize() + tp.numThreads = numThreads.uint32 + tp.workerQueues = allocHeapArrayAligned(Taskqueue, numThreads, alignment = 64) + tp.workers = allocHeapArrayAligned(Thread[(Threadpool, WorkerID)], numThreads, alignment = 64) + tp.workerSignals = allocHeapArrayAligned(Signal, numThreads, alignment = 64) + + # Setup master thread + workerContext.id = 0 + workerContext.threadpool = tp + + # Start worker threads + for i in 1 ..< numThreads: + createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i))) + + # Root worker + setupWorker() + + # Root task, this is a sentinel task that is never called. 
+ workerContext.currentTask = RootTask + + # Wait for the child threads + discard tp.barrier.wait() + return tp + +proc cleanup(tp: var Threadpool) {.raises: [OSError].} = + ## Cleanup all resources allocated by the threadpool + preCondition: workerContext.currentTask.isRootTask() + + for i in 1 ..< tp.numThreads: + joinThread(tp.workers[i]) + + tp.workerSignals.freeHeapAligned() + tp.workers.freeHeapAligned() + tp.workerQueues.freeHeapAligned() + tp.globalBackoff.`=destroy`() + tp.barrier.delete() + + tp.freeHeapAligned() + +proc shutdown*(tp: var Threadpool) {.raises:[Exception].} = + ## Wait until all tasks are processed and then shutdown the threadpool + preCondition: workerContext.currentTask.isRootTask() + tp.syncAll() + + # Signal termination to all threads + for i in 0 ..< tp.numThreads: + tp.workerSignals[i].terminate.store(true, moRelaxed) + + tp.globalBackoff.wakeAll() + + # 1 matching barrier in workerEntryFn + discard tp.barrier.wait() + + teardownWorker() + tp.cleanup() + + # Delete dummy task + workerContext.currentTask = nil + +{.pop.} # raises:[] + +# Task parallel API +# --------------------------------------------- + +macro spawn*(tp: Threadpool, fnCall: typed): untyped = + ## Spawns the input function call asynchronously, potentially on another thread of execution. + ## + ## If the function calls returns a result, spawn will wrap it in a Flowvar. + ## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar. + ## You can use `isReady` to check if result is available and if subsequent + ## `spawn` returns immediately. 
+
+ ##
+ ## Tasks are processed approximately in Last-In-First-Out (LIFO) order
+ result = spawnImpl(tp, fnCall, bindSym"workerContext", bindSym"schedule")
diff --git a/helpers/prng_unsafe.nim b/helpers/prng_unsafe.nim
index 89c3107..194ddc2 100644
--- a/helpers/prng_unsafe.nim
+++ b/helpers/prng_unsafe.nim
@@ -68,7 +68,7 @@ func rotl(x: uint64, k: static int): uint64 {.inline.} =
 template `^=`(x: var uint64, y: uint64) =
 x = x xor y
-func next(rng: var RngState): uint64 =
+func next*(rng: var RngState): uint64 =
 ## Compute a random uint64 from the input state
 ## using xoshiro512** algorithm by Vigna et al
 ## State is updated.
@@ -96,6 +96,12 @@ func random_unsafe*(rng: var RngState, maxExclusive: uint32): uint32 =
 ## Uses an unbiaised generation method
 ## See Lemire's algorithm modified by Melissa O'Neill
 ## https://www.pcg-random.org/posts/bounded-rands.html
+ ## Original:
+ ## biased: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ ## unbiased: https://arxiv.org/pdf/1805.10941.pdf
+ ## Also:
+ ## Barrett Reduction: https://en.wikipedia.org/wiki/Barrett_reduction
+ ## http://www.acsel-lab.com/arithmetic/arith18/papers/ARITH18_Hasenplaugh.pdf
 let max = maxExclusive
 var x = uint32 rng.next()
 var m = x.uint64 * max.uint64