Parallel for (#222)

* introduce reserve threads to minimize latency and maximize throughput when awaiting a future

* introduce a ceilDiv proc

* threadpool: implement parallel-for loops

* 10x perf improvement by not waking reserveBackoff on syncAll

* bench overhead: the new reserve system may introduce too much wakeup latency (2x slower) for fine-grained parallelism

* add parallelForStrided

* Threadpool: Implement parallel reductions

* refactor parallel loop codegen: introduce descriptor, parsing and codegen stages

* parallel strided, test transpose bench

* tight loop is faster when backoff is not inline

* no POSIX stuff on windows, larger types for histogram bench

* fix tests

* max RSS overflow?

* missed an undefined var

* exit histogram on 32-bit

* forgot to return early for 32-bit
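For context, a minimal sketch of the parallel-for usage this PR enables, modeled on Weave's loop API (the constructor arguments, import path and capture details here are illustrative assumptions, not necessarily the exact Constantine API):

import std/math
# import constantine/platforms/threadpool/threadpool  # assumed module path

proc demo() =
  var tp = Threadpool.new()             # assumed constructor: spawns the worker threads
  var data = newSeq[float64](1_000)
  let buf = cast[ptr UncheckedArray[float64]](data[0].addr)
  tp.parallelFor i in 0 ..< data.len:   # split the index range across workers
    captures: {buf}                     # captures are explicit and must be trivially copyable
    buf[i] = sqrt(float64 i)
  tp.syncAll()                          # barrier: wait for all outstanding tasks
  tp.shutdown()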
Mamy Ratsimbazafy 2023-02-24 09:47:36 +01:00 committed by GitHub
parent 8993789ddf
commit bf32c2d408
66 changed files with 16974 additions and 303 deletions

View File

@ -250,6 +250,10 @@ const testDescThreadpool: seq[string] = @[
# "constantine/platforms/threadpool/benchmarks/matmul_cache_oblivious/threadpool_matmul_co.nim",
"constantine/platforms/threadpool/benchmarks/nqueens/threadpool_nqueens.nim",
# "constantine/platforms/threadpool/benchmarks/single_task_producer/threadpool_spc.nim", # Need timing not implemented on Windows
# "constantine/platforms/threadpool/benchmarks/black_scholes/threadpool_black_scholes.nim", # Need input file
"constantine/platforms/threadpool/benchmarks/matrix_transposition/threadpool_transposes.nim",
"constantine/platforms/threadpool/benchmarks/histogram_2D/threadpool_histogram.nim",
"constantine/platforms/threadpool/benchmarks/logsumexp/threadpool_logsumexp.nim",
]
const testDescMultithreadedCrypto: seq[string] = @[

View File

@ -26,11 +26,6 @@ import
# Helpers
# ----------------------------------------------------------------
func ceilDiv(a, b: uint): uint =
## ceil division
## ceil(a / b)
(a + b - 1) div b
proc copyFrom[M, N: static int](output: var array[M, byte], bi: array[N, byte], cur: var uint) =
static: doAssert M mod N == 0
for i in 0'u ..< N:
@ -113,7 +108,7 @@ func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
doAssert output.len mod 8 == 0 # By spec
doAssert output.len mod 32 == 0 # Assumed by copy optimization
let ell = ceilDiv(output.len.uint, DigestSize.uint)
let ell = output.len.ceilDiv_vartime(DigestSize)
var l_i_b_str0 {.noInit.}: array[3, byte]
l_i_b_str0.dumpRawInt(output.len.uint16, cursor = 0, bigEndian)
l_i_b_str0[2] = 0
@ -200,7 +195,7 @@ func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
## it is recommended to cache the reduced DST.
const
L = int ceilDiv(Field.C.getCurveBitwidth() + k, 8)
L = ceilDiv_vartime(Field.C.getCurveBitwidth() + k, 8)
m = block:
when Field is Fp: 1
elif Field is Fp2: 2

View File

@ -142,7 +142,7 @@ func powMont*[mBits, eBits: static int](
##
## This is constant-time: the window optimization does
## not reveal the exponent bits or hamming weight
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte]
expBE.marshal(exponent, bigEndian)
powMont(a, expBE, M, one, negInvModWord, windowSize, spareBits)
@ -165,7 +165,7 @@ func powMontUnsafeExponent*[mBits, eBits: static int](
##
## This uses fixed window optimization
## A window size in the range [1, 5] must be chosen
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte]
expBE.marshal(exponent, bigEndian)
powMontUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)

View File

@ -591,7 +591,6 @@ macro addchain*(fn: untyped): untyped =
body.add s
result[^1] = body
# echo result.toStrLit()
# ############################################################
#

View File

@ -191,7 +191,7 @@ proc partitionDivsteps(bits, wordBitWidth: int): tuple[totalIters, numChunks, ch
# For any input, for gcd(f, g) with 0 <= g <= f <= Modulus with hddivstep variant (half-delta divstep)
# (inversion g == 1)
(45907*bits + 26313) div 19929
let numChunks = (totalIters + wordBitWidth-1) div wordBitWidth
let numChunks = totalIters.ceilDiv_vartime(wordBitWidth)
let chunkSize = totalIters div numChunks
let cutoff = totalIters mod numChunks
return (totalIters, numChunks, chunkSize, cutoff)
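As a worked example of this partitioning, assuming bits = 255 and wordBitWidth = 64: totalIters = (45907*255 + 26313) div 19929 = 588, numChunks = ceilDiv_vartime(588, 64) = 10, chunkSize = 588 div 10 = 58 and cutoff = 588 mod 10 = 8, i.e. the 588 halving iterations are spread over eight chunks of 59 and two chunks of 58 divsteps.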
@ -444,7 +444,7 @@ func invmod*(
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
@ -471,7 +471,7 @@ func invmod*(
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
@ -647,7 +647,7 @@ func legendre*(a, M: Limbs, bits: static int): SecretWord =
## ≡ 0 (mod p), iff a is 0
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
@ -667,7 +667,7 @@ func legendre*(a: Limbs, M: static Limbs, bits: static int): SecretWord =
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
@ -852,7 +852,7 @@ func invmod_vartime*(
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
@ -879,7 +879,7 @@ func invmod_vartime*(
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = (bits + k - 1) div k
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)

View File

@ -116,5 +116,3 @@ macro genDerivedConstants*(mode: static DerivedConstantMode): untyped =
M
)
)
# echo result.toStrLit()

View File

@ -95,7 +95,7 @@ proc genCurveConstants(defs: seq[CurveParams]): NimNode =
exported($curve & "_sexticTwist"),
newLit curveDef.sexticTwist
)
if curveDef.eq_form == TwistedEdwards and
curveDef.coef_A.kind != NoCoef and curveDef.coef_D.kind != NoCoef:
curveEllipticStmts.add newConstStmt(
@ -118,8 +118,6 @@ proc genCurveConstants(defs: seq[CurveParams]): NimNode =
result.add curveEllipticStmts
# echo result.toStrLit()
macro setupCurves(): untyped =
result = genCurveConstants(curvesDefinitions)

View File

@ -321,8 +321,6 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode =
exported("CurveOrderBitWidth"), MapCurveOrderBitWidth
)
# echo result.toStrLit()
macro declareCurves*(curves: untyped): untyped =
## Parse curve configuration and generates
##

View File

@ -389,7 +389,7 @@ func primePlus1div2*(P: BigInt): BigInt =
func primeMinus3div4_BE*[bits: static int](
P: BigInt[bits]
): array[(bits+7) div 8, byte] {.noInit.} =
): array[bits.ceilDiv_vartime(8), byte] {.noInit.} =
## For an input prime `p`, compute (p-3)/4
## and return the result as a canonical byte array / octet string
## For use to check if a number is a square (quadratic residue)
@ -408,7 +408,7 @@ func primeMinus3div4_BE*[bits: static int](
func primeMinus5div8_BE*[bits: static int](
P: BigInt[bits]
): array[(bits+7) div 8, byte] {.noInit.} =
): array[bits.ceilDiv_vartime(8), byte] {.noInit.} =
## For an input prime `p`, compute (p-5)/8
## and return the result as a canonical byte array / octet string
## For use to check if a number is a square (quadratic residue)

View File

@ -11,7 +11,7 @@ import ../../platforms/abstractions
func wordsRequired*(bits: int): int {.compileTime.} =
## Compute the number of limbs required
# from the **announced** bit length
(bits + WordBitWidth - 1) div WordBitWidth
bits.ceilDiv_vartime(WordBitWidth)
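For example, on a 64-bit target a BigInt[255] needs ceilDiv_vartime(255, 64) = 4 limbs; truncating division (255 div 64 = 3) would under-allocate.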
type
BigInt*[bits: static int] = object

View File

@ -61,7 +61,7 @@ func decomposeEndo*[M, scalBits, L: static int](
static: doAssert scalBits >= L, "Cannot decompose a scalar smaller than a mini-scalar or the decomposition coefficient"
# Equal when no window or no negative handling, greater otherwise
static: doAssert L >= (scalBits + M - 1) div M + 1
static: doAssert L >= scalBits.ceilDiv_vartime(M) + 1
const w = F.C.getCurveOrderBitwidth().wordsRequired()
when M == 2:
@ -129,7 +129,7 @@ func decomposeEndo*[M, scalBits, L: static int](
# (For example generating a public-key)
type
Recoded[LengthInDigits: static int] = distinct array[(LengthInDigits + 7) div 8, byte]
Recoded[LengthInDigits: static int] = distinct array[LengthInDigits.ceilDiv_vartime(8), byte]
GLV_SAC[M, LengthInDigits: static int] = array[M, Recoded[LengthInDigits]]
## GLV-Based Sign-Aligned-Column representation
## see Faz-Hernandez, 2013
@ -319,7 +319,7 @@ func scalarMulEndo*[scalBits; EC](
{.error: "Unconfigured".}
# 2. Decompose scalar into mini-scalars
const L = (scalBits + M - 1) div M + 1 # Alternatively, negative can be handled with an extra "+1"
const L = scalBits.ceilDiv_vartime(M) + 1 # Alternatively, negative can be handled with an extra "+1"
var miniScalars {.noInit.}: array[M, BigInt[L]]
var negatePoints {.noInit.}: array[M, SecretBool]
miniScalars.decomposeEndo(negatePoints, scalar, P.F)
@ -473,7 +473,7 @@ func computeRecodedLength(bitWidth, window: int): int =
# Strangely in the paper this doesn't depend
# "m", the GLV decomposition dimension.
# lw = ⌈log2 r/w⌉+1 (optionally a second "+1" to handle negative mini scalars)
let lw = (bitWidth + window - 1) div window + 1
let lw = bitWidth.ceilDiv_vartime(window) + 1
result = (lw mod window) + lw
func scalarMulGLV_m2w2*[scalBits; EC](

View File

@ -44,7 +44,7 @@ func multiScalarMulImpl_reference_vartime[F, G; bits: static int](
# Prologue
# --------
const numBuckets = 1 shl c - 1 # bucket 0 is unused
const numWindows = (bits + c - 1) div c
const numWindows = bits.ceilDiv_vartime(c)
type EC = typeof(r)
let miniMSMs = allocHeapArray(EC, numWindows)
@ -376,7 +376,7 @@ func multiScalarMul_vartime*[bits: static int, F, G](
elif F is Fp2: 4
else: {.error: "Unconfigured".}
const L = (bits + M - 1) div M + 1
const L = bits.ceilDiv_vartime(M) + 1
let splitCoefs = allocHeapArray(array[M, BigInt[L]], N)
let endoBasis = allocHeapArray(array[M, ECP_ShortW_Aff[F, G]], N)

View File

@ -599,7 +599,7 @@ when isMainModule:
let c = inputSize.bestBucketBitSize(255, useSignedBuckets = true, useManualTuning = false)
let twoPow = "2^"
let numNZBuckets = 1 shl (c-1)
let collisionMapSize = ((1 shl (c-1))+63) div 64 * 8 # Stored in BigInt[1 shl (c-1)]
let collisionMapSize = ceilDiv_vartime(1 shl (c-1), 64) * 8 # Stored in BigInt[1 shl (c-1)]
let queueSize = 4*c*c - 16*c - 128
let numCollisions = float(inputSize*queueSize) / float(numNZBuckets)
let collisionPercentage = numCollisions / float(inputSize) * 100
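For example, at c = 16 there are 2^15 = 32768 non-zero buckets, so the collision map occupies ceilDiv_vartime(32768, 64) * 8 = 4096 bytes.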

View File

@ -223,8 +223,8 @@ func scalarMulGeneric*[EC](P: var EC, scalar: BigInt, window: static int = 5) =
## the scalar multiplication.
var
scratchSpace: array[1 shl window, EC]
scalarCanonicalBE: array[(scalar.bits+7) div 8, byte] # canonical big endian representation
scalarCanonicalBE.marshal(scalar, bigEndian) # Export is constant-time
scalarCanonicalBE: array[scalar.bits.ceilDiv_vartime(8), byte] # canonical big endian representation
scalarCanonicalBE.marshal(scalar, bigEndian) # Export is constant-time
P.scalarMulGeneric(scalarCanonicalBE, scratchSpace)
func scalarMul*[EC](

View File

@ -38,7 +38,7 @@ func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime
## This MUST NOT be used with secret data.
##
## This is highly VULNERABLE to timing attacks and power analysis attacks.
var scalarCanonical: array[(scalar.bits+7) div 8, byte]
var scalarCanonical: array[scalar.bits.ceilDiv_vartime(8), byte]
scalarCanonical.marshal(scalar, bigEndian)
var Paff {.noinit.}: affine(EC)

View File

@ -45,7 +45,7 @@ func computeBalancedChunks(start, stopEx, minChunkSize, maxChunkSize, targetNumC
cutoff = totalIters mod numChunks
elif baseChunkSize > maxChunkSize or (baseChunkSize == maxChunkSize and cutoff != 0):
# After cutoff, we do baseChunkSize+1, and would run afoul of the maxChunkSize constraint (unless no remainder), hence ceildiv
numChunks = (totalIters + maxChunkSize - 1) div maxChunkSize # ceildiv
numChunks = totalIters.ceilDiv_vartime(maxChunkSize)
baseChunkSize = totalIters div numChunks
cutoff = totalIters mod numChunks
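For instance, with totalIters = 100 and maxChunkSize = 30, this branch yields numChunks = ceilDiv_vartime(100, 30) = 4, baseChunkSize = 25 and cutoff = 0: four chunks of 25 iterations, none exceeding the maximum, whereas a plain 100 div 30 = 3 chunks would have forced chunks of 33-34 iterations.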

View File

@ -183,6 +183,6 @@ func powUnsafeExponent*[F; bits: static int](
## - memory access analysis
## - power analysis
## - timing analysis
var expBE {.noInit.}: array[(bits + 7) div 8, byte]
var expBE {.noInit.}: array[bits.ceilDiv_vartime(8), byte]
expBE.marshal(exponent, bigEndian)
a.powUnsafeExponent(expBE, window)

View File

@ -355,7 +355,7 @@ func marshal*(
## or zero-padded right for little-endian.
## I.e least significant bit is aligned to buffer boundary
debug:
doAssert dst.len >= (BigInt.bits + 7) div 8, block:
doAssert dst.len >= BigInt.bits.ceilDiv_vartime(8), block:
"BigInt -> Raw int conversion: destination buffer is too small\n" &
" bits: " & $BigInt.bits & "\n" &
" bytes allocated: " & $dst.len & '\n'
@ -389,7 +389,7 @@ func fromHex*(a: var BigInt, s: string) =
## Can work at compile-time to declare curve moduli from their hex strings
# 1. Convert to canonical uint
const canonLen = (BigInt.bits + 8 - 1) div 8
const canonLen = BigInt.bits.ceilDiv_vartime(8)
var bytes: array[canonLen, byte]
bytes.paddedFromHex(s, bigEndian)
@ -428,7 +428,7 @@ func appendHex*(dst: var string, big: BigInt, order: static Endianness = bigEndi
## This function may allocate.
# 1. Convert Big Int to canonical uint
const canonLen = (big.bits + 8 - 1) div 8
const canonLen = big.bits.ceilDiv_vartime(8)
var bytes: array[canonLen, byte]
marshal(bytes, big, order)

View File

@ -67,7 +67,7 @@ func roundNextMultipleOf(x: int, n: static int): int {.inline.} =
# n is a power of 2. (If compiler cannot prove that x>0 it does not make the optim)
result = (x + n - 1) and not(n - 1)
else:
result = ((x + n - 1) div n) * n
result = x.ceilDiv_vartime(n) * n
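For example, roundNextMultipleOf(13, 8) takes the power-of-two fast path, (13 + 7) and not 7 = 16, while roundNextMultipleOf(13, 6) takes the general path, ceilDiv_vartime(13, 6) * 6 = 18.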
# Stack allocation
# ----------------------------------------------------------------------------------

View File

@ -0,0 +1,77 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/macros
proc rebuildUntypedAst*(ast: NimNode, dropRootStmtList = false): NimNode =
## In some cases (generics or static proc) Nim gives us
## typed NimNode which are hard to process.
## This rebuilds an untyped AST.
##
## Additionally this allows dropping the root StmtList that
## may wrap the typed AST from early symbol resolution
proc rebuild(node: NimNode): NimNode =
proc defaultMultipleChildren(node: NimNode): NimNode =
var rTree = node.kind.newTree()
for child in node:
rTree.add rebuild(child)
return rTree
case node.kind:
of {nnkIdent, nnkSym}:
return ident($node)
of nnkEmpty:
return node
of nnkLiterals:
return node
of nnkHiddenStdConv:
if node[1].kind == nnkIntLit:
return node[1]
else:
expectKind(node[1], nnkSym)
return ident($node[1])
of nnkConv: # type conversion needs to be replaced by a function call in untyped AST
var rTree = nnkCall.newTree()
for child in node:
rTree.add rebuild(child)
return rTree
of {nnkCall, nnkInfix, nnkPrefix}:
if node[0].kind == nnkOpenSymChoice:
if node[0][0].eqIdent"contains":
var rTree = nnkInfix.newTree()
rTree.add ident"in"
rTree.add rebuild(node[2])
rTree.add rebuild(node[1])
return rTree
else:
var rTree = node.kind.newTree()
rTree.add rebuild(node[0][0])
for i in 1 ..< node.len:
rTree.add rebuild(node[i])
return rTree
elif node[0].kind == nnkClosedSymChoice:
if node[0][0].eqIdent"addr":
node.expectLen(1)
return nnkAddr.newTree(rebuild(node[1]))
else:
var rTree = node.kind.newTree()
rTree.add rebuild(node[0][0])
for i in 1 ..< node.len:
rTree.add rebuild(node[i])
return rTree
else:
return defaultMultipleChildren(node)
of nnkClosedSymChoice:
return rebuild(node[0])
else:
return defaultMultipleChildren(node)
if dropRootStmtList and ast.kind == nnkStmtList:
return rebuild(ast[0])
else:
result = rebuild(ast)
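A hypothetical usage sketch (the macro name and body are illustrative): a typed macro parameter arrives with symbols and hidden conversions already resolved, which breaks splicing into freshly generated code; rebuilding it as plain untyped idents lets the result be type-checked again in its new context:

import std/macros
# import ./ast_rebuilder   # this module

macro requote(body: typed): untyped =
  ## Re-emit a typed body as plain untyped AST so it can be
  ## spliced into generated code and type-checked again there.
  result = rebuildUntypedAst(body, dropRootStmtList = true)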

View File

@ -31,12 +31,12 @@ when GCC_Compatible:
0
else:
when sizeof(n) == 8:
cint(64) - builtin_clzll(n)
cint(63) - builtin_clzll(n)
else:
cint(31) - builtin_clz(n.uint32)
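The 64 → 63 change above fixes an off-by-one: for a 64-bit n, log2(n) = 63 - clz(n), so n = 1 (clz = 63) gives 0 and n = 2^63 (clz = 0) gives 63, whereas the old 64 - clz returned 1 and 64. The 32-bit branch, 31 - clz, was already correct.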
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zeros
## Compute the number of trailing zeros
## in the bit representation of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
@ -72,7 +72,7 @@ elif defined(icc):
if fnc(index.addr, v) == 0:
return default
return index.int
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
@ -82,7 +82,7 @@ elif defined(icc):
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, n.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:
@ -116,7 +116,7 @@ elif defined(vcc):
if fnc(index.addr, v) == 0:
return 0
return index.int
func log2_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
@ -126,7 +126,7 @@ elif defined(vcc):
bitscan(bitScanReverse64, n, default = 0)
else:
bitscan(bitScanReverse, n.uint32, default = 0)
func ctz_c_compiler_vartime*(n: SomeUnsignedInt): cint {.inline.} =
## Compute the number of trailing zero bits of n using compiler builtin
## ⚠ Depending on the compiler:

View File

@ -7,6 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/macros
import ../../ast_rebuilder
# ############################################################
#
@ -33,33 +34,6 @@ func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
# Macros
# ------------------------------------------------------------
proc replaceSymsByIdents*(ast: NimNode): NimNode =
proc inspect(node: NimNode): NimNode =
case node.kind:
of {nnkIdent, nnkSym}:
return ident($node)
of nnkEmpty:
return node
of nnkLiterals:
return node
of nnkHiddenStdConv:
if node[1].kind == nnkIntLit:
return node[1]
else:
expectKind(node[1], nnkSym)
return ident($node[1])
of nnkConv: # type conversion needs to be replaced by a function call in untyped AST
var rTree = nnkCall.newTree()
for child in node:
rTree.add inspect(child)
return rTree
else:
var rTree = node.kind.newTree()
for child in node:
rTree.add inspect(child)
return rTree
result = inspect(ast)
macro replacePragmasByInline(procAst: typed): untyped =
## Replace pragmas by the inline pragma
## We need a separate "typed" macro
@ -76,7 +50,7 @@ macro replacePragmasByInline(procAst: typed): untyped =
result.add newProc(
name = procAst.name,
params = params,
body = procAst.body.replaceSymsByIdents(),
body = procAst.body.rebuildUntypedAst(),
procType = nnkProcDef,
pragmas = nnkPragma.newTree(ident"inline", ident"nimcall")
)
@ -84,19 +58,19 @@ macro replacePragmasByInline(procAst: typed): untyped =
result.add nnkPragma.newTree(ident"pop")
macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
## Wraps pointer+len library calls in properly typed and converted openArray calls
## Wraps pointer+len library calls in properly typed and converted openArray calls
##
## ```
## {.push cdecl.}
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
## {.pop.}
## ```
##
##
## is transformed into
##
##
## ```
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
##
##
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
## ```

View File

@ -9,7 +9,7 @@
import
../../math/config/[curves, precompute],
../../math/io/io_bigints,
../primitives, ../bithacks, ../endians,
../primitives, ../bithacks, ../endians, ../codecs,
./llvm
# ############################################################

View File

@ -52,8 +52,25 @@ template debug*(body: untyped): untyped =
when defined(debugConstantine):
body
func unreachable*() {.noReturn.} =
proc builtin_unreachable(){.nodecl, importc: "__builtin_unreachable".}
func unreachable*() {.noReturn, inline.} =
doAssert false, "Unreachable"
when GCC_Compatible:
builtin_unreachable()
# ############################################################
#
# Arithmetic
#
# ############################################################
func ceilDiv_vartime*(a, b: auto): auto {.inline.} =
## ceil division, to be used only on length or at compile-time
## ceil(a / b)
# "LengthInDigits: static int" doesn't match "int"
# if "SomeInteger" is used instead of "autoi"
(a + b - 1) div b
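For example, ceilDiv_vartime(255, 64) == 4 while 255 div 64 == 3; the _vartime suffix signals that the division is not constant-time, hence the docstring's restriction to lengths and compile-time use.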
# ############################################################
#

View File

@ -32,16 +32,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
for i in start ..< stopEx:
result.add nnkBlockStmt.newTree(
ident("unrolledIter_" & $idx & $i),
body.replaceNodes(idx, newLit i)
)
body.replaceNodes(idx, newLit i))
macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped =
result = newStmtList()
for i in countdown(start, stopIncl):
result.add nnkBlockStmt.newTree(
ident("unrolledIter_" & $idx & $i),
body.replaceNodes(idx, newLit i)
)
body.replaceNodes(idx, newLit i))
{.experimental: "dynamicBindSym".}
@ -62,5 +60,4 @@ macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untype
for choice in choices:
result.add nnkBlockStmt.newTree(
ident($ident & "_" & $choice.intVal),
body.replaceNodes(ident, choice)
)
body.replaceNodes(ident, choice))

View File

@ -0,0 +1,9 @@
# Black & Scholes European Option Pricing
https://www.investopedia.com/terms/b/blackscholes.asp
Benchmarks from the PARSEC benchmark suite
https://parsec.cs.princeton.edu
Reference C implementation by Intel

View File

@ -0,0 +1,67 @@
Name: Black Scholes
Description: The Black-Scholes equation is a differential equation that
describes how, under a certain set of assumptions, the value of an option
changes as the price of the underlying asset changes.
The formula for a put option is similar. The cumulative normal distribution
function, CND(x), gives the probability that a normally distributed random
variable will have a value less than x. There is no closed-form expression for
this function, and as such it must be evaluated numerically. Alternatively,
the values of this function may be pre-computed and hard-coded in a table; in
this case, they can be obtained at runtime using table lookup. We compare both
of these approaches in our work. The other parameters are as follows:
S the underlying asset's current price, X the strike price, T the time to the
expiration date, r the risk-less rate of return, and v the stock's volatility.
Based on this formula, one can compute the option price analytically from
the five input parameters. Using this analytical approach to price options,
the limiting factor lies in the amount of floating-point calculation a
processor can perform.
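For reference, the closed-form price of a European call that the text alludes to (standard Black-Scholes, and what BlkSchlsEqEuroNoDiv below computes) is:

$$C = S\,N(d_1) - X e^{-rT} N(d_2), \qquad d_1 = \frac{\ln(S/X) + (r + v^2/2)\,T}{v\sqrt{T}}, \qquad d_2 = d_1 - v\sqrt{T}$$

with the put obtained from the same quantities as $P = X e^{-rT} N(-d_2) - S\,N(-d_1)$.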
Parallelization: Our parallelization algorithm is very simple: we simply price
multiple options in parallel using the Black-Scholes formula. Each thread prices an
individual option. In practice, financial houses price 10s to 100s of thousands of options using Black-Scholes.
=======================================
Programming Languages & Libraries:
C/C++ and Pthread is used to implement this benchmark.
=======================================
System requirements:
1) Intel(R) C++ Compiler: version 9.0 or higher
2) GNU gcc/g++: version 3.3 or higher
3) sed: version 4.0.9 or higher recommended.
The minimum required memory size is 140 MBytes.
=======================================
Input/Output:
The input data file of this benchmark includes an array of data of
options.
The output benchmark will output the price of the options based on the five
input parameters in the dataset file.
=======================================
Characteristics:
(1) Hotspot
Hotspot of the benchmark includes computing the price of options using the
Black-Scholes formula and the cumulative normal distribution function.
They are implemented in BlkSchlsEqEuroNoDiv and CNDF in "bs.c" respectively.
=======================================
Revision History
Date: Person-making-revision brief-description-of-revision
=======================================
Author: Victor Lee, Mikhail Smelyanskiy
Acknowledgements:
References:
[Black73] Black, Fischer, and M. Scholes. The Pricing of Options and Corporate Liabilities. Journal of Political Economy, 81:637--659, May--June 1973.

View File

@ -0,0 +1,509 @@
// Copyright (c) 2007 Intel Corp.
// Black-Scholes
// Analytical method for calculating European Options
//
//
// Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
// Hall, John C. Hull,
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#ifdef ENABLE_PARSEC_HOOKS
#include <hooks.h>
#endif
// Multi-threaded pthreads header
#ifdef ENABLE_THREADS
// Add the following line so that icc 9.0 is compatible with pthread lib.
#define __thread __threadp
MAIN_ENV
#undef __thread
#endif
// Multi-threaded OpenMP header
#ifdef ENABLE_OPENMP
#include <omp.h>
#endif
#ifdef ENABLE_TBB
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
using namespace std;
using namespace tbb;
#endif //ENABLE_TBB
// Multi-threaded header for Windows
#ifdef WIN32
#pragma warning(disable : 4305)
#pragma warning(disable : 4244)
#include <windows.h>
#endif
//Precision to use for calculations
#define fptype float
#define NUM_RUNS 100
typedef struct OptionData_ {
fptype s; // spot price
fptype strike; // strike price
fptype r; // risk-free interest rate
fptype divq; // dividend rate
fptype v; // volatility
fptype t; // time to maturity or option expiration in years
// (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
char OptionType; // Option type. "P"=PUT, "C"=CALL
fptype divs; // dividend vals (not used in this test)
fptype DGrefval; // DerivaGem Reference Value
} OptionData;
OptionData *data;
fptype *prices;
int numOptions;
int * otype;
fptype * sptprice;
fptype * strike;
fptype * rate;
fptype * volatility;
fptype * otime;
int numError = 0;
int nThreads;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Cumulative Normal Distribution Function
// See Hull, Section 11.8, P.243-244
#define inv_sqrt_2xPI 0.39894228040143270286
fptype CNDF ( fptype InputX )
{
int sign;
fptype OutputX;
fptype xInput;
fptype xNPrimeofX;
fptype expValues;
fptype xK2;
fptype xK2_2, xK2_3;
fptype xK2_4, xK2_5;
fptype xLocal, xLocal_1;
fptype xLocal_2, xLocal_3;
// Check for negative value of InputX
if (InputX < 0.0) {
InputX = -InputX;
sign = 1;
} else
sign = 0;
xInput = InputX;
// Compute NPrimeX term common to both four & six decimal accuracy calcs
expValues = exp(-0.5f * InputX * InputX);
xNPrimeofX = expValues;
xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI;
xK2 = 0.2316419 * xInput;
xK2 = 1.0 + xK2;
xK2 = 1.0 / xK2;
xK2_2 = xK2 * xK2;
xK2_3 = xK2_2 * xK2;
xK2_4 = xK2_3 * xK2;
xK2_5 = xK2_4 * xK2;
xLocal_1 = xK2 * 0.319381530;
xLocal_2 = xK2_2 * (-0.356563782);
xLocal_3 = xK2_3 * 1.781477937;
xLocal_2 = xLocal_2 + xLocal_3;
xLocal_3 = xK2_4 * (-1.821255978);
xLocal_2 = xLocal_2 + xLocal_3;
xLocal_3 = xK2_5 * 1.330274429;
xLocal_2 = xLocal_2 + xLocal_3;
xLocal_1 = xLocal_2 + xLocal_1;
xLocal = xLocal_1 * xNPrimeofX;
xLocal = 1.0 - xLocal;
OutputX = xLocal;
if (sign) {
OutputX = 1.0 - OutputX;
}
return OutputX;
}
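The polynomial above is the classic Abramowitz & Stegun approximation 26.2.17 of the normal CDF. With $k = 1/(1 + 0.2316419\,x)$:

$$N(x) \approx 1 - \frac{e^{-x^2/2}}{\sqrt{2\pi}} \left(a_1 k + a_2 k^2 + a_3 k^3 + a_4 k^4 + a_5 k^5\right)$$

where $a_1 = 0.319381530$, $a_2 = -0.356563782$, $a_3 = 1.781477937$, $a_4 = -1.821255978$, $a_5 = 1.330274429$ (absolute error below $7.5 \times 10^{-8}$), and inv_sqrt_2xPI is $1/\sqrt{2\pi}$. Negative inputs use the symmetry $N(-x) = 1 - N(x)$, which is the sign handling at the top of the function.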
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
fptype BlkSchlsEqEuroNoDiv( fptype sptprice,
fptype strike, fptype rate, fptype volatility,
fptype time, int otype, float timet )
{
fptype OptionPrice;
// local private working variables for the calculation
fptype xStockPrice;
fptype xStrikePrice;
fptype xRiskFreeRate;
fptype xVolatility;
fptype xTime;
fptype xSqrtTime;
fptype logValues;
fptype xLogTerm;
fptype xD1;
fptype xD2;
fptype xPowerTerm;
fptype xDen;
fptype d1;
fptype d2;
fptype FutureValueX;
fptype NofXd1;
fptype NofXd2;
fptype NegNofXd1;
fptype NegNofXd2;
xStockPrice = sptprice;
xStrikePrice = strike;
xRiskFreeRate = rate;
xVolatility = volatility;
xTime = time;
xSqrtTime = sqrt(xTime);
logValues = log( sptprice / strike );
xLogTerm = logValues;
xPowerTerm = xVolatility * xVolatility;
xPowerTerm = xPowerTerm * 0.5;
xD1 = xRiskFreeRate + xPowerTerm;
xD1 = xD1 * xTime;
xD1 = xD1 + xLogTerm;
xDen = xVolatility * xSqrtTime;
xD1 = xD1 / xDen;
xD2 = xD1 - xDen;
d1 = xD1;
d2 = xD2;
NofXd1 = CNDF( d1 );
NofXd2 = CNDF( d2 );
FutureValueX = strike * ( exp( -(rate)*(time) ) );
if (otype == 0) {
OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2);
} else {
NegNofXd1 = (1.0 - NofXd1);
NegNofXd2 = (1.0 - NofXd2);
OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1);
}
return OptionPrice;
}
#ifdef ENABLE_TBB
struct mainWork {
mainWork() {}
mainWork(mainWork &w, tbb::split) {}
void operator()(const tbb::blocked_range<int> &range) const {
fptype price;
int begin = range.begin();
int end = range.end();
for (int i=begin; i!=end; i++) {
/* Calling main function to calculate option value based on
* Black & Scholes's equation.
*/
price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],
rate[i], volatility[i], otime[i],
otype[i], 0);
prices[i] = price;
#ifdef ERR_CHK
fptype priceDelta = data[i].DGrefval - price;
if( fabs(priceDelta) >= 1e-5 ){
fprintf(stderr,"Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i, price, data[i].DGrefval, priceDelta);
numError ++;
}
#endif
}
}
};
#endif // ENABLE_TBB
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TBB
int bs_thread(void *tid_ptr) {
int j;
tbb::affinity_partitioner a;
mainWork doall;
for (j=0; j<NUM_RUNS; j++) {
tbb::parallel_for(tbb::blocked_range<int>(0, numOptions), doall, a);
}
return 0;
}
#else // !ENABLE_TBB
#ifdef WIN32
DWORD WINAPI bs_thread(LPVOID tid_ptr){
#else
int bs_thread(void *tid_ptr) {
#endif
int i, j;
fptype price;
fptype priceDelta;
int tid = *(int *)tid_ptr;
int start = tid * (numOptions / nThreads);
int end = start + (numOptions / nThreads);
for (j=0; j<NUM_RUNS; j++) {
#ifdef ENABLE_OPENMP
#pragma omp parallel for private(i, price, priceDelta)
for (i=0; i<numOptions; i++) {
#else //ENABLE_OPENMP
for (i=start; i<end; i++) {
#endif //ENABLE_OPENMP
/* Calling main function to calculate option value based on
* Black & Scholes's equation.
*/
price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],
rate[i], volatility[i], otime[i],
otype[i], 0);
prices[i] = price;
#ifdef ERR_CHK
priceDelta = data[i].DGrefval - price;
if( fabs(priceDelta) >= 1e-4 ){
printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i, price, data[i].DGrefval, priceDelta);
numError ++;
}
#endif
}
}
return 0;
}
#endif //ENABLE_TBB
int main (int argc, char **argv)
{
FILE *file;
int i;
int loopnum;
fptype * buffer;
int * buffer2;
int rv;
#ifdef PARSEC_VERSION
#define __PARSEC_STRING(x) #x
#define __PARSEC_XSTRING(x) __PARSEC_STRING(x)
printf("PARSEC Benchmark Suite Version "__PARSEC_XSTRING(PARSEC_VERSION)"\n");
fflush(NULL);
#else
printf("PARSEC Benchmark Suite\n");
fflush(NULL);
#endif //PARSEC_VERSION
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_begin(__parsec_blackscholes);
#endif
if (argc != 4)
{
printf("Usage:\n\t%s <nthreads> <inputFile> <outputFile>\n", argv[0]);
exit(1);
}
nThreads = atoi(argv[1]);
char *inputFile = argv[2];
char *outputFile = argv[3];
//Read input data from file
file = fopen(inputFile, "r");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", inputFile);
exit(1);
}
rv = fscanf(file, "%i", &numOptions);
if(rv != 1) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
if(nThreads > numOptions) {
printf("WARNING: Not enough work, reducing number of threads to match number of options.\n");
nThreads = numOptions;
}
#if !defined(ENABLE_THREADS) && !defined(ENABLE_OPENMP) && !defined(ENABLE_TBB)
if(nThreads != 1) {
printf("Error: <nthreads> must be 1 (serial version)\n");
exit(1);
}
#endif
// alloc spaces for the option data
data = (OptionData*)malloc(numOptions*sizeof(OptionData));
prices = (fptype*)malloc(numOptions*sizeof(fptype));
for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
{
rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);
if(rv != 9) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", inputFile);
exit(1);
}
#ifdef ENABLE_THREADS
MAIN_INITENV(,8000000,nThreads);
#endif
printf("Num of Options: %d\n", numOptions);
printf("Num of Runs: %d\n", NUM_RUNS);
#define PAD 256
#define LINESIZE 64
buffer = (fptype *) malloc(5 * numOptions * sizeof(fptype) + PAD);
sptprice = (fptype *) (((unsigned long long)buffer + PAD) & ~(LINESIZE - 1));
strike = sptprice + numOptions;
rate = strike + numOptions;
volatility = rate + numOptions;
otime = volatility + numOptions;
buffer2 = (int *) malloc(numOptions * sizeof(fptype) + PAD);
otype = (int *) (((unsigned long long)buffer2 + PAD) & ~(LINESIZE - 1));
for (i=0; i<numOptions; i++) {
otype[i] = (data[i].OptionType == 'P') ? 1 : 0;
sptprice[i] = data[i].s;
strike[i] = data[i].strike;
rate[i] = data[i].r;
volatility[i] = data[i].v;
otime[i] = data[i].t;
}
printf("Size of data: %d\n", numOptions * (sizeof(OptionData) + sizeof(int)));
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_begin();
#endif
#ifdef ENABLE_THREADS
#ifdef WIN32
HANDLE *threads;
int *nums;
threads = (HANDLE *) malloc (nThreads * sizeof(HANDLE));
nums = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
nums[i] = i;
threads[i] = CreateThread(0, 0, bs_thread, &nums[i], 0, 0);
}
WaitForMultipleObjects(nThreads, threads, TRUE, INFINITE);
free(threads);
free(nums);
#else
int *tids;
tids = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
tids[i]=i;
CREATE_WITH_ARG(bs_thread, &tids[i]);
}
WAIT_FOR_END(nThreads);
free(tids);
#endif //WIN32
#else //ENABLE_THREADS
#ifdef ENABLE_OPENMP
{
int tid=0;
omp_set_num_threads(nThreads);
bs_thread(&tid);
}
#else //ENABLE_OPENMP
#ifdef ENABLE_TBB
tbb::task_scheduler_init init(nThreads);
int tid=0;
bs_thread(&tid);
#else //ENABLE_TBB
//serial version
int tid=0;
bs_thread(&tid);
#endif //ENABLE_TBB
#endif //ENABLE_OPENMP
#endif //ENABLE_THREADS
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_end();
#endif
//Write prices to output file
file = fopen(outputFile, "w");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", outputFile);
exit(1);
}
rv = fprintf(file, "%i\n", numOptions);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
for(i=0; i<numOptions; i++) {
rv = fprintf(file, "%.18f\n", prices[i]);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", outputFile);
exit(1);
}
#ifdef ERR_CHK
printf("Num Errors: %d\n", numError);
#endif
free(data);
free(prices);
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_end();
#endif
return 0;
}

View File

@ -0,0 +1,598 @@
// Copyright (c) 2007 Intel Corp.
// Black-Scholes
// Analytical method for calculating European Options
//
//
// Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
// Hall, John C. Hull,
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#ifndef WIN32
#include <pmmintrin.h>
#else
#include <xmmintrin.h>
#endif
#ifdef ENABLE_PARSEC_HOOKS
#include <hooks.h>
#endif
// Multi-threaded pthreads header
#ifdef ENABLE_THREADS
// Add the following line so that icc 9.0 is compatible with pthread lib.
#define __thread __threadp
MAIN_ENV
#undef __thread
#endif
// Multi-threaded OpenMP header
#ifdef ENABLE_OPENMP
#include <omp.h>
#endif
#ifdef ENABLE_TBB
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
using namespace std;
using namespace tbb;
#endif //ENABLE_TBB
// Multi-threaded header for Windows
#ifdef WIN32
#pragma warning(disable : 4305)
#pragma warning(disable : 4244)
#include <windows.h>
#endif
#ifdef __GNUC__
#define _MM_ALIGN16 __attribute__((aligned (16)))
#define MUSTINLINE __attribute__((always_inline))
#else
#define MUSTINLINE __forceinline
#endif
// NCO = Number of Concurrent Options = SIMD Width
// NCO is currently set in the Makefile.
#ifndef NCO
#error NCO must be defined.
#endif
#if (NCO==2)
#define fptype double
#define SIMD_WIDTH 2
#define _MMR __m128d
#define _MM_LOAD _mm_load_pd
#define _MM_STORE _mm_store_pd
#define _MM_MUL _mm_mul_pd
#define _MM_ADD _mm_add_pd
#define _MM_SUB _mm_sub_pd
#define _MM_DIV _mm_div_pd
#define _MM_SQRT _mm_sqrt_pd
#define _MM_SET(A) _mm_set_pd(A,A)
#define _MM_SETR _mm_set_pd
#endif
#if (NCO==4)
#define fptype float
#define SIMD_WIDTH 4
#define _MMR __m128
#define _MM_LOAD _mm_load_ps
#define _MM_STORE _mm_store_ps
#define _MM_MUL _mm_mul_ps
#define _MM_ADD _mm_add_ps
#define _MM_SUB _mm_sub_ps
#define _MM_DIV _mm_div_ps
#define _MM_SQRT _mm_sqrt_ps
#define _MM_SET(A) _mm_set_ps(A,A,A,A)
#define _MM_SETR _mm_set_ps
#endif
#define NUM_RUNS 100
typedef struct OptionData_ {
fptype s; // spot price
fptype strike; // strike price
fptype r; // risk-free interest rate
fptype divq; // dividend rate
fptype v; // volatility
fptype t; // time to maturity or option expiration in years
// (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
char OptionType; // Option type. "P"=PUT, "C"=CALL
fptype divs; // dividend vals (not used in this test)
fptype DGrefval; // DerivaGem Reference Value
} OptionData;
_MM_ALIGN16 OptionData* data;
_MM_ALIGN16 fptype* prices;
int numOptions;
int * otype;
fptype * sptprice;
fptype * strike;
fptype * rate;
fptype * volatility;
fptype * otime;
int numError = 0;
int nThreads;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Cumulative Normal Distribution Function
// See Hull, Section 11.8, P.243-244
#define inv_sqrt_2xPI 0.39894228040143270286
MUSTINLINE void CNDF ( fptype * OutputX, fptype * InputX )
{
int sign[SIMD_WIDTH];
int i;
_MMR xInput;
_MMR xNPrimeofX;
_MM_ALIGN16 fptype expValues[SIMD_WIDTH];
_MMR xK2;
_MMR xK2_2, xK2_3, xK2_4, xK2_5;
_MMR xLocal, xLocal_1, xLocal_2, xLocal_3;
for (i=0; i<SIMD_WIDTH; i++) {
// Check for negative value of InputX
if (InputX[i] < 0.0) {
InputX[i] = -InputX[i];
sign[i] = 1;
} else
sign[i] = 0;
}
// printf("InputX[0]=%lf\n", InputX[0]);
// printf("InputX[1]=%lf\n", InputX[1]);
xInput = _MM_LOAD(InputX);
// local vars
// Compute NPrimeX term common to both four & six decimal accuracy calcs
for (i=0; i<SIMD_WIDTH; i++) {
expValues[i] = exp(-0.5f * InputX[i] * InputX[i]);
// printf("exp[%d]: %f\n", i, expValues[i]);
}
xNPrimeofX = _MM_LOAD(expValues);
xNPrimeofX = _MM_MUL(xNPrimeofX, _MM_SET(inv_sqrt_2xPI));
xK2 = _MM_MUL(_MM_SET(0.2316419), xInput);
xK2 = _MM_ADD(xK2, _MM_SET(1.0));
xK2 = _MM_DIV(_MM_SET(1.0), xK2);
// xK2 = _mm_rcp_pd(xK2); // No rcp function for double-precision
xK2_2 = _MM_MUL(xK2, xK2);
xK2_3 = _MM_MUL(xK2_2, xK2);
xK2_4 = _MM_MUL(xK2_3, xK2);
xK2_5 = _MM_MUL(xK2_4, xK2);
xLocal_1 = _MM_MUL(xK2, _MM_SET(0.319381530));
xLocal_2 = _MM_MUL(xK2_2, _MM_SET(-0.356563782));
xLocal_3 = _MM_MUL(xK2_3, _MM_SET(1.781477937));
xLocal_2 = _MM_ADD(xLocal_2, xLocal_3);
xLocal_3 = _MM_MUL(xK2_4, _MM_SET(-1.821255978));
xLocal_2 = _MM_ADD(xLocal_2, xLocal_3);
xLocal_3 = _MM_MUL(xK2_5, _MM_SET(1.330274429));
xLocal_2 = _MM_ADD(xLocal_2, xLocal_3);
xLocal_1 = _MM_ADD(xLocal_2, xLocal_1);
xLocal = _MM_MUL(xLocal_1, xNPrimeofX);
xLocal = _MM_SUB(_MM_SET(1.0), xLocal);
_MM_STORE(OutputX, xLocal);
// _mm_storel_pd(&OutputX[0], xLocal);
// _mm_storeh_pd(&OutputX[1], xLocal);
for (i=0; i<SIMD_WIDTH; i++) {
if (sign[i]) {
OutputX[i] = (1.0 - OutputX[i]);
}
}
}
// For debugging
void print_xmm(_MMR in, char* s) {
int i;
_MM_ALIGN16 fptype val[SIMD_WIDTH];
_MM_STORE(val, in);
printf("%s: ", s);
for (i=0; i<SIMD_WIDTH; i++) {
printf("%f ", val[i]);
}
printf("\n");
}
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
void BlkSchlsEqEuroNoDiv (fptype * OptionPrice, int numOptions, fptype * sptprice,
fptype * strike, fptype * rate, fptype * volatility,
fptype * time, int * otype, float timet)
{
int i;
// local private working variables for the calculation
_MMR xStockPrice;
_MMR xStrikePrice;
_MMR xRiskFreeRate;
_MMR xVolatility;
_MMR xTime;
_MMR xSqrtTime;
_MM_ALIGN16 fptype logValues[NCO];
_MMR xLogTerm;
_MMR xD1, xD2;
_MMR xPowerTerm;
_MMR xDen;
_MM_ALIGN16 fptype d1[SIMD_WIDTH];
_MM_ALIGN16 fptype d2[SIMD_WIDTH];
_MM_ALIGN16 fptype FutureValueX[SIMD_WIDTH];
_MM_ALIGN16 fptype NofXd1[SIMD_WIDTH];
_MM_ALIGN16 fptype NofXd2[SIMD_WIDTH];
_MM_ALIGN16 fptype NegNofXd1[SIMD_WIDTH];
_MM_ALIGN16 fptype NegNofXd2[SIMD_WIDTH];
xStockPrice = _MM_LOAD(sptprice);
xStrikePrice = _MM_LOAD(strike);
xRiskFreeRate = _MM_LOAD(rate);
xVolatility = _MM_LOAD(volatility);
xTime = _MM_LOAD(time);
xSqrtTime = _MM_SQRT(xTime);
for(i=0; i<SIMD_WIDTH;i ++) {
logValues[i] = log(sptprice[i] / strike[i]);
}
xLogTerm = _MM_LOAD(logValues);
xPowerTerm = _MM_MUL(xVolatility, xVolatility);
xPowerTerm = _MM_MUL(xPowerTerm, _MM_SET(0.5));
xD1 = _MM_ADD(xRiskFreeRate, xPowerTerm);
xD1 = _MM_MUL(xD1, xTime);
xD1 = _MM_ADD(xD1, xLogTerm);
xDen = _MM_MUL(xVolatility, xSqrtTime);
xD1 = _MM_DIV(xD1, xDen);
xD2 = _MM_SUB(xD1, xDen);
_MM_STORE(d1, xD1);
_MM_STORE(d2, xD2);
CNDF( NofXd1, d1 );
CNDF( NofXd2, d2 );
for (i=0; i<SIMD_WIDTH; i++) {
FutureValueX[i] = strike[i] * (exp(-(rate[i])*(time[i])));
// printf("FV=%lf\n", FutureValueX[i]);
// NofXd1[i] = NofX(d1[i]);
// NofXd2[i] = NofX(d2[i]);
// printf("NofXd1=%lf\n", NofXd1[i]);
// printf("NofXd2=%lf\n", NofXd2[i]);
if (otype[i] == 0) {
OptionPrice[i] = (sptprice[i] * NofXd1[i]) - (FutureValueX[i] * NofXd2[i]);
}
else {
NegNofXd1[i] = (1.0 - (NofXd1[i]));
NegNofXd2[i] = (1.0 - (NofXd2[i]));
OptionPrice[i] = (FutureValueX[i] * NegNofXd2[i]) - (sptprice[i] * NegNofXd1[i]);
}
// printf("OptionPrice[0] = %lf\n", OptionPrice[i]);
}
}
#ifdef ENABLE_TBB
struct mainWork {
mainWork(){}
mainWork(mainWork &w, tbb::split){}
void operator()(const tbb::blocked_range<int> &range) const {
fptype price[NCO];
fptype priceDelta;
int begin = range.begin();
int end = range.end();
for (int i=begin; i!=end; i+=NCO) {
/* Calling main function to calculate option value based on
* Black & Scholes's equation.
*/
BlkSchlsEqEuroNoDiv( price, NCO, &(sptprice[i]), &(strike[i]),
&(rate[i]), &(volatility[i]), &(otime[i]),
&(otype[i]), 0);
for (int k=0; k<NCO; k++) {
prices[i+k] = price[k];
#ifdef ERR_CHK
priceDelta = data[i+k].DGrefval - price[k];
if( fabs(priceDelta) >= 1e-5 ){
fprintf(stderr,"Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i+k, price[k], data[i+k].DGrefval, priceDelta);
numError ++;
}
#endif
}
}
}
};
#endif // ENABLE_TBB
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TBB
int bs_thread(void *tid_ptr) {
int j;
tbb::affinity_partitioner a;
mainWork doall;
for (j=0; j<NUM_RUNS; j++) {
tbb::parallel_for(tbb::blocked_range<int>(0, numOptions), doall, a);
}
return 0;
}
#else // !ENABLE_TBB
#ifdef WIN32
DWORD WINAPI bs_thread(LPVOID tid_ptr){
#else
int bs_thread(void *tid_ptr) {
#endif
int i, j, k;
fptype price[NCO];
fptype priceDelta;
int tid = *(int *)tid_ptr;
int start = tid * (numOptions / nThreads);
int end = start + (numOptions / nThreads);
for (j=0; j<NUM_RUNS; j++) {
#ifdef ENABLE_OPENMP
#pragma omp parallel for private(i, price, priceDelta)
for (i=0; i<numOptions; i += NCO) {
#else //ENABLE_OPENMP
for (i=start; i<end; i += NCO) {
#endif //ENABLE_OPENMP
// Calling main function to calculate option value based on Black & Scholes's
// equation.
BlkSchlsEqEuroNoDiv(price, NCO, &(sptprice[i]), &(strike[i]),
&(rate[i]), &(volatility[i]), &(otime[i]), &(otype[i]), 0);
for (k=0; k<NCO; k++) {
prices[i+k] = price[k];
}
#ifdef ERR_CHK
for (k=0; k<NCO; k++) {
priceDelta = data[i+k].DGrefval - price[k];
if (fabs(priceDelta) >= 1e-4) {
printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i + k, price[k], data[i+k].DGrefval, priceDelta);
numError ++;
}
}
#endif
}
}
return 0;
}
#endif //ENABLE_TBB
int main (int argc, char **argv)
{
FILE *file;
int i;
int loopnum;
fptype * buffer;
int * buffer2;
int rv;
#ifdef PARSEC_VERSION
#define __PARSEC_STRING(x) #x
#define __PARSEC_XSTRING(x) __PARSEC_STRING(x)
printf("PARSEC Benchmark Suite Version "__PARSEC_XSTRING(PARSEC_VERSION)"\n");
fflush(NULL);
#else
printf("PARSEC Benchmark Suite\n");
fflush(NULL);
#endif //PARSEC_VERSION
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_begin(__parsec_blackscholes);
#endif
if (argc != 4)
{
printf("Usage:\n\t%s <nthreads> <inputFile> <outputFile>\n", argv[0]);
exit(1);
}
nThreads = atoi(argv[1]);
char *inputFile = argv[2];
char *outputFile = argv[3];
//Read input data from file
file = fopen(inputFile, "r");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", inputFile);
exit(1);
}
rv = fscanf(file, "%i", &numOptions);
if(rv != 1) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
if(NCO > numOptions) {
printf("ERROR: Not enough work for SIMD operation.\n");
fclose(file);
exit(1);
}
if(nThreads > numOptions/NCO) {
printf("WARNING: Not enough work, reducing number of threads to match number of SIMD options packets.\n");
nThreads = numOptions/NCO;
}
#if !defined(ENABLE_THREADS) && !defined(ENABLE_OPENMP) && !defined(ENABLE_TBB)
if(nThreads != 1) {
printf("Error: <nthreads> must be 1 (serial version)\n");
exit(1);
}
#endif
data = (OptionData*)malloc(numOptions*sizeof(OptionData));
prices = (fptype*)malloc(numOptions*sizeof(fptype));
for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
{
rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);
if(rv != 9) {
printf("ERROR: Unable to read from file `%s'.\n", inputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", inputFile);
exit(1);
}
#ifdef ENABLE_THREADS
MAIN_INITENV(,8000000,nThreads);
#endif
printf("Num of Options: %d\n", numOptions);
printf("Num of Runs: %d\n", NUM_RUNS);
#define PAD 256
#define LINESIZE 64
buffer = (fptype *) malloc(5 * numOptions * sizeof(fptype) + PAD);
sptprice = (fptype *) (((unsigned long long)buffer + PAD) & ~(LINESIZE - 1));
strike = sptprice + numOptions;
rate = strike + numOptions;
volatility = rate + numOptions;
otime = volatility + numOptions;
buffer2 = (int *) malloc(numOptions * sizeof(fptype) + PAD);
otype = (int *) (((unsigned long long)buffer2 + PAD) & ~(LINESIZE - 1));
for (i=0; i<numOptions; i++) {
otype[i] = (data[i].OptionType == 'P') ? 1 : 0;
sptprice[i] = data[i].s;
strike[i] = data[i].strike;
rate[i] = data[i].r;
volatility[i] = data[i].v;
otime[i] = data[i].t;
}
printf("Size of data: %d\n", numOptions * (sizeof(OptionData) + sizeof(int)));
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_begin();
#endif
#ifdef ENABLE_THREADS
#ifdef WIN32
HANDLE *threads;
int *nums;
threads = (HANDLE *) malloc (nThreads * sizeof(HANDLE));
nums = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
nums[i] = i;
threads[i] = CreateThread(0, 0, bs_thread, &nums[i], 0, 0);
}
WaitForMultipleObjects(nThreads, threads, TRUE, INFINITE);
free(threads);
free(nums);
#else
int *tids;
tids = (int *) malloc (nThreads * sizeof(int));
for(i=0; i<nThreads; i++) {
tids[i]=i;
CREATE_WITH_ARG(bs_thread, &tids[i]);
}
WAIT_FOR_END(nThreads);
free(tids);
#endif //WIN32
#else //ENABLE_THREADS
#ifdef ENABLE_OPENMP
{
int tid=0;
omp_set_num_threads(nThreads);
bs_thread(&tid);
}
#else //ENABLE_OPENMP
#ifdef ENABLE_TBB
tbb::task_scheduler_init init(nThreads);
int tid=0;
bs_thread(&tid);
#else //ENABLE_TBB
//serial version
int tid=0;
bs_thread(&tid);
#endif //ENABLE_TBB
#endif //ENABLE_OPENMP
#endif //ENABLE_THREADS
#ifdef ENABLE_PARSEC_HOOKS
__parsec_roi_end();
#endif
//Write prices to output file
file = fopen(outputFile, "w");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", outputFile);
exit(1);
}
rv = fprintf(file, "%i\n", numOptions);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
for(i=0; i<numOptions; i++) {
rv = fprintf(file, "%.18f\n", prices[i]);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", outputFile);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", outputFile);
exit(1);
}
#ifdef ERR_CHK
printf("Num Errors: %d\n", numError);
#endif
free(data);
free(prices);
#ifdef ENABLE_PARSEC_HOOKS
__parsec_bench_end();
#endif
return 0;
}

View File

@ -0,0 +1,122 @@
define(EXTERN_ENV,
`
#include <time.h>
extern pthread_t _M4_threads[MAX_THREADS];
extern pthread_mutexattr_t _M4_normalMutexAttr;
extern int _M4_numThreads;
')
define(MAIN_ENV,
`
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 700
#endif
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#ifndef __USE_XOPEN2K
#define __USE_XOPEN2K
#endif
#ifndef __USE_UNIX98
#define __USE_UNIX98
#endif
#include <pthread.h>
#include <time.h>
#define MAX_THREADS 128
pthread_t _M4_threadsTable[MAX_THREADS];
int _M4_threadsTableAllocated[MAX_THREADS];
pthread_mutexattr_t _M4_normalMutexAttr;
int _M4_numThreads = MAX_THREADS;
')
define(MAIN_INITENV, `
pthread_mutexattr_init( &_M4_normalMutexAttr);
// pthread_mutexattr_settype( &_M4_normalMutexAttr, PTHREAD_MUTEX_NORMAL);
_M4_numThreads = $3;
{
int _M4_i;
for ( _M4_i = 0; _M4_i < MAX_THREADS; _M4_i++) {
_M4_threadsTableAllocated[_M4_i] = 0;
}
}
')
define(MAIN_END, `')
define(CREATE_WITH_ARG, `
{
int _M4_i;
for ( _M4_i = 0; _M4_i < MAX_THREADS; _M4_i++) {
if ( _M4_threadsTableAllocated[_M4_i] == 0) break;
}
pthread_create(&_M4_threadsTable[_M4_i],NULL,(void *(*)(void *))$1,(void *)$2);
_M4_threadsTableAllocated[_M4_i] = 1;
}
')
define(CREATE, `CREATE_WITH_ARG($1,NULL);')
define(SELF, `( long)pthread_self()')
define(BARDEC, `pthread_barrier_t $1;')
define(BARINIT, `pthread_barrier_init(&($1),NULL,_M4_numThreads);')
define(BARRIER, `pthread_barrier_wait(&($1))')
define(LOCKDEC, `pthread_mutex_t $1;')
define(LOCKINIT, `pthread_mutex_init(&($1), &_M4_normalMutexAttr);')
define(LOCK, `pthread_mutex_lock(&($1));')
define(UNLOCK, `pthread_mutex_unlock(&($1));')
define(LOCKRDEC, `pthread_mutex_t $1;')
define(LOCKRINIT, `pthread_mutex_init(&($1), NULL);')
define(LOCKR, `pthread_mutex_lock(&($1));')
define(UNLOCKR, `pthread_mutex_unlock(&($1));')
define(CVDEC, `pthread_cond_t $1;')
define(CVINIT, `pthread_cond_init(&$1,NULL);')
define(CVWAIT, `pthread_cond_wait(&$1,&$2);')
define(CVWAITREL, `pthread_cond_wait(&$1,&$2); pthread_mutex_unlock(&$2);')
define(CVSIGNALALL, `pthread_cond_broadcast(&$1);')
define(CVSIGNALONE, `pthread_cond_signal(&$1);')
define(ACQUIRE, `pthread_mutex_lock(&($1));')
define(RELEASE, `pthread_mutex_unlock(&($1));')
define(ALOCKDEC, `pthread_mutex_t ($1[$2]);')
define(ALOCKINIT, `{
int _M4_loop_j;
for(_M4_loop_j=0; _M4_loop_j < $2; _M4_loop_j++){
pthread_mutex_init((pthread_mutex_t*)&($1[_M4_loop_j]), NULL);
}
}')
define(ALOCK, `pthread_mutex_lock((pthread_mutex_t*)&($1[$2]));')
define(AULOCK, `pthread_mutex_unlock((pthread_mutex_t*)&($1[$2]));')
define(AACQUIRE, `pthread_mutex_lock(&($1[$2]));')
define(ARELEASE, `pthread_mutex_unlock(&($1[$2]));')
define(WAIT_FOR_END, `
{
int _M4_i;
void *_M4_ret;
for ( _M4_i = 0; _M4_i < MAX_THREADS;_M4_i++) {
if ( _M4_threadsTableAllocated[_M4_i] == 0) break;
pthread_join( _M4_threadsTable[_M4_i], &_M4_ret);
}
}
')
define(JOIN, `{pthread_join( _M4_threadsTable[($1)], NULL );}')
define(CLOCK, `{long time(); ($1) = time(0);}')
define(GET_PID, `$1 = pthread_self();')
define(AUG_ON, `')
define(AUG_OFF, `')
define(G_MALLOC, `malloc($1);')
define(MALLOC, `malloc($1);')

View File

@ -0,0 +1,85 @@
//Copyright (c) 2009 Princeton University
//Written by Christian Bienia
//Generate input files for blackscholes benchmark
#include <stdio.h>
#include <stdlib.h>
//Precision to use
#define fptype double
typedef struct OptionData_ {
fptype s; // spot price
fptype strike; // strike price
fptype r; // risk-free interest rate
fptype divq; // dividend rate
fptype v; // volatility
fptype t; // time to maturity or option expiration in years
// (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
const char *OptionType; // Option type. "P"=PUT, "C"=CALL
fptype divs; // dividend vals (not used in this test)
fptype DGrefval; // DerivaGem Reference Value
} OptionData;
//Total number of options in optionData.txt
#define MAX_OPTIONS 1000
OptionData data_init[] = {
#include "optionData.txt"
};
int main (int argc, char **argv) {
int numOptions;
char *fileName;
int rv;
int i;
if (argc != 3) {
printf("Usage:\n\t%s <numOptions> <fileName>\n", argv[0]);
exit(1);
}
numOptions = atoi(argv[1]);
fileName = argv[2];
if(numOptions < 1) {
printf("ERROR: Number of options must at least be 1.\n");
exit(1);
}
FILE *file;
file = fopen(fileName, "w");
if(file == NULL) {
printf("ERROR: Unable to open file `%s'.\n", fileName);
exit(1);
}
//write number of options
rv = fprintf(file, "%i\n", numOptions);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", fileName);
fclose(file);
exit(1);
}
//write values for options
for(i=0; i<numOptions; i++) {
//NOTE: DG RefValues specified exceed double precision, output will deviate
rv = fprintf(file, "%.2f %.2f %.4f %.2f %.2f %.2f %c %.2f %.18f\n", data_init[i % MAX_OPTIONS].s, data_init[i % MAX_OPTIONS].strike, data_init[i % MAX_OPTIONS].r, data_init[i % MAX_OPTIONS].divq, data_init[i % MAX_OPTIONS].v, data_init[i % MAX_OPTIONS].t, data_init[i % MAX_OPTIONS].OptionType[0], data_init[i % MAX_OPTIONS].divs, data_init[i % MAX_OPTIONS].DGrefval);
if(rv < 0) {
printf("ERROR: Unable to write to file `%s'.\n", fileName);
fclose(file);
exit(1);
}
}
rv = fclose(file);
if(rv != 0) {
printf("ERROR: Unable to close file `%s'.\n", fileName);
exit(1);
}
return 0;
}

View File

@ -0,0 +1,366 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Reference implementation from
# // Copyright (c) 2007 Intel Corp.
#
# // Black-Scholes
# // Analytical method for calculating European Options
# //
# //
# // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
# // Hall, John C. Hull,
#
# File manipulation routines ported from C++-Taskflow
import
# Stdlib
strformat, os, strutils, math, system/ansi_c,
cpuinfo, streams, strscans,
# bench
../wtime, ../resources
# Types
# --------------------------------
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}
type
OptionKind = enum
Put
Call
OptionData[T: SomeFloat] = object
spot: T # Spot price
strike: T # Strike price
riskfree: T # risk-free rate
divrate: T # dividend rate
vol: T # volatility
expiry: T # time to maturity or option expiration in years
# (1 year = 1.0, 6 months = 0.5)
kind: OptionKind
divvals: T # Dividend values (not used in this test)
dgrefval: T # DerivaGem reference value
Context[T: SomeFloat] = object
data: ptr UncheckedArray[OptionData[T]]
prices: ptr UncheckedArray[T]
numOptions: int
numRuns: int
otype: ptr UncheckedArray[OptionKind]
spot: ptr UncheckedArray[T]
strike: ptr UncheckedArray[T]
riskFreeRate: ptr UncheckedArray[T]
volatility: ptr UncheckedArray[T]
expiry: ptr UncheckedArray[T]
numErrors: int
# Helpers
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
proc initialize[T](ctx: var Context[T], numOptions: int) =
ctx.numOptions = numOptions
ctx.data = wv_alloc(OptionData[T], numOptions)
ctx.prices = wv_alloc(T, numOptions)
ctx.otype = wv_alloc(OptionKind, numOptions)
ctx.spot = wv_alloc(T, numOptions)
ctx.strike = wv_alloc(T, numOptions)
ctx.riskFreeRate = wv_alloc(T, numOptions)
ctx.volatility = wv_alloc(T, numOptions)
ctx.expiry = wv_alloc(T, numOptions)
proc delete[T](ctx: sink Context[T]) =
wv_free ctx.data
wv_free ctx.prices
wv_free ctx.otype
wv_free ctx.spot
wv_free ctx.strike
wv_free ctx.riskFreeRate
wv_free ctx.volatility
wv_free ctx.expiry
# Cumulative Normal Distribution Function
# ---------------------------------------------------
# See Hull, Section 11.8, P.243-244
const InvSqrt2xPI = 0.39894228040143270286
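# The polynomial below is the Abramowitz & Stegun 26.2.17 approximation (|error| < 7.5e-8);
# InvSqrt2xPI is 1/sqrt(2*π), the normalization constant of the standard normal PDF.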
func cumulNormalDist[T: SomeFloat](inputX: T): T =
# Check for negative value of inputX
var isNegative = false
var inputX = inputX
if inputX < 0.T:
inputX = -inputX
isNegative = true
let xInput = inputX
# Compute NPrimeX term common to both four & six decimal accuracy calcs
let expValues = exp(-0.5.T * inputX * inputX)
let xNPrimeofX = expValues * InvSqrt2xPI
let
xK2 = 1.0 / (1.0 + 0.2316419*xInput)
xK2_2 = xK2 * xK2
xK2_3 = xK2_2 * xK2
xK2_4 = xK2_3 * xK2
xK2_5 = xK2_4 * xK2
var
xLocal_1 = xK2 * 0.319381530
xLocal_2 = xK2_2 * -0.356563782
xLocal_3 = xK2_3 * 1.781477937
xLocal_2 += xLocal_3
xLocal_3 = xK2_4 * -1.821255978
xLocal_2 += xLocal_3
xLocal_3 = xK2_5 * 1.330274429
xLocal_2 += xLocal_3
xLocal_1 += xLocal_2
result = 1.T - xLocal_1 * xNPrimeofX
if isNegative:
result = 1.T - result
# Black & Scholes
# ---------------------------------------------------
func blackScholesEqEuroNoDiv[T](
spot, strike, riskFreeRate, volatility, expiry: T,
otype: OptionKind,
timet: float32
): T =
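# Black-Scholes for a European option without dividends:
#   d1 = (ln(S/K) + (r + σ²/2)·t) / (σ·√t),  d2 = d1 - σ·√t
#   Call = S·N(d1) - K·e^(-r·t)·N(d2), Put via the symmetry N(-x) = 1 - N(x)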
var xD1 = riskFreeRate + 0.5.T * volatility * volatility
xD1 *= expiry
xD1 += ln(spot / strike)
var xDen = volatility * sqrt(expiry)
xD1 /= xDen
let xD2 = xD1 - xDen
let nofXd1 = cumulNormalDist(xD1)
let nofXd2 = cumulNormalDist(xD2)
let futureValueX = strike * exp(-riskFreeRate*expiry)
if otype == Call:
result = spot * nofXd1 - futureValueX * nofXd2
else:
let negNofxd1 = 1.T - nofXd1
let negNofxd2 = 1.T - nofXd2
result = futureValueX * negNofxd2 - spot * negNofXd1
func checkErrors[T](ctx: var Context[T], i: int, price: T) =
let priceDelta = ctx.data[i].dgrefval - price
if abs(priceDelta) >= 1e-4:
c_printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i.int, price, ctx.data[i].dgrefval, priceDelta
)
ctx.numErrors += 1
func blackScholesSequential(ctx: var Context) =
for j in 0 ..< ctx.numRuns:
for i in 0 ..< ctx.numOptions:
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx, i, price)
proc blackScholesOpenMP(ctx: ptr Context) =
# Stacktraces will create issues between thread-local GC and OpenMP
{.push stackTrace: off.}
for j in 0 ..< ctx.numRuns:
for i in 0||(ctx.numOptions-1):
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx[], i, price)
{.pop.}
proc dump[T](ctx: Context[T], file: string) =
let stream = openFileStream(file, fmWrite)
defer: stream.close()
stream.write($ctx.numOptions)
stream.write("\n")
for i in 0 ..< ctx.numOptions:
stream.write($ctx.prices[i])
stream.write("\n")
proc parseOptions[T](ctx: var Context[T], optFile: string) =
let stream = openFileStream(optFile, fmRead)
defer: stream.close()
var line: string
discard stream.readLine(line)
# Allocate the buffers
# Not sure why the original bench uses a struct of arrays
let numOptions = line.parseInt()
ctx.initialize(numOptions)
echo "Reading ", numOptions, " options"
# For parsing Nim uses float64 by default
var
spot, strike, riskfree, divrate, vol, expiry: float64
optKind: string
divvals, dgrefval: float64
for i in 0 ..< ctx.numOptions:
discard stream.readLine(line)
let isLineParsed = scanf(line, "$f $f $f $f $f $f $w $f $f",
spot, strike, riskFree,
divrate, vol, expiry,
optKind, divvals, dgrefval
)
doAssert isLineParsed
ctx.data[i].spot = spot.T
ctx.data[i].strike = strike.T
ctx.data[i].riskfree = riskfree.T
ctx.data[i].divrate = divrate.T
ctx.data[i].vol = vol.T
ctx.data[i].expiry = expiry.T
ctx.data[i].divvals = divvals.T
ctx.data[i].dgrefval = dgrefval.T
if optKind == "C":
ctx.data[i].kind = Call
elif optKind == "P":
ctx.data[i].kind = Put
else:
raise newException(ValueError, "Invalid option kind: \"" & optKind & '"')
ctx.otype[i] = ctx.data[i].kind
ctx.spot[i] = ctx.data[i].spot
ctx.strike[i] = ctx.data[i].strike
ctx.riskFreeRate[i] = ctx.data[i].riskfree
ctx.volatility[i] = ctx.data[i].vol
ctx.expiry[i] = ctx.data[i].expiry
proc main() =
var
input = ""
output = ""
numRounds = 2000
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 0
elif paramCount() == 1:
input = paramStr(1)
elif paramCount() == 2:
input = paramStr(1)
output = paramStr(2)
elif paramCount() == 3:
input = paramStr(1)
output = paramStr(2)
numRounds = paramStr(3).parseInt()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 1
var ctx: Context[float32]
ctx.numRuns = numRounds
ctx.parseOptions(input)
var nthreads: int
if existsEnv"OMP_NUM_THREADS":
nthreads = getEnv"OMP_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
let start = wtime_msec()
blackScholesOpenMP(ctx.addr)
let stop = wtime_msec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
echo "--------------------------------------------------------------------------"
echo "Scheduler: OpenMP (Nim)"
echo "Benchmark: Black & Scholes Option Pricing"
echo "Threads: ", nthreads
echo "Time(ms) ", round(stop - start, 3)
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
echo "# of rounds: ", numRounds
echo "# of options: ", ctx.numOptions
if output != "":
echo "\nDumping prices to \"", output, '"'
dump(ctx, output)
delete(ctx)
quit 0
main()

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,372 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Reference implementation from
# // Copyright (c) 2007 Intel Corp.
#
# // Black-Scholes
# // Analytical method for calculating European Options
# //
# //
# // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice
# // Hall, John C. Hull,
#
# File manipulation routines ported from C++-Taskflow
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo, streams, strscans],
# Constantine
../../threadpool
when defined(linux):
# bench
import ../wtime, ../resources
# Types
# --------------------------------
type
OptionKind = enum
Put
Call
OptionData[T: SomeFloat] = object
spot: T # Spot price
strike: T # Strike price
riskfree: T # risk-free rate
divrate: T # dividend rate
vol: T # volatility
expiry: T # time to maturity or option expiration in years
# (1 year = 1.0, 6 months = 0.5)
kind: OptionKind
divvals: T # Dividend values (not used in this test)
dgrefval: T # DerivaGem reference value
Context[T: SomeFloat] = object
data: ptr UncheckedArray[OptionData[T]]
prices: ptr UncheckedArray[T]
numOptions: int
numRuns: int
otype: ptr UncheckedArray[OptionKind]
spot: ptr UncheckedArray[T]
strike: ptr UncheckedArray[T]
riskFreeRate: ptr UncheckedArray[T]
volatility: ptr UncheckedArray[T]
expiry: ptr UncheckedArray[T]
numErrors: int
# Helpers
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
proc initialize[T](ctx: var Context[T], numOptions: int) =
ctx.numOptions = numOptions
ctx.data = wv_alloc(OptionData[T], numOptions)
ctx.prices = wv_alloc(T, numOptions)
ctx.otype = wv_alloc(OptionKind, numOptions)
ctx.spot = wv_alloc(T, numOptions)
ctx.strike = wv_alloc(T, numOptions)
ctx.riskFreeRate = wv_alloc(T, numOptions)
ctx.volatility = wv_alloc(T, numOptions)
ctx.expiry = wv_alloc(T, numOptions)
proc delete[T](ctx: sink Context[T]) =
wv_free ctx.data
wv_free ctx.prices
wv_free ctx.otype
wv_free ctx.spot
wv_free ctx.strike
wv_free ctx.riskFreeRate
wv_free ctx.volatility
wv_free ctx.expiry
# Cumulative Normal Distribution Function
# ---------------------------------------------------
# See Hull, Section 11.8, P.243-244
const InvSqrt2xPI = 0.39894228040143270286
func cumulNormalDist[T: SomeFloat](inputX: T): T =
# Check for negative value of inputX
var isNegative = false
var inputX = inputX
if inputX < 0.T:
inputX = -inputX
isNegative = true
let xInput = inputX
# Compute NPrimeX term common to both four & six decimal accuracy calcs
let expValues = exp(-0.5.T * inputX * inputX)
let xNPrimeofX = expValues * InvSqrt2xPI
let
xK2 = 1.0 / (1.0 + 0.2316419*xInput)
xK2_2 = xK2 * xK2
xK2_3 = xK2_2 * xK2
xK2_4 = xK2_3 * xK2
xK2_5 = xK2_4 * xK2
var
xLocal_1 = xK2 * 0.319381530
xLocal_2 = xK2_2 * -0.356563782
xLocal_3 = xK2_3 * 1.781477937
xLocal_2 += xLocal_3
xLocal_3 = xK2_4 * -1.821255978
xLocal_2 += xLocal_3
xLocal_3 = xK2_5 * 1.330274429
xLocal_2 += xLocal_3
xLocal_1 += xLocal_2
result = 1.T - xLocal_1 * xNPrimeofX
if isNegative:
result = 1.T - result
# Black & Scholes
# ---------------------------------------------------
func blackScholesEqEuroNoDiv[T](
spot, strike, riskFreeRate, volatility, expiry: T,
otype: OptionKind,
timet: float32
): T =
var xD1 = riskFreeRate + 0.5.T * volatility * volatility
xD1 *= expiry
xD1 += ln(spot / strike)
var xDen = volatility * sqrt(expiry)
xD1 /= xDen
let xD2 = xD1 - xDen
let nofXd1 = cumulNormalDist(xD1)
let nofXd2 = cumulNormalDist(xD2)
let futureValueX = strike * exp(-riskFreeRate*expiry)
if otype == Call:
result = spot * nofXd1 - futureValueX * nofXd2
else:
let negNofxd1 = 1.T - nofXd1
let negNofxd2 = 1.T - nofXd2
result = futureValueX * negNofxd2 - spot * negNofXd1
func checkErrors[T](ctx: var Context[T], i: int, price: T) =
let priceDelta = ctx.data[i].dgrefval - price
if abs(priceDelta) >= 1e-4:
c_printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
i.int, price, ctx.data[i].dgrefval, priceDelta
)
ctx.numErrors += 1
func blackScholesSequential(ctx: var Context) =
for j in 0 ..< ctx.numRuns:
for i in 0 ..< ctx.numOptions:
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx, i, price)
proc blackScholesConstantine(tp: Threadpool, ctx: ptr Context) =
for j in 0 ..< ctx.numRuns:
tp.parallelFor i in 0 ..< ctx.numOptions:
captures: {ctx}
let price = blackScholesEqEuroNoDiv(
ctx.spot[i], ctx.strike[i],
ctx.riskFreeRate[i], ctx.volatility[i],
ctx.expiry[i], ctx.otype[i], 0
)
ctx.prices[i] = price
when defined(check):
checkErrors(ctx[], i, price)
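# Note: the per-run parallel loops are not synchronized individually;
# the single syncAll below awaits all numRuns x numOptions tasks.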
tp.syncAll()
proc dump[T](ctx: Context[T], file: string) =
let stream = openFileStream(file, fmWrite)
defer: stream.close()
stream.write($ctx.numOptions)
stream.write("\n")
for i in 0 ..< ctx.numOptions:
stream.write($ctx.prices[i])
stream.write("\n")
proc parseOptions[T](ctx: var Context[T], optFile: string) =
let stream = openFileStream(optFile, fmRead)
defer: stream.close()
var line: string
discard stream.readLine(line)
# Allocate the buffers
# Not sure why the original bench uses a struct of arrays
let numOptions = line.parseInt()
ctx.initialize(numOptions)
echo "Reading ", numOptions, " options"
# For parsing Nim uses float64 by default
var
spot, strike, riskfree, divrate, vol, expiry: float64
optKind: string
divvals, dgrefval: float64
for i in 0 ..< ctx.numOptions:
discard stream.readLine(line)
let isLineParsed = scanf(line, "$f $f $f $f $f $f $w $f $f",
spot, strike, riskFree,
divrate, vol, expiry,
optKind, divvals, dgrefval
)
doAssert isLineParsed
ctx.data[i].spot = spot.T
ctx.data[i].strike = strike.T
ctx.data[i].riskfree = riskfree.T
ctx.data[i].divrate = divrate.T
ctx.data[i].vol = vol.T
ctx.data[i].expiry = expiry.T
ctx.data[i].divvals = divvals.T
ctx.data[i].dgrefval = dgrefval.T
if optKind == "C":
ctx.data[i].kind = Call
elif optKind == "P":
ctx.data[i].kind = Put
else:
raise newException(ValueError, "Invalid option kind: \"" & optKind & '"')
ctx.otype[i] = ctx.data[i].kind
ctx.spot[i] = ctx.data[i].spot
ctx.strike[i] = ctx.data[i].strike
ctx.riskFreeRate[i] = ctx.data[i].riskfree
ctx.volatility[i] = ctx.data[i].vol
ctx.expiry[i] = ctx.data[i].expiry
proc main() =
var
input = ""
output = ""
numRounds = 2000
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 0
elif paramCount() == 1:
input = paramStr(1)
elif paramCount() == 2:
input = paramStr(1)
output = paramStr(2)
elif paramCount() == 3:
input = paramStr(1)
output = paramStr(2)
numRounds = paramStr(3).parseInt()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <inputFile: string> <output = \"\"> <numRounds = 2000>"
quit 1
var ctx: Context[float32]
ctx.numRuns = numRounds
ctx.parseOptions(input)
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
var
rss = ru.ru_maxrss
flt = ru.ru_minflt
let start = wtime_msec()
var tp = Threadpool.new(numThreads = nthreads)
tp.blackScholesConstantine(ctx.addr)
tp.shutdown()
when not defined(windows):
let stop = wtime_msec()
getrusage(RusageSelf, ru)
rss = ru.ru_maxrss - rss
flt = ru.ru_minflt - flt
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine"
echo "Benchmark: Black & Scholes Option Pricing (including init+shutdown)"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(ms) ", round(stop - start, 3)
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt
echo "--------------------------------------------------------------------------"
echo "# of rounds: ", numRounds
echo "# of options: ", ctx.numOptions
if output != "":
echo "\nDumping prices to \"", output, '"'
dump(ctx, output)
delete(ctx)
quit 0
main()

View File

@ -10,7 +10,7 @@ import
system/ansi_c, std/[strformat, os, strutils, cpuinfo],
# Library
../../threadpool
when not defined(windows):
# bench
import ../wtime
@ -63,22 +63,31 @@ proc main() =
# Staccato benches runtime init and exit as well
when not defined(windows):
let start = wtime_usec()
let startRuntime = wtime_usec()
tp = Threadpool.new()
when not defined(windows):
let startBench = wtime_usec()
answer = test(depth, breadth)
when not defined(windows):
let stopBench = wtime_usec()
tp.shutdown()
when not defined(windows):
let stop = wtime_usec()
let stopRuntime = wtime_usec()
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: dfs"
echo "Threads: ", nthreads
echo "Scheduler: Constantine's Threadpool"
echo "Benchmark: dfs"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(us) ", stop - start
echo "Output: ", answer
echo "Time(us) runtime: ", stopRuntime - startRuntime
echo "Time(us) bench: ", stopBench - startBench
echo "Output: ", answer
echo "--------------------------------------------------------------------------"
quit 0

View File

@ -0,0 +1,10 @@
# Histogram 2D
This is a very interesting benchmark for testing map-reduce style computation and nested parallelism.
It needs:
- 2 loops over a 2D matrix
- A reduction
OpenMP is unable to handle intuitively the nesting of
2 parallel loops with a custom reduction operation; see the sketch below.
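Constantine's threadpool expresses the custom reduction directly on the outer loop.
Below is a minimal sketch adapted from the reduction benchmarks in this PR;
`maxReduce` and the flat row-major indexing are illustrative only:
```
import ../../threadpool

proc maxReduce[T: SomeFloat](tp: Threadpool, m: ptr UncheckedArray[T], rows, cols: int): T =
  # Outer rows are distributed across tasks, the inner loop stays sequential,
  # and the per-task maxima are combined pairwise in `merge`.
  mixin globalMax
  tp.parallelFor i in 0 ..< rows:
    captures: {m, cols}
    reduceInto(globalMax: T):
      prologue:
        var localMax = T(-Inf)
      forLoop:
        for j in 0 ..< cols:
          localMax = max(localMax, m[i*cols + j])
      merge(remoteMax: Flowvar[T]):
        localMax = max(localMax, sync(remoteMax))
      epilogue:
        return localMax
  result = sync(globalMax)
```
The full benchmark in this PR additionally keeps a private histogram per task
and folds it into the shared one with relaxed atomic adds.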

View File

@ -0,0 +1,162 @@
// Source: https://stackoverflow.com/questions/16751445/parallelize-nested-for-loop-with-openmp
#include<stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define float_t float
#include <time.h>
#include <omp.h>
// This was the original function to optimize
// where the reader had issues with OpenMP nested parallelism
float_t generate_histogram(float_t **matrix, int *histogram, int mat_size, int hist_size)
{
int i,j,k,count;
float_t max = 0.;
float_t sum;
//set histogram to zero everywhere
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
//#pragma omp parallel for schedule(runtime)
for (i = 1; i < (mat_size-1); i++)
{
//pragma omp prallel for schedule(dynamic)
for(j = 1; j < (mat_size-1); j++)
{
//assign current matrix[i][j] to element in order to reduce memory access
sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
k = (int)(sum * (float)mat_size);
histogram[k] += 1;
//keep track of largest element
if(sum > max)
max = sum;
}//end inner for
}//end outer for
return max;
}
// This is the proposed alternative
float_t generate_histogram_omp(float_t **matrix, int *histogram, int mat_size, int hist_size) {
float_t max = 0.;
//set histogram to zero everywhere
int i;
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
#pragma omp parallel
{
int *histogram_private = (int*)malloc(hist_size * sizeof(int));
int i;
for(i = 0; i < hist_size; i++)
histogram_private[i] = 0;
float_t max_private = 0.;
int n;
int j;
#pragma omp for
for (i = 1; i < (mat_size-1); i++) {
for(j = 1; j < (mat_size-1); j++) {
// for (n=0; n < (mat_size-2)*(mat_size-2); n++) {
// int i = n/(mat_size-2)+1;
// int j = n%(mat_size-2)+1;
float_t sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
int k = (int)(sum * (float)mat_size);
histogram_private[k] += 1;
//keep track of largest element
if(sum > max_private)
max_private = sum;
}
}
#pragma omp critical
{
for(i = 0; i < hist_size; i++)
histogram[i] += histogram_private[i];
if(max_private>max)
max = max_private;
}
free(histogram_private);
}
return max;
}
int compare_hists(int *hist1, int *hist2, int N) {
int i;
int diff = 0;
for(i =0; i < N; i++) {
int tmp = hist1[i] - hist2[i];
diff += tmp;
if(tmp!=0) {
printf("i %d, hist1 %d, hist2 %d\n", i, hist1[i], hist2[i]);
}
}
return diff;
}
int main() {
int i,j,N,boxes;
// Original values
// N = 10000;
// boxes = N / 2;
N = 25000;
boxes = 1000;
float_t **matrix;
int* histogram1;
int* histogram2;
//allocate a matrix with some numbers
matrix = (float_t**)calloc(N, sizeof(float_t **));
for(i = 0; i < N; i++)
matrix[i] = (float_t*)calloc(N, sizeof(float_t *));
for(i = 0; i < N; i++)
for(j = 0; j < N; j++)
matrix[i][j] = 1./(float_t) N * (float_t) i * 100.0;
histogram1 = (int*)malloc(boxes * sizeof(int));
histogram2 = (int*)malloc(boxes * sizeof(int));
for(i = 0; i<boxes; i++) {
histogram1[i] = 0;
histogram2[i] = 0;
}
double dtime;
dtime = omp_get_wtime();
float_t max1 = generate_histogram(matrix, histogram1, N, boxes);
dtime = omp_get_wtime() - dtime;
printf("time serial: %f\n", dtime);
printf("max serial: %f\n", max1);
dtime = omp_get_wtime();
float_t max2 = generate_histogram_omp(matrix, histogram2, N, boxes);
dtime = omp_get_wtime() - dtime;
printf("time openmp: %f\n", dtime);
printf("max openmp: %f\n", max2);
int diff = compare_hists(histogram1, histogram2, boxes);
printf("diff %d\n", diff);
return 0;
}

View File

@ -0,0 +1,303 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stdlib
system/ansi_c, std/[os, strutils, cpuinfo, math, strformat, locks],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# TODO: there is an overflow of MaxRSS on 32-bit Linux and macOS, but not on 64-bit Linux or Windows
# This test is quite important to ensure parallel reductions work within a generic proc.
{.push checks: off.}
# Helpers
# -------------------------------------------------------
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var" to other threads
type
Matrix[T: SomeFloat] = object
buffer: ptr UncheckedArray[T]
ld: int
template `[]`[T](mat: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < mat.ld
assert col < mat.ld
mat.buffer[row * mat.ld + col]
template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
assert row < mat.ld
assert col < mat.ld
mat.buffer[row * mat.ld + col] = value
type
Histogram = object
buffer: ptr UncheckedArray[int64]
len: int
template `[]`(hist: Histogram, idx: Natural): int64 =
# 1D storage, bounds-checked in debug builds
assert idx in 0 ..< hist.len
hist.buffer[idx]
template `[]=`(hist: Histogram, idx: Natural, value: int64) =
assert idx in 0 ..< hist.len
hist.buffer[idx] = value
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
c_free(p)
# -------------------------------------------------------
proc prepareMatrix[T](matrix: var Matrix[T], N: int) =
matrix.buffer = wv_alloc(T, N*N)
matrix.ld = N
for i in 0 ..< N:
for j in 0 ..< N:
matrix[i, j] = 1.0 / T(N) * T(i) * 100
proc newHistogram(bins: int): Histogram =
result.buffer = wv_alloc(int64, bins)
result.len = bins
# Reports
# -------------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
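## Declares the three given idents and snapshots getrusage around `body`:
## `maxRSS` gets the peak RSS, `runtimeRSS` the RSS growth during `body`,
## and `pageFaults` the minor page faults incurred (units as reported by getrusage).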
var maxRSS, runtimeRSS, pageFaults: int32
block:
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc reportConfig(
scheduler: string,
nthreads, N, bins: int) =
echo "--------------------------------------------------------------------------"
echo "Scheduler: ", scheduler
echo "Benchmark: Histogram 2D "
echo "Threads: ", nthreads
echo "Matrix: ", N, " x ", N
echo "Histogram bins: ", bins
proc reportBench(
time: float64, maxRSS, runtimeRss, pageFaults: int, max: SomeFloat
) =
echo "--------------------------------------------------------------------------"
echo "Time(ms): ", round(time, 3)
echo "Max RSS (KB): ", maxRss
echo "Runtime RSS (KB): ", runtimeRSS
echo "# of page faults: ", pageFaults
echo "Max (from histogram): ", max
template runBench(tp: Threadpool, procName: untyped, matrix: Matrix, bins: int, parallel: static bool = true) =
var hist = newHistogram(bins)
when not defined(windows):
block:
var max: matrix.T
let start = wtime_msec()
memUsage(maxRSS, runtimeRSS, pageFaults):
when parallel:
tp = Threadpool.new()
max = procName(tp, matrix, hist)
tp.shutdown()
else:
max = procName(matrix, hist)
let stop = wtime_msec()
reportBench(stop-start, maxRSS, runtimeRSS, pageFaults, max)
else:
block:
var max: matrix.T
when parallel:
tp = Threadpool.new()
max = procName(tp, matrix, hist)
tp.shutdown()
else:
max = procName(matrix, hist)
# Algo
# -------------------------------------------------------
proc generateHistogramSerial[T](matrix: Matrix[T], hist: Histogram): T =
# zero-ing the histogram
for i in 0 ..< hist.len:
hist[i] = 0
# Note don't run on borders, they have no neighbour
for i in 1 ..< matrix.ld-1:
for j in 1 ..< matrix.ld-1:
# Sum of absolute differences with the 4 cell neighbors
let sum = abs(matrix[i, j] - matrix[i-1, j]) + abs(matrix[i,j] - matrix[i+1, j]) +
abs(matrix[i, j] - matrix[i, j-1]) + abs(matrix[i, j] - matrix[i, j+1])
# Compute index of histogram bin
let k = int(sum * T(matrix.ld))
hist[k] += 1
# Keep track of the largest element
if sum > result:
result = sum
proc generateHistogramThreadpoolReduce[T](tp: Threadpool, matrix: Matrix[T], hist: Histogram): T =
# We only await the reduced max: sending the histograms across threads
# is too costly, so the temporary histograms are freed in their allocating threads.
# In a generic proc, Nim tries to resolve symbols earlier than the reduce macro creates them,
# so we need to tell Nim that the symbol will exist in time.
mixin distributedMax
let boxes = hist.len
for i in 0 ..< boxes:
hist[i] = 0
# Parallel reduction
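# The reduceInto body is split in 4 stages:
#   prologue: per-task state (a private histogram and a local max)
#   forLoop:  the loop body, run for each row i assigned to this task
#   merge:    await the other half's max (a FlowVar) and fold the private
#             histogram into the shared one with relaxed atomic adds
#   epilogue: free the private histogram and return this task's max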
tp.parallelFor i in 1 ..< matrix.ld-1:
captures: {hist, matrix, boxes}
reduceInto(distributedMax: T):
prologue:
let threadHist = newHistogram(boxes)
var max = T(-Inf)
forLoop:
# with inner for loop
for j in 1 ..< matrix.ld-1:
let sum = abs(matrix[i, j] - matrix[i-1, j]) + abs(matrix[i,j] - matrix[i+1, j]) +
abs(matrix[i, j] - matrix[i, j-1]) + abs(matrix[i, j] - matrix[i, j+1])
let k = int(sum * T(matrix.ld))
threadHist[k] += 1
if sum > max:
max = sum
merge(remoteMax: FlowVar[T]):
block:
let remoteMax = sync(remoteMax) # Await max from other thread
if remoteMax > max:
max = remoteMax
for k in 0 ..< boxes:
discard hist[k].addr.atomicFetchAdd(threadHist[k], ATOMIC_RELAXED)
epilogue:
wv_free(threadHist.buffer)
return max
return sync(distributedMax)
# proc generateHistogramThreadpoolStaged[T](matrix: Matrix[T], hist: Histogram): T =
# var max = T(-Inf)
# let maxAddr = max.addr
# var lock: Lock
# lock.initLock()
# let lockAddr = lock.addr
# let boxes = hist.len
# for i in 0 ..< boxes:
# hist[i] = 0
# # Parallel reduction
# parallelForStaged i in 1 ..< matrix.ld-1:
# captures: {maxAddr, lockAddr, hist, matrix, boxes}
# awaitable: histoLoop
# prologue:
# let threadHist = newHistogram(boxes)
# var threadMax = T(-Inf)
# forLoop:
# # with inner for loop
# for j in 1 ..< matrix.ld-1:
# let sum = abs(matrix[i, j] - matrix[i-1, j]) + abs(matrix[i,j] - matrix[i+1, j]) +
# abs(matrix[i, j] - matrix[i, j-1]) + abs(matrix[i, j] - matrix[i, j+1])
# let k = int(sum * T(matrix.ld))
# threadHist[k] += 1
# if sum > threadMax:
# threadMax = sum
# epilogue:
# lockAddr[].acquire()
# maxAddr[] = max(maxAddr[], threadMax)
# if threadMax > maxAddr[]:
# maxAddr[] = threadMax
# for k in 0 ..< boxes:
# hist[k] += threadHist[k]
# lockAddr[].release()
# wv_free(threadHist.buffer)
# let waslastThread = sync(histoLoop)
# lock.deinitLock()
# return max
proc main() =
if sizeof(int) == 4:
echo "Running on 32-bit. This benchmark is requires 64-bit."
return
var
matrixSize = 25000
boxes = 1000
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <matrixSize: int> <boxes: int>"
echo &"Running with default matrixSize={matrixSize}, boxes={boxes}"
elif paramCount() == 2:
matrixSize = paramStr(1).parseInt()
boxes = paramStr(2).parseInt()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <matrixSize: int> <boxes: int>"
echo &"Default \"{exeName} {matrixSize} {boxes}\""
quit 1
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
var tp: Threadpool
var matrix: Matrix[float32]
# The reference code zeroes out the histogram inside the bench as well
prepareMatrix(matrix, matrixSize)
reportConfig("Sequential", 1, matrixSize, boxes)
runBench(tp, generateHistogramSerial, matrix, boxes, parallel = false)
reportConfig("Constantine's threadpool - Parallel Reduce", nthreads, matrixSize, boxes)
runBench(tp, generateHistogramThreadpoolReduce, matrix, boxes)
# reportConfig("Constantine's threadpool - Parallel For Staged", nthreads, matrixSize, boxes)
# runBench(generateHistogramThreadpoolStaged, matrix, boxes)
wv_free(matrix.buffer)
main()

View File

@ -0,0 +1,28 @@
# Log-Sum-Exp
Log-Sum-Exp computes `ln ∑i exp(xi)` by using the trick:
```
log ∑i exp(xi) = α + log ∑i exp(xi - α)
with α = max(xi) for xi in x
```
Log-Sum-Exp is a key algorithm behind the Softmax Cross-Entropy loss function,
which is used in almost all deep learning classification problems.
Furthermore, this is a huge bottleneck in NLP, and research in fast softmax
approximation is active.
We are interested in testing parallel reductions,
so we first do a parallel max and then a parallel exponential sum.
However, note that there exists a streaming version
at http://www.nowozin.net/sebastian/blog/streaming-log-sum-exp-computation.html
which is similar to Welford's algorithm for streaming mean and variance in statistics.
Given that the exponential is a very heavy operation, all frameworks should see a linear speedup.
The main challenge is the framework's syntax for complex reduction operations.
Note that `<math.h>` exponential can be significantly improved (10x)
by using vectorization techniques from Laser
https://github.com/numforge/laser/blob/d1e6ae6106564bfb350d4e566261df97dbb578b3/benchmarks/vector_math/bench_exp_avx2.nim#L373-L379
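For reference, a minimal serial Nim version of the trick, using only the stdlib
(independent of the benchmark code):
```
import std/math

proc logsumexp(x: openArray[float64]): float64 =
  # Stable ln ∑ exp(xi): shift by α = max(x) before exponentiating
  var alpha = x[0]
  for v in x:
    alpha = max(alpha, v)
  var s = 0.0
  for v in x:
    s += exp(v - alpha)
  alpha + ln(s)

echo logsumexp([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
# ≈ 9.4585514, the same sanity value the benchmark checks against numpy
```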

View File

@ -0,0 +1,402 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, cpuinfo, math, random, locks],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# TODO: there is an overflow of MaxRSS on 32-bit Linux and macOS, but not on 64-bit Linux or Windows
# This test is quite important to ensure parallel reductions work within a generic proc.
{.push checks: off.}
# Helpers
# -------------------------------------------------------
# Note that matrices for log-sum-exp are usually in the following shapes:
# - Classification of a batch of 256 images in 3 categories: 256x3
# - Classification of a batch of words from a 50000 words dictionary: 256x50000
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
c_free(p)
# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var" to other threads
type
Matrix[T: SomeFloat] = object
buffer: ptr UncheckedArray[T]
nrows, ncols: int # int64 on x86-64
func newMatrix[T](rows, cols: Natural): Matrix[T] {.inline.} =
# Create a rows x cols Matrix
result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t rows*cols*sizeof(T)))
result.nrows = rows
result.ncols = cols
template `[]`[T](M: Matrix[T], row, col: Natural): T =
# row-major storage
assert row < M.nrows
assert col < M.ncols
M.buffer[row * M.ncols + col]
template `[]=`[T](M: Matrix[T], row, col: Natural, value: T) =
assert row < M.nrows
assert col < M.ncols
M.buffer[row * M.ncols + col] = value
proc initialize[T](M: Matrix[T]) =
randomize(1234) # Seed
for i in 0 ..< M.nrows:
for j in 0 ..< M.ncols:
M[i, j] = T(rand(1.0))
func rowView*[T](M: Matrix[T], rowPos, size: Natural): Matrix[T]{.inline.}=
## Returns a new view offset by the row and column stride
result.buffer = cast[ptr UncheckedArray[T]](
addr M.buffer[rowPos * M.ncols]
)
result.nrows = size
result.ncols = M.ncols
# Reports
# -------------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
var maxRSS, runtimeRSS, pageFaults: int32
block:
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc reportConfig(
scheduler: string,
nthreads: int, datasetSize, batchSize, imageLabels, textVocabulary: int64
) =
echo "--------------------------------------------------------------------------"
echo "Scheduler: ", scheduler
echo "Benchmark: Log-Sum-Exp (Machine Learning) "
echo "Threads: ", nthreads
echo "datasetSize: ", datasetSize
echo "batchSize: ", batchSize
echo "# of full batches: ", datasetSize div batchSize
echo "# of image labels: ", imageLabels
echo "Text vocabulary size: ", textVocabulary
proc reportBench(
batchSize, numLabels: int64,
time: float64, maxRSS, runtimeRss, pageFaults: int32,
logSumExp: float32
) =
echo "--------------------------------------------------------------------------"
echo "Dataset: ", batchSize,'x',numLabels
echo "Time(ms): ", round(time, 3)
echo "Max RSS (KB): ", maxRss
echo "Runtime RSS (KB): ", runtimeRSS
echo "# of page faults: ", pageFaults
echo "Logsumexp: ", logsumexp
template runBench(procName: untyped, datasetSize, batchSize, numLabels: int64) =
let data = newMatrix[float32](datasetSize, numLabels)
data.initialize()
when not defined(windows):
let start = wtime_msec()
var lse = 0'f32
memUsage(maxRSS, runtimeRSS, pageFaults):
# For simplicity we ignore the last few data points
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(X)
let stop = wtime_msec()
reportBench(batchSize, numlabels, stop-start, maxRSS, runtimeRSS, pageFaults, lse)
else:
# For simplicity we ignore the last few data points
var lse = 0'f32
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(X)
template runBench(tp: Threadpool, procName: untyped, datasetSize, batchSize, numLabels: int64) =
let data = newMatrix[float32](datasetSize, numLabels)
data.initialize()
when not defined(windows):
let start = wtime_msec()
var lse = 0'f32
memUsage(maxRSS, runtimeRSS, pageFaults):
# For simplicity we ignore the last few data points
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(tp, X)
let stop = wtime_msec()
reportBench(batchSize, numlabels, stop-start, maxRSS, runtimeRSS, pageFaults, lse)
else:
var lse = 0'f32
for batchIdx in 0 ..< datasetSize div batchSize:
let X = data.rowView(batchIdx*batchSize, batchSize)
lse += procName(tp, X)
# Algo - Serial
# -------------------------------------------------------
proc maxSerial[T: SomeFloat](M: Matrix[T]) : T =
result = T(-Inf)
for i in 0 ..< M.nrows:
for j in 0 ..< M.ncols:
result = max(result, M[i, j])
proc logsumexpSerial[T: SomeFloat](M: Matrix[T]): T =
let alpha = M.maxSerial()
result = 0
for i in 0 ..< M.nrows:
for j in 0 ..< M.ncols:
result += exp(M[i, j] - alpha)
result = alpha + ln(result)
# Algo - parallel reduction
# -------------------------------------------------------
proc maxThreadpoolReduce[T: SomeFloat](tp: Threadpool, M: Matrix[T]) : T =
mixin globalMax
tp.parallelFor i in 0 ..< M.nrows:
captures:{M}
reduceInto(globalMax: T):
prologue:
var localMax = T(-Inf)
forLoop:
for j in 0 ..< M.ncols:
localMax = max(localMax, M[i, j])
merge(remoteMax: Flowvar[T]):
localMax = max(localMax, sync(remoteMax))
epilogue:
return localMax
result = sync(globalMax)
proc logsumexpThreadpoolReduce[T: SomeFloat](tp: Threadpool, M: Matrix[T]): T =
mixin lse
let alpha = tp.maxThreadpoolReduce(M)
tp.parallelFor i in 0 ..< M.nrows:
captures:{alpha, M}
reduceInto(lse: T):
prologue:
var localLSE = 0.T
forLoop:
for j in 0 ..< M.ncols:
localLSE += exp(M[i, j] - alpha)
merge(remoteLSE: Flowvar[T]):
localLSE += sync(remoteLSE)
epilogue:
return localLSE
result = alpha + ln(sync(lse))
# Algo - parallel reduction collapsed
# -------------------------------------------------------
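# The "collapsed" variants flatten the 2D iteration space into a single
# 0 ..< nrows*ncols range, trading the sequential inner loop for more,
# finer-grained parallel work items.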
proc maxThreadpoolCollapsed[T: SomeFloat](tp: Threadpool, M: Matrix[T]) : T =
mixin globalMax
tp.parallelFor ij in 0 ..< M.nrows * M.ncols:
captures:{M}
reduceInto(globalMax: T):
prologue:
var localMax = T(-Inf)
forLoop:
localMax = max(localMax, M.buffer[ij])
merge(remoteMax: FlowVar[T]):
localMax = max(localMax, sync(remoteMax))
epilogue:
return localMax
result = sync(globalMax)
proc logsumexpThreadpoolCollapsed[T: SomeFloat](tp: Threadpool, M: Matrix[T]): T =
mixin lse
let alpha = tp.maxThreadpoolCollapsed(M)
tp.parallelFor ij in 0 ..< M.nrows * M.ncols:
captures:{alpha, M}
reduceInto(lse: T):
prologue:
var localLSE = 0.T
forLoop:
localLSE += exp(M.buffer[ij] - alpha)
merge(remoteLSE: Flowvar[T]):
localLSE += sync(remoteLSE)
epilogue:
return localLSE
result = alpha + ln(sync(lse))
# proc maxThreadpoolStaged[T: SomeFloat](tp: Threadpool, M: Matrix[T]) : T =
# mixin maxLoop
#
# var max = T(-Inf)
# let maxAddr = max.addr
#
# var lock: Lock
# lock.initLock()
# let lockAddr = lock.addr
#
# tp.parallelForStaged i in 0 ..< M.nrows:
# captures:{maxAddr, lockAddr, M}
# awaitable: maxLoop
# prologue:
# var localMax = T(-Inf)
# forLoop:
# for j in 0 ..< M.ncols:
# localMax = max(localMax, M[i, j])
# epilogue:
# lockAddr[].acquire()
# maxAddr[] = max(maxAddr[], localMax)
# lockAddr[].release()
#
# let waslastThread = sync(maxLoop)
# lock.deinitLock()
#
# proc logsumexpThreadpoolStaged[T: SomeFloat](tp: Threadpool, M: Matrix[T]): T =
# mixin logSumExpLoop
# let alpha = M.maxThreadpoolStaged()
#
# var lse = T(0)
# let lseAddr = lse.addr
#
# # Atomic increment for float is done with a Compare-And-Swap loop usually.
# # Due to lazy splitting, load distribution is unbalanced between threads so they shouldn't
# # finish at the same time in general and lock contention would be low
# var lock: Lock
# lock.initLock()
# let lockAddr = lock.addr
#
# tp.parallelForStaged i in 0 ..< M.nrows:
# captures:{lseAddr, lockAddr, alpha, M}
# awaitable: logSumExpLoop
# prologue:
# var localLSE = 0.T
# loop:
# for j in 0 ..< M.ncols:
# localLSE += exp(M[i, j] - alpha)
# epilogue:
# lockAddr[].acquire()
# lseAddr[] += localLSE
# lockAddr[].release()
#
# let wasLastThread = sync(logSumExpLoop)
# result = alpha + ln(lse)
# lock.deinitLock()
# Main
# -------------------------------------------------------
proc main() =
echo "Note that a text vocabulary is often in the 50000-15000 words\n"
var
datasetSize = 20000'i64
batchSize = 256'i64
imagelabels = 1000'i64
textVocabulary = 10000'i64
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <datasetSize: int64> <batchSize: int64> <imagelabels: int64> <textVocabulary: int64>"
echo &"Running with default datasetSize={datasetSize}, batchSize={batchSize}, imagelabels={imagelabels}, textVocabulary={textVocabulary}"
elif paramCount() == 4:
datasetSize = paramStr(1).parseBiggestInt().int64
batchSize = paramStr(2).parseBiggestInt().int64
imagelabels = paramStr(3).parseBiggestInt().int64
textVocabulary = paramStr(4).parseBiggestInt().int64
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <datasetSize: int64> <batchSize: int64> <imagelabels: int64> <textVocabulary: int64>"
echo &"Default \"{datasetSize} {batchSize} {imagelabels} {textVocabulary}\""
quit 1
var nthreads: int
if existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt()
else:
nthreads = countProcessors()
let sanityM = newMatrix[float32](1, 9)
for i in 0'i32 ..< 9:
sanityM[0, i] = i.float32 + 1
echo "Sanity check, logSumExp(1..<10) should be 9.4585514 (numpy logsumexp): ", logsumexpSerial(sanityM)
echo '\n'
wv_free(sanityM.buffer)
reportConfig("Sequential", 1, datasetSize, batchSize, imageLabels, textVocabulary)
block:
runBench(logsumexpSerial, datasetSize, batchSize, imageLabels)
block:
runBench(logsumexpSerial, datasetSize, batchSize, textVocabulary)
# TODO: Placing the threadpool before the sequential bench makes it take ~85 ms instead of ~48 ms
var tp = Threadpool.new(nthreads)
# TODO: The parallel algorithm is slower than Weave AND slower than serial
reportConfig("Constantine's Threadpool Reduce", nthreads, datasetSize, batchSize, imageLabels, textVocabulary)
block:
tp.runBench(logsumexpThreadpoolReduce, datasetSize, batchSize, imageLabels)
block:
tp.runBench(logsumexpThreadpoolReduce, datasetSize, batchSize, textVocabulary)
reportConfig("Constantine's Threadpool (Collapsed)", nthreads, datasetSize, batchSize, imageLabels, textVocabulary)
block:
tp.runBench(logsumexpThreadpoolCollapsed, datasetSize, batchSize, imageLabels)
block:
tp.runBench(logsumexpThreadpoolCollapsed, datasetSize, batchSize, textVocabulary)
# reportConfig("Constantine's Threadpool (Staged)", nthreads, datasetSize, batchSize, imageLabels, textVocabulary)
# block:
# tp.runBench(logsumexpThreadpoolStaged, datasetSize, batchSize, imageLabels)
# block:
# tp.runBench(logsumexpThreadpoolStaged, datasetSize, batchSize, textVocabulary)
tp.shutdown()
main()

View File

@ -0,0 +1,14 @@
# Matrix transposition
This benchmark, extracted from [Laser](https://github.com/numforge/laser),
stresses the ability of a runtime to support nested loops.
A matrix is copied to another buffer with transposition.
For one buffer or the other the accesses will not be linear,
so it is important to do tiling. As dimensions might be skewed,
the ideal tiling is 2D, with nested parallel-for loops that properly find
work for all cores even if a matrix is tall and skinny or short and fat
(see the sketch below).
Note that as OpenMP's nested loop support is very problematic,
we use the `collapse` clause for OpenMP, which is only usable
in a restricted number of scenarios.
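With Constantine's threadpool, the ideal 2D tiling maps directly onto nested
strided parallel-for loops. The excerpt below is lifted from the threadpool
benchmark in this PR (`tp`, `M`, `N`, `bufIn` and `bufOut` are set up as in the full file):
```
const blck = 64                      # tile size; consts need no capture
tp.parallelFor j in 0 ..< N:         # parallel over column tiles
  stride: blck
  captures: {tp, M, N, bufIn, bufOut}
  tp.parallelFor i in 0 ..< M:       # nested: parallel over row tiles
    stride: blck
    captures: {j, M, N, bufIn, bufOut}
    for jj in j ..< min(j+blck, N):  # sequential copy of one tile
      for ii in i ..< min(i+blck, M):
        bufOut[jj*M+ii] = bufIn[ii*N+jj]
```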

View File

@ -0,0 +1,294 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Original transposition codes from Laser project
# (c) Mamy André Ratsimbazafy, Apache License version 2
import
# Stdlib
strformat, os, strutils, math, system/ansi_c,
cpuinfo, streams, strscans,
# Third-party
cligen,
# bench
../wtime, ../resources
# OpenMP
# ---------------------------------------------------
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}
{.pragma: omp, header:"omp.h".}
proc omp_get_num_threads*(): cint {.omp.}
# Memory
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
# Transpose implementations
# ---------------------------------------------------
type TransposeStrategy = enum
Sequential
Naive
Collapsed
TiledCollapsed
# Question: do we need __restrict to prevent the compiler from generating
# defensive, aliasing-robust code?
proc sequentialTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
for j in 0 ..< N:
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc ompNaiveTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# Writes are more expensive than reads, so we keep the i accesses linear for the writes
{.push stacktrace:off.}
for j in 0||(N-1):
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
{.pop.}
proc ompCollapsedTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# We need to go down to C level for the collapse clause.
# This relies on the M, N, bufIn, bufOut symbols being the same in C and Nim,
# as the proper interpolation syntax is a bit busy otherwise.
{.emit: """
#pragma omp parallel for collapse(2)
for (int i = 0; i < `M`; ++i)
for (int j = 0; j < `N`; ++j)
`bufOut`[j*M+i] = `bufIn`[i*N+j];
""".}
proc omp2DTiledCollapsedTranspose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose with 2D tiling and collapsed
const blck = 64
{.emit: """
#define min(a,b) (((a)<(b))?(a):(b))
#pragma omp parallel for collapse(2)
for (int j = 0; j < `N`; j+=`blck`)
for (int i = 0; i < `M`; i+=`blck`)
for (int jj = j; jj<j+`blck` && jj<`N`; jj++)
for (int ii = i; ii<min(i+`blck`,`M`); ii++)
`bufOut`[ii+jj*`M`] = `bufIn`[jj+ii*`N`];
""".}
# Meta
# ---------------------------------------------------
func computeMeta(height, width: int): tuple[reqOps, reqBytes, bufSize: int] =
result.reqOps = height * width
result.reqBytes = sizeof(float32) * height * width
result.bufSize = height * width
func initialize(buffer: ptr UncheckedArray[float32], len: int) =
for i in 0 ..< len:
buffer[i] = i.float32
# Bench
# ---------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
var maxRSS, runtimeRSS, pageFaults: int32
block:
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc report(
M, N: int, nthreads: int32, nrounds: int, reordered: bool,
transposeStrategy: TransposeStrategy, reqOps, reqBytes: int,
mxnTime: float64, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults: int32,
nxmTime: float64, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults: int32,
) =
let arithIntensity = reqOps.float / reqBytes.float
let mxnPerf = reqOps.float/(mxnTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
let nxmPerf = reqOps.float/(nxmTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
echo "--------------------------------------------------------------------------"
echo "Scheduler: OpenMP"
echo "Benchmark: Transpose - ", $transposeStrategy
echo "Threads: ", nthreads
echo "# of rounds: ", nrounds
echo "# of operations: ", reqOps
echo "# of bytes: ", reqBytes
echo "Arithmetic Intensity: ", round(arithIntensity, 3)
echo "--------------------------------------------------------------------------"
if not reordered:
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
else:
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
template runBench(transposeName: typed, reorderCompute: bool): untyped =
if not reorderCompute:
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(M, N, bufIn, bufOut)
let stop = wtime_msec()
mxnTime = stop - start
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(N, M, bufIn, bufOut)
let stop = wtime_msec()
nxmTime = stop - start
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
else:
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(N, M, bufIn, bufOut)
let stop = wtime_msec()
nxmTime = stop - start
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
let start = wtime_msec()
for _ in 0 ..< nrounds:
transposeName(M, N, bufIn, bufOut)
let stop = wtime_msec()
mxnTime = stop - start
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
# Interface
# ---------------------------------------------------
proc main() =
var
M = 400
N = 4000
nrounds = 1000
transposeStrat = TiledCollapsed
reorderCompute = false
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Running with default M={M}, N={N}, rounds={nrounds}, transposeStrategy={transposeStrat}, reorderCompute={reorderCompute}"
elif paramCount() == 5:
M = paramStr(1).parseInt()
N = paramStr(2).parseInt()
nrounds = paramStr(3).parseInt()
transposeStrat = paramStr(4).parseEnum[:TransposeStrategy]()
reorderCompute = paramStr(5).parseBool()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Default \"{exeName} {M} {N} {nrounds} {transposeStrat} {reorderCompute}\""
quit 1
echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"
let nthreads = if transposeStrat == Sequential: 1'i32
else: omp_get_num_threads()
let (reqOps, reqBytes, bufSize) = computeMeta(M, N)
let bufOut = wv_alloc(float32, bufSize)
let bufIn = wv_alloc(float32, bufSize)
bufIn.initialize(bufSize)
var mxnTime, nxmTime: float64
case transposeStrat
of Sequential: runBench(sequentialTranspose, reorderCompute)
of Naive: runBench(ompNaiveTranspose, reorderCompute)
of Collapsed: runBench(ompCollapsedTranspose, reorderCompute)
of TiledCollapsed: runBench(omp2DTiledCollapsedTranspose, reorderCompute)
wv_free(bufOut)
wv_free(bufIn)
dispatch(main)

View File

@ -0,0 +1,319 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Original transposition codes from Laser project
# (c) Mamy André Ratsimbazafy, Apache License version 2
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Memory
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.}=
## Default allocator for the Picasso library
## This allocates memory to hold the type T
## and returns a pointer to it
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
createSharedU(T)
else:
cast[ptr T](c_malloc(csize_t sizeof(T)))
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
## Default allocator for the Picasso library.
## This allocates a contiguous chunk of memory
## to hold ``len`` elements of type T
## and returns a pointer to it.
##
## Can use Nim allocator to measure the overhead of its lock
## Memory is not zeroed
when defined(WV_useNimAlloc):
cast[type result](createSharedU(T, len))
else:
cast[type result](c_malloc(csize_t len*sizeof(T)))
proc wv_free*[T: ptr](p: T) {.inline.} =
when defined(WV_useNimAlloc):
freeShared(p)
else:
c_free(p)
# Transpose implementations
# ---------------------------------------------------
type TransposeStrategy = enum
Sequential
Naive
Nested
TiledNested
# Question: do we need __restrict to prevent the compiler from generating
# defensive, aliasing-robust code?
proc sequentialTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
for j in 0 ..< N:
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc cttNaiveTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix
# Writes are more expensive than reads, so we keep the i accesses linear for the writes
tp.parallelFor j in 0 ..< N:
captures: {M, N, bufIn, bufOut}
for i in 0 ..< M:
bufOut[j*M+i] = bufIn[i*N+j]
proc cttNestedTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose a MxN matrix into a NxM matrix with nested for loops
tp.parallelFor j in 0 ..< N:
captures: {tp, M, N, bufIn, bufOut}
tp.parallelFor i in 0 ..< M:
captures: {j, M, N, bufIn, bufOut}
bufOut[j*M+i] = bufIn[i*N+j]
proc ctt2DTiledNestedTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
## Transpose with 2D tiling and nested
const blck = 64 # const do not need to be captured
tp.parallelFor j in 0 ..< N:
stride: blck
captures: {tp, M, N, bufIn, bufOut}
tp.parallelFor i in 0 ..< M:
stride: blck
captures: {j, M, N, bufIn, bufOut}
for jj in j ..< min(j+blck, N):
for ii in i ..< min(i+blck, M):
bufOut[jj*M+ii] = bufIn[ii*N+jj]
# Meta
# ---------------------------------------------------
func computeMeta(height, width: int): tuple[reqOps, reqBytes, bufSize: int] =
result.reqOps = height * width
result.reqBytes = sizeof(float32) * height * width
result.bufSize = height * width
func initialize(buffer: ptr UncheckedArray[float32], len: int) =
for i in 0 ..< len:
buffer[i] = i.float32
# Bench
# ---------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
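# Injects the three declared counters around `body`:
#   maxRSS      - peak RSS after `body` (KB)
#   runtimeRSS  - increase in peak RSS caused by `body` (KB)
#   pageFaults  - minor page faults incurred by `body`
# All three stay at zero on Windows, where getrusage is unavailable.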
var maxRSS, runtimeRSS, pageFaults: int32
block:
when not defined(windows):
var ru: Rusage
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss
pageFaults = ru.ru_minflt
body
when not defined(windows):
getrusage(RusageSelf, ru)
runtimeRSS = ru.ru_maxrss - runtimeRSS
pageFaults = ru.ru_minflt - pageFaults
maxRss = ru.ru_maxrss
proc report(
M, N: int, nthreads: int32, nrounds: int, reordered: bool,
transposeStrategy: TransposeStrategy, reqOps, reqBytes: int,
mxnTime: float64, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults: int32,
nxmTime: float64, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults: int32,
) =
let arithIntensity = reqOps.float / reqBytes.float
let mxnPerf = reqOps.float/(mxnTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
let nxmPerf = reqOps.float/(nxmTime*1e-3 / nrounds.float) * 1e-9 # Gops per second
echo "--------------------------------------------------------------------------"
echo "Scheduler: Constantine's threadpool"
echo "Benchmark: Transpose - ", $transposeStrategy
echo "Threads: ", nthreads
echo "# of rounds: ", nrounds
echo "# of operations: ", reqOps
echo "# of bytes: ", reqBytes
echo "Arithmetic Intensity: ", round(arithIntensity, 3)
echo "--------------------------------------------------------------------------"
if not reordered:
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
when not defined(windows):
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
when not defined(windows):
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
else:
echo "Transposition: ", N,'x',M, " --> ", M, 'x', N
when not defined(windows):
echo "Time(ms): ", round(nxmTime, 3)
echo "Max RSS (KB): ", nxmMaxRss
echo "Runtime RSS (KB): ", nxmRuntimeRSS
echo "# of page faults: ", nxmPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(mxnPerf, 3)
echo "--------------------------------------------------------------------------"
echo "Transposition: ", M,'x',N, " --> ", N, 'x', M
when not defined(windows):
echo "Time(ms): ", round(mxnTime, 3)
echo "Max RSS (KB): ", mxnMaxRss
echo "Runtime RSS (KB): ", mxnRuntimeRSS
echo "# of page faults: ", mxnPageFaults
echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(nxmPerf, 3)
template runBench(tp: Threadpool, transposeName: typed, reorderCompute, isSequential: bool): untyped =
if not reorderCompute:
if not isSequential:
tp = Threadpool.new()
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(M, N, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
mxnTime = stop - start
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(N, M, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
nxmTime = stop - start
if not isSequential:
tp.shutdown()
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
else:
if not isSequential:
tp = Threadpool.new()
memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(N, M, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
nxmTime = stop - start
memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
when not defined(windows):
let start = wtime_msec()
for _ in 0 ..< nrounds:
tp.transposeName(M, N, bufIn, bufOut)
if not isSequential:
tp.syncAll()
when not defined(windows):
let stop = wtime_msec()
mxnTime = stop - start
if not isSequential:
tp.shutdown()
report(M, N, nthreads, nrounds, reorderCompute,
transposeStrat, reqOps, reqBytes,
mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
)
# Interface
# ---------------------------------------------------
proc main() =
var
M = 400
N = 4000
nrounds = 1000
transposeStrat = TiledNested
reorderCompute = false
if paramCount() == 0:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Running with default M={M}, N={N}, rounds={nrounds}, transposeStrategy={transposeStrat}, reorderCompute={reorderCompute}"
elif paramCount() == 5:
M = paramStr(1).parseInt()
N = paramStr(2).parseInt()
nrounds = paramStr(3).parseInt()
transposeStrat = paramStr(4).parseEnum[:TransposeStrategy]()
reorderCompute = paramStr(5).parseBool()
else:
let exeName = getAppFilename().extractFilename()
echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
echo &"Default \"{exeName} {M} {N} {nrounds} {transposeStrat} {reorderCompute}\""
quit 1
echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"
let isSequential = transposeStrat == Sequential
var nthreads: int32
if transposeStrat == Sequential:
nthreads = 1
elif existsEnv"CTT_NUM_THREADS":
nthreads = getEnv"CTT_NUM_THREADS".parseInt().int32
else:
nthreads = countProcessors().int32
let (reqOps, reqBytes, bufSize) = computeMeta(M, N)
let bufOut = wv_alloc(float32, bufSize)
let bufIn = wv_alloc(float32, bufSize)
bufIn.initialize(bufSize)
var mxnTime, nxmTime: float64
var tp: Threadpool
case transposeStrat
of Sequential: tp.runBench(sequentialTranspose, reorderCompute, isSequential)
of Naive: tp.runBench(cttNaiveTranspose, reorderCompute, isSequential)
of Nested: tp.runBench(cttNestedTranspose, reorderCompute, isSequential)
of TiledNested: tp.runBench(ctt2DTiledNestedTranspose, reorderCompute, isSequential)
wv_free(bufOut)
wv_free(bufIn)
main()

View File

@ -218,7 +218,7 @@ proc main() =
echo "Benchmark: N-queens"
echo "Threads: ", nthreads
when not defined(windows):
echo "Time(us) ", stop - start
echo "Time(ms) ", stop - start
echo "Max RSS (KB): ", ru.ru_maxrss
echo "Runtime RSS (KB): ", rss
echo "# of page faults: ", flt

View File

@ -45,7 +45,7 @@ func initialize*(en: var EventNotifier) {.inline.} =
func `=destroy`*(en: var EventNotifier) {.inline.} =
en.futex.teardown()
func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
func `=copy`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
func prepareToPark*(en: var EventNotifier) {.inline.} =
@ -176,7 +176,7 @@ func initialize*(ec: var EventCount) {.inline.} =
func `=destroy`*(ec: var EventCount) {.inline.} =
ec.futex.teardown()
proc sleepy*(ec: var Eventcount): ParkingTicket {.inline.} =
proc sleepy*(ec: var Eventcount): ParkingTicket {.noInit, inline.} =
## To be called before checking if the condition to not sleep is met.
## Returns a ticket to be used when committing to sleep
let prevState = ec.state.fetchAdd(kPreWait, moAcquireRelease)
@ -209,11 +209,11 @@ proc wakeAll*(ec: var EventCount) {.inline.} =
if (prev and kAnyWaiterMask) != 0:
ec.futex.wakeAll()
proc getNumWaiters*(ec: var EventCount): tuple[preSleep, committedSleep: uint32] {.inline.} =
proc getNumWaiters*(ec: var EventCount): tuple[preSleep, committedSleep: int32] {.noInit, inline.} =
## Get the number of idle threads:
## (planningToSleep, committedToSleep)
let waiters = ec.state.load(moAcquire)
result.preSleep = uint32((waiters and kPreWaitMask) shr kPreWaitShift)
result.committedSleep = uint32(waiters and kWaitMask)
result.preSleep = cast[int32]((waiters and kPreWaitMask) shr kPreWaitShift)
result.committedSleep = cast[int32](waiters and kWaitMask)
{.pop.} # {.push raises:[], checks:off.}

View File

@ -76,7 +76,7 @@ proc peek*(tq: var Taskqueue): int =
##
## This is a non-locking operation.
let # Handle race conditions
b = tq.back.load(moAcquire)
b = tq.back.load(moRelaxed) # Only the producer peeks in the threadpool so moRelaxed is enough
f = tq.front.load(moAcquire)
if b >= f:
@ -199,7 +199,7 @@ proc pop*(tq: var Taskqueue): ptr Task =
# Empty queue, no thieves can have a pointer to an old retired buffer
tq.garbageCollect()
proc steal*(thiefID: uint32, tq: var Taskqueue): ptr Task =
proc steal*(thiefID: int32, tq: var Taskqueue): ptr Task =
## Dequeue an item at the front. Takes ownership of the item
## This is intended for consumers.
var f = tq.front.load(moAcquire)
@ -244,7 +244,7 @@ proc stealHalfImpl(dst: var Buf, dstBack: int, src: var Taskqueue): int =
if compareExchange(src.front, f, f+n, moSequentiallyConsistent, moRelaxed):
return n
proc stealHalf*(thiefID: uint32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
proc stealHalf*(thiefID: int32, dst: var Taskqueue, src: var Taskqueue): ptr Task =
## Dequeue up to half of the items in the `src` tq, from the front.
## Return the last of those, or nil if none found

View File

@ -9,7 +9,7 @@
import
std/atomics,
../instrumentation,
../../allocs,
../../allocs, ../../primitives,
./backoff
# Tasks have an efficient design so that a single heap allocation
@ -29,9 +29,9 @@ type
Task* = object
# Intrusive metadata
# ------------------
parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task
parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task
thiefID*: Atomic[uint32] # ID of the worker that stole and run the task. For leapfrogging.
thiefID*: Atomic[int32] # ID of the worker that stole and run the task. For leapfrogging.
# Result sync
# ------------------
@ -39,21 +39,50 @@ type
completed*: Atomic[bool]
waiter*: Atomic[ptr EventNotifier]
# Data parallelism
# ------------------
isFirstIter*: bool # Awaitable for-loops return true for first iter. Loops are split before first iter.
loopStart*: int
loopStop*: int
loopStride*: int
loopStepsLeft*: int
reductionDAG*: ptr ReductionDagNode # For parallel loop reduction, merge with other range result
# Dataflow parallelism
# --------------------
dependsOnEvent: bool # We cannot leapfrog a task triggered by an event
# Execution
# ------------------
fn*: proc (param: pointer) {.nimcall, gcsafe, raises: [].}
# destroy*: proc (param: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
data*{.align:sizeof(int).}: UncheckedArray[byte]
fn*: proc (env: pointer) {.nimcall, gcsafe, raises: [].}
# destroy*: proc (env: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
envSize*: int32
env*{.align:sizeof(int).}: UncheckedArray[byte]
Flowvar*[T] = object
## A Flowvar is a placeholder for a future result that may be computed in parallel
task: ptr Task
const SentinelThief* = 0xFACADE'u32
ReductionDagNode* = object
## In a parallel reduction, when a loop is split, the worker
## keeps track of the tasks to gather results from in a private, task-local linked list.
## Together, those form a global computation directed acyclic graph
## with the initial parallel reduction task as root.
# Note: while this requires an extra allocation per split,
# the alternative, an intrusive linked list of reduction tasks,
# would require synchronization between threads.
task*: ptr Task
next*: ptr ReductionDagNode
proc new*(
# Tasks
# -------------------------------------------------------------------------
const SentinelThief* = 0xFACADE'i32
proc newSpawn*(
T: typedesc[Task],
parent: ptr Task,
fn: proc (param: pointer) {.nimcall, gcsafe.}): ptr Task {.inline.} =
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].}): ptr Task =
const size = sizeof(T)
@ -64,15 +93,25 @@ proc new*(
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
result.envSize = 0
proc new*(
result.isFirstIter = false
result.loopStart = 0
result.loopStop = 0
result.loopStride = 0
result.loopStepsLeft = 0
result.reductionDAG = nil
result.dependsOnEvent = false
proc newSpawn*(
T: typedesc[Task],
parent: ptr Task,
fn: proc (param: pointer) {.nimcall, gcsafe, raises: [].},
params: auto): ptr Task {.inline.} =
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].},
env: auto): ptr Task =
const size = sizeof(T) + # size without Unchecked
sizeof(params)
sizeof(env)
result = allocHeapUnchecked(T, size)
result.parent = parent
@ -81,7 +120,78 @@ proc new*(
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
cast[ptr[type params]](result.data)[] = params
result.envSize = int32 sizeof(env)
cast[ptr[type env]](result.env)[] = env
result.isFirstIter = false
result.loopStart = 0
result.loopStop = 0
result.loopStride = 0
result.loopStepsLeft = 0
result.reductionDAG = nil
result.dependsOnEvent = false
proc newLoop*(
T: typedesc[Task],
parent: ptr Task,
start, stop, stride: int,
isFirstIter: bool,
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].}): ptr Task =
const size = sizeof(T)
preCondition: start < stop
result = allocHeapUnchecked(T, size)
result.parent = parent
result.thiefID.store(SentinelThief, moRelaxed)
result.hasFuture = false
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
result.envSize = 0
result.isFirstIter = isFirstIter
result.loopStart = start
result.loopStop = stop
result.loopStride = stride
result.loopStepsLeft = ceilDiv_vartime(stop-start, stride)
result.reductionDAG = nil
result.dependsOnEvent = false
proc newLoop*(
T: typedesc[Task],
parent: ptr Task,
start, stop, stride: int,
isFirstIter: bool,
fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].},
env: auto): ptr Task =
const size = sizeof(T) + # size without Unchecked
sizeof(env)
preCondition: start < stop
result = allocHeapUnchecked(T, size)
result.parent = parent
result.thiefID.store(SentinelThief, moRelaxed)
result.hasFuture = false
result.completed.store(false, moRelaxed)
result.waiter.store(nil, moRelaxed)
result.fn = fn
result.envSize = int32(sizeof(env))
cast[ptr[type env]](result.env)[] = env
result.isFirstIter = isFirstIter
result.loopStart = start
result.loopStop = stop
result.loopStride = stride
result.loopStepsLeft = ceilDiv_vartime(stop-start, stride)
result.reductionDAG = nil
result.dependsOnEvent = false
# Flowvars
# -------------------------------------------------------------------------
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
@ -91,9 +201,9 @@ proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} =
# Task with future references themselves so that readyWith can be called
# within the constructed
# proc async_fn(param: pointer) {.nimcall.}
# that can only access data
cast[ptr ptr Task](task.data.addr)[] = task
# proc threadpoolSpawn_fn(env: pointer) {.nimcall.}
# that can only access env
cast[ptr ptr Task](task.env.addr)[] = task
proc cleanup*(fv: var Flowvar) {.inline.} =
fv.task.freeHeap()
@ -117,13 +227,24 @@ func readyWith*[T](task: ptr Task, childResult: T) {.inline.} =
## Send the Flowvar result from the child thread processing the task
## to its parent thread.
precondition: not task.completed.load(moAcquire)
cast[ptr (ptr Task, T)](task.data.addr)[1] = childResult
cast[ptr (ptr Task, T)](task.env.addr)[1] = childResult
task.completed.store(true, moRelease)
proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
proc sync*[T](fv: sink Flowvar[T]): T {.noInit, inline, gcsafe.} =
## Blocks the current thread until the flowvar is available
## and returned.
## The thread is not idle and will complete pending tasks.
mixin completeFuture
if fv.task.isNil:
zeroMem(result.addr, sizeof(T))
return
completeFuture(fv, result)
cleanup(fv)
# ReductionDagNodes
# -------------------------------------------------------------------------
proc newReductionDagNode*(task: ptr Task, next: ptr ReductionDagNode): ptr ReductionDagNode {.inline.} =
result = allocHeap(ReductionDagNode)
result.next = next
result.task = task

View File

@ -0,0 +1,142 @@
# Raytracing
This is a port to Nim and Weave of
SmallPT by Kevin Beason https://www.kevinbeason.com/smallpt/
The original C++ version is also provided (with 4 extra lines to highlight original code quirks when porting).
## Showcase
The Nim version can use a single parallel-for loop
or nested parallel-for loops for better load balancing (important when the inner loop is actually larger than the outer loop).
At the moment, there is no random number generator that can deal with
the dynamic thread migration of Weave so we get interesting artifacts
compared to the single-threaded or the single parallel-for versions.
**Single-thread or single parallel-for**
![ray_trace_300samples_nim_threaded](ray_trace_300samples_nim_threaded.png)
**Nested parallel-for**
![ray_trace_300samples_nim_nested](ray_trace_300samples_nim_nested.png)
## Benchmark
Note: except for the nested parallelism which has RNG issue,
the Nim and C++ versions are pixel equivalent.
### Setup
CPU: i9-9980XE, 18 cores, overclocked at 4.1GHz all-core turbo (from 3.0 nominal)
The code was compiled with default flags, hence baseline x86-64, hence SSE2.
- Nim devel (1.3.5 2020-05-16) + GCC v10.1.0
- `nim c --threads:off -d:danger`
- `nim c --threads:on -d:danger`
- GCC v10.1.0
- `-O3`
- `-O3 -fopenmp`
- GCC v8.4.0
- `-O3`
- `-O3 -fopenmp`
- Clang v10.0.0
- `-O3`
- `-O3 -fopenmp`
### Commands
```bash
git clone https://github.com/mratsim/weave
cd weave
nimble install -y # install Weave dependencies, here synthesis, overwriting if asked.
nim -v # Ensure you have nim 1.2.0 or more recent
# Threads on (by default in this repo)
nim c -d:danger -o:build/ray_threaded demos/raytracing/smallpt.nim
# Threads off
nim c -d:danger --threads:off -o:build/ray_single demos/raytracing/smallpt.nim
g++ -O3 -o build/ray_gcc_single demos/raytracing/smallpt.cpp
g++ -O3 -fopenmp -o build/ray_gcc_omp demos/raytracing/smallpt.cpp
clang++ -O3 -o build/ray_clang_single demos/raytracing/smallpt.cpp
clang++ -O3 -fopenmp -o build/ray_clang_omp demos/raytracing/smallpt.cpp
```
Then run for 300 samples with
```
build/ray_threaded 300
# ...
build/ray_clang_omp 300
```
### Results & Analysis
GCC 10 has a significant OpenMP regression
| Bench | Nim | Clang C++ OpenMP | GCC 10 C++ OpenMP | GCC 8 C++ OpenMP |
| ---------------- | ----------: | ---------------: | ----------------: | ---------------: |
| Single-threaded  | 4m43.369s   | 4m51.052s        | 4m50.934s         | 4m50.648s        |
| Multithreaded    | 12.977s     | 14.428s          | 2m14.616s         | 12.244s          |
| Nested-parallel | 12.981s | | | |
| Parallel speedup | 21.83x | 20.17x | 2.16x | 23.74x |
Single-threaded Nim is 2.7% faster than Clang C++.
Multithreaded Nim via Weave is 11.1% faster than Clang C++.
GCC 8, despite a simpler OpenMP design (a global task queue instead of work-stealing),
achieves a better speedup than both Weave and Clang.
In that case, I expect it's because the tasks are so big that there is minimal contention
on the task queue; furthermore, the OpenMP schedule is "dynamic", so we avoid the worst-case scenario
of static scheduling, where a bunch of threads are assigned easy rays that never collide with a surface
and a couple of threads are drowned in complex rays.
I have absolutely no idea of what happened to OpenMP in GCC 10.
Note: I only have 18 cores but we observe speedups in the 20x range
with Weave and LLVM. This is probably due to 2 factors:
- Raytracing is pure compute, contrary in particular to high-performance computing
  and machine learning workloads, which are also very memory-intensive (matrices and tensors with thousands to millions of elements).
  The scene has only 10 objects and a camera to keep track of.
  Memory is extremely slow: you can do 100 additions while waiting for data
  in the L2 cache.
- A CPU has a certain number of execution ports and can use instruction-level parallelism (we call that superscalar). Hyperthreading is a way to
  use those extra execution ports. However, in many computing workloads
  the sibling cores are also competing for memory bandwidth.
  From the previous point, this is not true for raytracing,
  and so we enjoy superlinear speedup.
## License
Kevin Beason's code is licensed as follows (e-mail redacted to avoid spam):
```
LICENSE
Copyright (c) 2006-2008 Kevin Beason (<surname>.<name>@gmail.com)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```

Binary file not shown.

After

Width: | Height: | Size: 1.7 MiB

Binary file not shown.

After

Width: | Height: | Size: 1.8 MiB

View File

@ -0,0 +1,103 @@
#include <math.h> // smallpt, a Path Tracer by Kevin Beason, 2008
#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <stdio.h> // Remove "-fopenmp" for g++ version < 4.2
struct Vec { // Usage: time ./smallpt 5000 && xv image.ppm
double x, y, z; // position, also color (r,g,b)
Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross:
Vec operator%(Vec&b){return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);}
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance()
struct Sphere {
double rad; // radius
Vec p, e, c; // position, emission, color
Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
double intersect(const Ray &r) const { // returns distance, 0 if nohit
Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
if (det<0) return 0; else det=sqrt(det);
return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
}
};
Sphere spheres[] = {//Scene: radius, position, emission, color, material
Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),//Back
Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),//Frnt
Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),//Botm
Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.999, SPEC),//Mirr
Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(1,1,1)*.999, REFR),//Glas
Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12), Vec(), DIFF) //Lite
};
inline double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); }
inline bool intersect(const Ray &r, double &t, int &id){
double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20;
for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
return t<inf;
}
Vec radiance(const Ray &r, int depth, unsigned short *Xi){
double t; // distance to intersection
int id=0; // id of intersected object
if (!intersect(r, t, id)) return Vec(); // if miss, return black
const Sphere &obj = spheres[id]; // the hit object
Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return obj.e; //R.R.
if (obj.refl == DIFF){ // Ideal DIFFUSE reflection
double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();
return obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
} else if (obj.refl == SPEC) // Ideal SPECULAR reflection
return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
Ray reflRay(x, r.d-n*2*n.dot(r.d)); // Ideal dielectric REFRACTION
bool into = n.dot(nl)>0; // Ray from outside going in?
double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) // Total internal reflection
return obj.e + f.mult(radiance(reflRay,depth,Xi));
Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
return obj.e + f.mult(depth>2 ? (erand48(Xi)<P ? // Russian roulette
radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
// For determinism due to the RNG: C++ executes the right side first.
radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
}
int main(int argc, char *argv[]){
int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples
Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
for (int y=0; y<h; y++){ // Loop over image rows
fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++) // Loop cols
for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++) // 2x2 subpixel rows
for (int sx=0; sx<2; sx++, r=Vec()){ // 2x2 subpixel cols
for (int s=0; s<samps; s++){
double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
// Warning: The d in "cam.o+d*140" is actually d.norm()
// due to right to left execution and norm() working
// both in-place and out-place
r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
} // Camera rays are pushed ^^^^^ forward to start in interior
c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
}
}
FILE *f = fopen("image.ppm", "w"); // Write image to PPM file.
fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
for (int i=0; i<w*h; i++)
fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

View File

@ -0,0 +1,339 @@
# Weave
# Copyright (c) 2020 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Warning: the original code-golfing is pretty extreme.
# The worst part is declaring and resetting non-loop variables
# in the for-loop header, including the random seed ...
import std/[math, strformat, os, strutils]
type Vec = object
x, y, z: float64
func vec(x, y, z: float64 = 0): Vec =
Vec(x: x, y: y, z: z)
func `+`(a, b: Vec): Vec =
vec(a.x+b.x, a.y+b.y, a.z+b.z)
func `-`(a, b: Vec): Vec =
vec(a.x-b.x, a.y-b.y, a.z-b.z)
func `*`(a: Vec, t: float64): Vec =
vec(a.x*t, a.y*t, a.z*t)
func `*.`(a, b: Vec): Vec =
# hadamard product
vec(a.x*b.x, a.y*b.y, a.z*b.z)
func norm(a: Vec): Vec =
# Warning! Warning! The original code
# mutate the vec in-place AND returns a reference to the mutated input
a * (1/sqrt(a.x*a.x + a.y*a.y + a.z*a.z))
func dot(a, b: Vec): float64 =
a.x*b.x + a.y*b.y + a.z*b.z
func cross(a, b: Vec): Vec =
vec(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x)
type
Ray = object
o, d: Vec # origin, direction
Reflection = enum
Diffuse, Specular, Refractive
Sphere = object
radius: float64
pos, emit, color: Vec
refl: Reflection
func ray(origin: Vec, direction: Vec): Ray =
result.o = origin
result.d = direction
func intersect(self: Sphere, ray: Ray): float64 =
## Returns distance, 0 if no hit
## Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
let op = self.pos - ray.o
const eps = 1e-4
let b = op.dot(ray.d)
var det = b*b - op.dot(op) + self.radius*self.radius
if det < 0.0:
return 0.0
det = sqrt(det)
block:
let t = b-det
if t > eps:
return t
block:
let t = b+det
if t > eps:
return t
return 0.0
const spheres = [ # Scene: radius, position, emission, color, material # Walls approximated by very large spheres
Sphere(radius:1e5, pos:vec( 1e5+1, 40.8, 81.6), color:vec(0.75,0.25,0.25),refl:Diffuse), # Left
Sphere(radius:1e5, pos:vec(-1e5+99, 40.8, 81.6), color:vec(0.25,0.25,0.75),refl:Diffuse), # Right
Sphere(radius:1e5, pos:vec(50, 40.8, 1e5), color:vec(0.75,0.75,0.75),refl:Diffuse), # Back
Sphere(radius:1e5, pos:vec(50, 40.8,-1e5+170), refl:Diffuse), # Front
Sphere(radius:1e5, pos:vec(50, 1e5, 81.6), color:vec(0.75,0.75,0.75),refl:Diffuse), # Bottom
Sphere(radius:1e5, pos:vec(50, -1e5+81.6, 81.6), color:vec(0.75,0.75,0.75),refl:Diffuse), # Top
Sphere(radius:16.5,pos:vec(27, 16.5, 47), color:vec(1,1,1)*0.999, refl:Specular), # Mirror
Sphere(radius:16.5,pos:vec(73, 16.5, 78), color:vec(1,1,1)*0.999, refl:Refractive),# Glass
Sphere(radius:600, pos:vec(50, 681.6-0.27,81.6), emit:vec(12,12,12), refl:Diffuse), # Light
]
func clamp(x: float64): float64 {.inline.} =
if x < 0: 0.0 else: (if x > 1: 1.0 else: x)
func toInt(x: float64): int32 =
# This seems to do gamma correction by 2.2
int32(
pow(clamp(x),1/2.2) * 255 + 0.5
)
func intersect(r: Ray, t: var float64, id: var int32): bool =
# out parameters ...
const inf = 1e20
t = inf
for i in countdown(spheres.len-1, 0):
let d = spheres[i].intersect(r)
if d != 0 and d < t:
t = d
id = i.int32
return t < inf
when defined(cpp):
# Seems like Nim codegen for mutable arrays is slightly different from the C++ API
# and needs a compatibility shim
proc erand48(xi: ptr cushort): cdouble {.importc, header:"<stdlib.h>", sideeffect.}
proc erand48(xi: var array[3, cushort]): float64 {.inline.} =
erand48(xi[0].addr)
else:
# Need the same RNG for comparison
proc erand48(xi: var array[3, cushort]): cdouble {.importc, header:"<stdlib.h>", sideeffect.}
proc radiance(r: Ray, depth: int32, xi: var array[3, cushort]): Vec =
var t: float64 # distance to intersection
var id = 0'i32 # id of intersected object
if not r.intersect(t, id): # if miss return black
return vec()
template obj: untyped = spheres[id] # alias the hit object
let x = r.o + r.d * t
let n = norm(x-obj.pos);
let nl = if n.dot(r.d) < 0: n else: n * -1
var f = obj.color
let p = max(f.x, max(f.y, f.z)) # max reflect
let depth=depth+1
if depth>5:
if erand48(xi) < p:
f = f*(1/p)
else:
return obj.emit # Russian Roulette
if obj.refl == Diffuse: # ideal diffuse reflection
let
r1 = 2*PI*erand48(xi)
r2 = erand48(xi)
r2s = sqrt(r2)
w = nl
u = (if w.x.abs() > 0.1: vec(0,1) else: vec(1)).cross(w).norm()
v = w.cross(u)
d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm()
return obj.emit + f *. radiance(ray(x, d), depth, xi)
elif obj.refl == Specular: # ideal specular reflection
return obj.emit + f *. radiance(ray(x, r.d - n*2*n.dot(r.d)), depth, xi)
# Dielectric refraction
let
reflRay = ray(x, r.d - n*2*n.dot(r.d))
into = n.dot(nl) > 0 # Ray from outside going in
nc = 1.0
nt = 1.5
nnt = if into: nc/nt else: nt/nc
ddn = r.d.dot(nl)
cos2t = 1-nnt*nnt*(1-ddn*ddn)
if cos2t < 0:
return obj.emit + f *. radiance(reflRay, depth, xi)
let
tdir = (r.d*nnt - n*(if into: 1 else: -1)*(ddn*nnt+sqrt(cos2t))).norm()
a = nt - nc
b = nt + nc
R0 = a*a/(b*b)
c = 1 - (if into: -ddn else: tdir.dot(n))
Re = R0 + (1-R0)*c*c*c*c*c
Tr = 1-Re
P = 0.25+0.5*Re
RP = Re/P
TP = Tr/(1-P)
return obj.emit + f *. (block:
if depth>2: # Russian roulette
if erand48(xi)<P:
radiance(reflRay, depth, xi)*RP
else:
radiance(ray(x, tdir), depth, xi)*TP
else:
# Note: to exactly reproduce the C++ result,
# since we have a random function and C++ seems to resolve
# from right to left, we exchange our processing order
radiance(ray(x, tdir), depth, xi)*Tr +
radiance(reflRay, depth, xi)*Re
)
proc tracer_single(C: var seq[Vec], w, h: static int, samples: int) =
const
cam = ray(vec(50,52,295.6), vec(0,-0.042612,-1).norm())
cx = vec(w*0.5135/h)
cy = cx.cross(cam.d).norm()*0.5135
for y in 0 ..< h: # Loop over image rows
stderr.write &"\rRendering ({samples*4} samples per pixel) {100.0*y.float64/float(h-1):5.2f}%"
var xi = [cushort 0, 0, cushort y*y*y]
for x in 0 ..< w: # Loop over columns
let i = (h-y-1)*w+x
for sy in 0 ..< 2: # 2x2 subpixel rows
for sx in 0 ..< 2: # 2x2 subpixel cols
var r = vec()
for s in 0 ..< samples:
let
r1 = 2*erand48(xi)
dx = if r1<1: sqrt(r1)-1 else: 1-sqrt(2-r1)
r2 = 2*erand48(xi)
dy = if r2<1: sqrt(r2)-1 else: 1-sqrt(2-r2)
d = cx*(((sx.float64 + 0.5 + dx)/2 + x.float64)/w - 0.5) +
cy*(((sy.float64 + 0.5 + dy)/2 + y.float64)/h - 0.5) + cam.d
let dnorm = d.norm() # Warning, the original code is deceptive since d is modified by d.norm()
let ray = ray(cam.o + dnorm*140'f64, dnorm)
let rad = radiance(ray, depth = 0, xi)
r = r + rad * (1.0/samples.float64)
C[i] = C[i] + vec(r.x.clamp(), r.y.clamp(), r.z.clamp()) * 0.25 # / num subpixels
when compileOption("threads"):
import ../../threadpool
proc tracer_threaded(C: var seq[Vec], w, h: static int, samples: int) =
# This gives the exact same result as single-threaded and GCC OpenMP,
# assuming that C++ resolves function calls from right to left
const
cam = ray(vec(50,52,295.6), vec(0,-0.042612,-1).norm())
cx = vec(w*0.5135/h)
cy = cx.cross(cam.d).norm()*0.5135
# We need the buffer raw address
let buf = cast[ptr UncheckedArray[Vec]](C[0].addr)
var tp = Threadpool.new()
tp.parallelFor y in 0 ..< h: # Loop over image rows
captures: {tp, buf, samples}
try:
stderr.write &"\rRendering ({samples*4} samples per pixel) {100.0*y.float64/float(h-1):5.2f}%"
except:
echo getCurrentExceptionMsg()
quit 1
var xi = [cushort 0, 0, cushort y*y*y]
for x in 0 ..< w: # Loop over columns
let i = (h-y-1)*w+x
for sy in 0 ..< 2: # 2x2 subpixel rows
for sx in 0 ..< 2: # 2x2 subpixel cols
var r = vec()
for s in 0 ..< samples:
let
r1 = 2*erand48(xi)
dx = if r1<1: sqrt(r1)-1 else: 1-sqrt(2-r1)
r2 = 2*erand48(xi)
dy = if r2<1: sqrt(r2)-1 else: 1-sqrt(2-r2)
d = cx*(((sx.float64 + 0.5 + dx)/2 + x.float64)/w - 0.5) +
cy*(((sy.float64 + 0.5 + dy)/2 + y.float64)/h - 0.5) + cam.d
let dnorm = d.norm() # Warning, the original code is deceptive since d is modified by d.norm()
let ray = ray(cam.o + dnorm*140'f64, dnorm)
let rad = radiance(ray, depth = 0, xi)
r = r + rad * (1.0/samples.float64)
buf[i] = buf[i] + vec(r.x.clamp(), r.y.clamp(), r.z.clamp()) * 0.25 # / num subpixels
tp.shutdown() # This implicitly blocks until tasks are all done
proc tracer_nested_parallelism(C: var seq[Vec], w, h: static int, samples: int) =
# The results are different since the RNG will not be seeded the same.
# The RNG needs to be thread-local, but task stealing + nested parallelism makes the
# actual executing thread pretty random.
# So the RNG has been moved to an inner scope;
# the downside is that the resulting images have horizontal noise instead of pointwise noise.
const
cam = ray(vec(50,52,295.6), vec(0,-0.042612,-1).norm())
cx = vec(w*0.5135/h)
cy = cx.cross(cam.d).norm()*0.5135
# We need the buffer raw address
let buf = cast[ptr UncheckedArray[Vec]](C[0].addr)
var tp = Threadpool.new()
tp.parallelFor y in 0 ..< h: # Loop over image rows
captures: {buf, samples}
try:
stderr.write &"\rRendering ({samples*4} samples per pixel) {100.0*y.float64/float(h-1):5.2f}%"
except:
echo getCurrentExceptionMsg()
quit 1
tp.parallelFor x in 0 ..< w: # Loop over columns
captures: {y, buf, samples}
var xi = [cushort 0, 0, cushort y*y*y]
let i = (h-y-1)*w+x
for sy in 0 ..< 2: # 2x2 subpixel rows
for sx in 0 ..< 2: # 2x2 subpixel cols
var r = vec()
for s in 0 ..< samples:
let
r1 = 2*erand48(xi)
dx = if r1<1: sqrt(r1)-1 else: 1-sqrt(2-r1)
r2 = 2*erand48(xi)
dy = if r2<1: sqrt(r2)-1 else: 1-sqrt(2-r2)
d = cx*(((sx.float64 + 0.5 + dx)/2 + x.float64)/w - 0.5) +
cy*(((sy.float64 + 0.5 + dy)/2 + y.float64)/h - 0.5) + cam.d
let dnorm = d.norm() # Warning, the original code is deceptive since d is modified by d.norm()
let ray = ray(cam.o + dnorm*140'f64, dnorm)
let rad = radiance(ray, depth = 0, xi)
r = r + rad * (1.0/samples.float64)
buf[i] = buf[i] + vec(r.x.clamp(), r.y.clamp(), r.z.clamp()) * 0.25 # / num subpixels
tp.shutdown() # This implicitly blocks until tasks are all done
proc main() =
const w = 1024
const h = 768
var samples = 1
var outFile = "image.ppm"
let exeName = getAppFilename().extractFilename()
if paramCount() == 0:
echo &"Usage: {exeName} <samples per pixel:{samples*4}> <output image:{outFile}>"
echo &"Running with default samples = {samples*4}, output = {outFile}"
elif paramCount() == 1:
samples = paramStr(1).parseInt() div 4 # (2*2 blocks)
samples = max(samples, 1)
elif paramCount() == 2:
samples = paramStr(1).parseInt() div 4
samples = max(samples, 1)
outFile = paramStr(2)
else:
echo &"Usage: {exeName} <samples per pixel:{samples}> <output image:{outFile}>"
var C = newSeq[Vec](w*h)
when compileOption("threads"):
echo "Running multithreaded. (⚠️ Stack overflow at depth 4480 (400 samples)?)"
# tracer_threaded(C, w, h, samples)
tracer_nested_parallelism(C, w, h, samples)
else:
echo "Running single-threaded."
tracer_single(C, w, h, samples)
let f = open(outFile, mode = fmWrite)
defer: f.close()
f.write &"P3\n{w} {h}\n255\n"
for i in 0 ..< w*h:
f.write &"{C[i].x.toInt()} {C[i].y.toInt()} {C[i].z.toInt()} "
stderr.write "\nDone.\n"
main()

View File

@ -0,0 +1,109 @@
# Threadpool design
The threadpool design is heavily inspired by [Weave](https://github.com/mratsim/weave), the wealth of preparatory [research](https://github.com/mratsim/weave/tree/master/research) and the simplified Weave, [nim-taskpools](https://github.com/status-im/nim-taskpools)
The goal is to produce an extremely high-performance, low-overhead, energy-efficient multithreading runtime.
However, as the backend to a cryptographic library it needs to be high-assurance, in particular auditable and maintainable.
Unfortunately, Weave's design, based on work-requesting, requires more machinery compared to work-stealing, which means more state. Furthermore, it includes a custom memory pool.
On the other hand, nim-taskpools does not support data parallelism (parallel for loop).
Also neither supports putting awaiting threads to sleep when the future they want to complete is not done AND there is no work left.
## Features
| Features | OpenMP | Weave | nim-taskpools | Constantine's Threadpool |
|--------------------------------------------------------------------------------------------------|------------------------------------------------------------|----------------------------------------------------|---------------|----------------------------------------------------|
| Task parallelism (Futures with spawn/sync) | no | yes | yes | yes |
| Data parallelism (parallel for-loop) | yes | yes | no | yes |
| Nested parallel-for regions support | no (lead to oversubscription) | yes | N/A | yes |
| Dataflow parallelism (Tasks triggered by events / precise task dependencies) | yes | yes | no | yes |
| Communication mechanisms | Shared-memory | Message-passing / Channels | Shared-memory | Shared-memory |
| Load balancing strategy | static (GCC), work-stealing (Intel/LLVM) | work-sharing / work-requesting | work-stealing | work-stealing |
| Blocked tasks don't block runtime | N/A | no | yes | yes |
| Load-balancing strategy for task parallelism (important for fine-grained parallelism) | global queue (GCC), steal-one (Intel/LLVM) | Adaptative steal-one/steal-half | steal-one | steal-one (steal-half WIP) |
| Load-balancing strategy for data parallelism | eager splitting depending on iteration count and cpu count | lazy splitting depending on idle CPUs and workload | N/A | lazy splitting depending on idle CPUs and workload |
| Backoff worker when idle | yes (?) | yes | yes | yes |
| Backoff worker when awaiting task but no work | N/A | no | no | yes |
| Scheduler overhead/contention (measured on Fibonacci 40), important for fine-grained parallelism | Extreme: frozen runtime (GCC), high (Intel/LLVM) | low to very low | medium | low |
## Key features design
### Load-balancing for data parallelism
A critical issue in most (all?) runtimes used in HPC (OpenMP and Intel TBB in particular) is that they split their parallel for loops ahead of time.
They do not know how many idle threads there are, or how costly the workload will be. This leads to significant inefficiencies and performance unportability.
For example, this repo https://github.com/zy97140/omp-benchmark-for-pytorch gives, for common float operations, the element-count thresholds under which parallelization is not profitable or even hurts performance:
> |CPU Model|Sockets|Cores/Socket|Frequency|
> |---|---|---|---|
> |Intel(R) Xeon(R) CPU E5-2699 v4 |2|22|2.20GHz|
> |Intel(R) Xeon(R) Platinum 8180 CPU|2|28|2.50GHz|
> |Intel(R) Core(TM) i7-5960X CPU |1|8|3.00GHz|
>
> | |Xeon(R) Platinum 8180 CPU|Xeon(R) CPU E5-2699 v4| i7-5960X CPU|
> |---|------------------------:|---------------------:|------------:|
> |copy|80k|20k|8k|
> |add |80k|20k|8k|
> |div |50k|10k|2k|
> |exp |1k |1k |1k|
> |sin |1k |1k |1k|
> |sum |1k |1k |1k|
> |prod|1k |1k |1k|
>
> Details on the Xeon Platinum
>
> |Tensor Size|In series|In parallel|SpeedUp|
> |---|---:|---:|---:|
> |1k |1.04 |5.15| 0.20X |
> |2k |1.23 |5.47| 0.22X |
> |3k |1.33 |5.34| 0.24X |
> |4k |1.47 |5.41| 0.27X |
> |5k |1.48 |5.40| 0.27X |
> |8k |1.81 |5.55| 0.32X |
> |10k|1.98 |5.66| 0.35X |
> |20k|2.74 |6.74| 0.40X |
> |50k|5.12 |6.59| 0.77X |
> |__80k__|__14.79__|__6.59__| __2.24X__ |
> |__100k__|__21.97__|__6.70__| __3.27X__ |
Instead we can have each thread start working and use backpressure to lazily evaluate when it's profitable to split:
- Lazy Binary-Splitting: A Run-Time Adaptive Work-Stealing Scheduler
Tzannes, Caragea, Barua, Vishkin
https://terpconnect.umd.edu/~barua/ppopp164.pdf
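
As a minimal sketch of the idea, using the loop-task fields introduced in this PR (`WorkerContext`, `numIdleWorkersApprox` and `schedule` are illustrative placeholders, not this PR's exact internals): a worker keeps its whole range while everyone is busy, and gives half of its remaining iterations away only when it detects idle peers.

```nim
proc splitLoopIfProfitable(ctx: var WorkerContext, task: ptr Task) =
  ## Backpressure-driven splitting: no eager, ahead-of-time division.
  if task.loopStepsLeft > 1 and ctx.numIdleWorkersApprox() > 0:
    let stepsToGive = task.loopStepsLeft shr 1
    let mid = task.loopStart + (task.loopStepsLeft - stepsToGive) * task.loopStride
    let upperHalf = Task.newLoop(
      parent = task,
      start = mid, stop = task.loopStop, stride = task.loopStride,
      isFirstIter = false,
      fn = task.fn)
    task.loopStop = mid                 # keep the lower half ourselves
    task.loopStepsLeft -= stepsToGive
    ctx.schedule(upperHalf)
```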
### Backoff workers when awaiting a future
This problem is quite tricky:
- For latency, and to potentially create more work ASAP, we want such a worker to follow through on the blocked future ASAP.
- For throughput, and because a scheduler is optimal only when greedy, we want an idle thread to take any available work.
- But what if the work then blocks that worker for 1s? This hurts latency and might lead to work starvation if the continuation would have created more work.
- There is no robust, cross-platform API, to wake a specific thread awaiting on a futex or condition variable.
- The simplest design would then be an array of futexes, with backing-off workers sleeping on those.
  The issue is that when offering work you have to scan that array to find a worker to wake.
  Contrary to an idle worker, the waker is working, so this scan hurts throughput and latency and, due to the many
  atomic operations required, will completely thrash that worker's cache.
- Another potential data structure would be a concurrent sparse-set, but designing concurrent data structures is difficult,
  and locking would be expensive for an active worker.
The solution in Constantine's threadpool, to minimize latency, maximize throughput, and avoid implementing yet another concurrent data structure, is
to have "reserve threads". Before sleeping when blocked on a future, a thread wakes a reserve thread. This maintains throughput, and the blocked thread is immediately available
when the future is completed. A well-behaved program will always have at least 1 thread among N making progress, so a reserve of size N is sufficient.
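
A hedged sketch of that protocol, as seen from a thread blocking on a future (`WorkerContext`, `tryStealOne`, `run`, `wakeReserve` and `park` are illustrative names, not this PR's exact internals; `completed` and `sync` come from the tasks/flowvars code in this diff):

```nim
proc syncBlocking[T](ctx: var WorkerContext, fv: Flowvar[T]): T =
  ## Stay greedy while the future is pending; only once no work
  ## is left, hand our CPU slot to a reserve thread and park.
  while not fv.task.completed.load(moAcquire):
    let task = ctx.tryStealOne()
    if not task.isNil:
      ctx.run(task)       # keep making progress on other work
    else:
      ctx.wakeReserve()   # keep N threads running while we block
      ctx.park()          # woken when the future completes
  result = sync(fv)       # now completes without blocking
```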
### Scheduler overhead/contention
To enable fine-grained parallelism, i.e. parallelizing tasks in the microseconds range, it's critical to reduce contention.
A global task queue will be hammered by N threads, leading to threads thrashing each other's caches.
In contrast, distributed task queues with random victim selection significantly reduce contention.
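
For reference, the steal side of such a scheme looks like the sketch below; `steal` matches this PR's taskqueue signature, while `WorkerContext` and the RNG helper are illustrative:

```nim
proc tryStealOne(ctx: var WorkerContext): ptr Task =
  ## Random victim selection: each worker probes the distributed
  ## queues in a random order, so no single queue becomes a hotspot.
  for _ in 0 ..< ctx.numWorkers:
    let victim = ctx.rng.randomVictim(ctx.numWorkers) # hypothetical RNG helper
    if victim == ctx.workerID:
      continue
    result = steal(ctx.workerID, ctx.taskqueues[victim])
    if not result.isNil:
      return result
  return nil
```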
Another source of overhead is the allocator. The worst case for allocators is allocating in one thread and deallocating in another, especially if the
allocating thread is always the same. Unfortunately, this is common in producer-consumer workloads.
Besides, multithreaded allocations/deallocations trigger costly atomic swaps and possibly fragmentation.
Minimizing allocations to the utmost will significantly help with fine-grained tasks.
- Weave solved that problem by having 2 levels of cache: a memory-pool for tasks and futures and a lookaside list that caches tasks to reduce further pressure on the memory pool.
- Nim-taskpools does not address this problem; it has an allocation overhead per task of 1 for std/tasks, 1 for the linked list that holds them, and 1 for the result channel/flowvar.
  Unlike GCC OpenMP, which freezes on a fibonacci 40 benchmark, it can still finish, but it's 20x slower than Weave.
- Constantine's threadpool solves the problem by making everything intrusive to a task: the task env, the future, the linked list.
In fact this solution is even faster than Weave's, probably due to significantly less page faults and cache misses.
Note that Weave has an even faster mode when futures don't escape their function by allocating them on the stack but without compiler support (heap allocation elision) that restricts the programs you can write.
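
For illustration, here is what that intrusiveness means concretely, pieced together from the `newSpawn`/`readyWith` code earlier in this diff (a sketch, not a verbatim excerpt):

```nim
# One allocation per task: the metadata plus the inline `env` buffer.
let task = allocHeapUnchecked(Task, sizeof(Task) + sizeof(env))
# For a task with a future, `env` is laid out as
#   | ptr Task (self-reference) | result | arg₀ | ... | argₙ |
# so completing the future is a store into the task's own memory:
cast[ptr (ptr Task, T)](task.env.addr)[1] = res   # cf. readyWith
task.completed.store(true, moRelease)
```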

View File

@ -0,0 +1,60 @@
import ../threadpool, ../instrumentation
block:
proc main() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e03_parallel_for.nim'"
echo "=============================================================================================="
var tp = Threadpool.new(numThreads = 4)
tp.parallelFor i in 0 ..< 100:
log("%d\n", i)
tp.shutdown()
echo "Simple parallel for"
echo "-------------------------"
main()
echo "-------------------------"
block: # Capturing outside scope
proc main2() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e03_parallel_for.nim'"
echo "=============================================================================================="
var tp = Threadpool.new(numThreads = 4)
var a = 100
var b = 10
tp.parallelFor i in 0 ..< 10:
captures: {a, b}
log("a+b+i = %d \n", a+b+i)
tp.shutdown()
echo "\n\nCapturing outside variables"
echo "-------------------------"
main2()
echo "-------------------------"
block: # Nested loops
proc main3() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e03_parallel_for.nim'"
echo "=============================================================================================="
var tp = Threadpool.new(numThreads = 4)
tp.parallelFor i in 0 ..< 4:
tp.parallelFor j in 0 ..< 8:
captures: {i}
log("Matrix[%d, %d]\n", i, j)
tp.shutdown()
echo "\n\nNested loops"
echo "-------------------------"
main3()
echo "-------------------------"

View File

@ -0,0 +1,34 @@
import ../threadpool
block:
proc main() =
echo "\n=============================================================================================="
echo "Running 'threadpool/examples/e04_parallel_reduce.nim'"
echo "=============================================================================================="
proc sumReduce(tp: Threadpool, n: int): int64 =
tp.parallelFor i in 0 .. n:
reduceInto(globalSum: int64):
prologue:
var localSum = 0'i64
forLoop:
localSum += int64(i)
merge(remoteSum: Flowvar[int64]):
localSum += sync(remoteSum)
epilogue:
return localSum
result = sync(globalSum)
var tp = Threadpool.new(numThreads = 4)
let sum1M = tp.sumReduce(1000000)
echo "Sum reduce(0..1000000): ", sum1M
doAssert sum1M == 500_000_500_000'i64
tp.shutdown()
echo "Simple parallel reduce"
echo "-------------------------"
main()
echo "-------------------------"

View File

@ -14,6 +14,10 @@ template log*(args: varargs[untyped]): untyped =
c_printf(args)
flushFile(stdout)
template debugSplit*(body: untyped): untyped =
when defined(TP_DebugSplit) or defined(TP_Debug):
{.noSideEffect, gcsafe.}: body
template debugTermination*(body: untyped): untyped =
when defined(TP_DebugTermination) or defined(TP_Debug):
{.noSideEffect, gcsafe.}: body

View File

@ -1,6 +1,5 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
@ -8,111 +7,128 @@
import
std/macros,
./crossthread/tasks_flowvars
./crossthread/tasks_flowvars,
../ast_rebuilder
# Task parallelism - spawn
# ---------------------------------------------
# Parallel offloading API
# -----------------------
# This file implements all the macros necessary
# to provide a comprehensive and hopefully intuitive API
# for all the parallelism paradigms supported:
#
# - Task parallelism
# - Data parallelism / parallel for
# - parallel-for with thread-local prologue and epilogue
# - parallel-reduction without atomics or locks
# - Dataflow parallelism
# - also known as:
# - Graph parallelism
# - Stream parallelism
# - Pipeline parallelism
# - Data-driven (task) parallelism
# with precise input/output dependencies
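#
# A quick usage sketch of the user-facing API built in this file
# (see the examples directory in this PR for runnable versions):
#
#   let fut = tp.spawn f(a, b)     # task parallelism, returns a Flowvar
#   tp.parallelFor i in 0 ..< n:   # data parallelism
#     captures: {buf}
#     buf[i] += 1
#   echo fut.sync()                # await the future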
# ############################################################
# #
# Task parallelism #
# #
# ############################################################
proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
# Create the async function
let fn = funcCall[0]
let fnName = $fn
let withArgs = args.len > 0
let async_fn = ident("async_" & fnName)
var fnCall = newCall(fn)
let data = ident("data") # typed pointer to data
let tpSpawn_closure = ident("ctt_tpSpawnVoidClosure_" & fnName)
var loopFnCall = newCall(fn)
let env = ident("ctt_tpSpawnVoidEnv_") # typed pointer to env
# Schedule
let task = ident"task"
let task = ident"ctt_tpSpawnVoidTask_"
let scheduleBlock = newCall(schedule, workerContext, task)
result = newStmtList()
if funcCall.len == 2:
# With only 1 arg, the tuple syntax doesn't construct a tuple
# let data = (123) # is an int
fnCall.add nnkDerefExpr.newTree(data)
# let env = (123) # is an int
loopFnCall.add nnkDerefExpr.newTree(env)
else: # This handles the 0 arg case as well
for i in 1 ..< funcCall.len:
fnCall.add nnkBracketExpr.newTree(
data,
newLit i-1
)
loopFnCall.add nnkBracketExpr.newTree(
env,
newLit i-1)
# Create the async call
result.add quote do:
proc `async_fn`(param: pointer) {.nimcall.} =
proc `tpSpawn_closure`(env: pointer) {.nimcall, gcsafe, raises: [].} =
when bool(`withArgs`):
let `data` = cast[ptr `argsTy`](param)
`fnCall`
let `env` = cast[ptr `argsTy`](env)
`loopFnCall`
# Create the task
result.add quote do:
block enq_deq_task:
when bool(`withArgs`):
let `task` = Task.new(
let `task` = Task.newSpawn(
parent = `workerContext`.currentTask,
fn = `async_fn`,
params = `args`)
fn = `tpSpawn_closure`,
env = `args`)
else:
let `task` = Task.new(
let `task` = Task.newSpawn(
parent = `workerContext`.currentTask,
fn = `async_fn`)
fn = `tpSpawn_closure`)
`scheduleBlock`
proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
# Create the async function
let fn = funcCall[0]
let fnName = $fn
let async_fn = ident("async_" & fnName)
var fnCall = newCall(fn)
let data = ident("data") # typed pointer to data
# Schedule
let task = ident"task"
let scheduleBlock = newCall(schedule, workerContext, task)
result = newStmtList()
let fn = funcCall[0]
let fnName = $fn
let tpSpawn_closure = ident("ctt_tpSpawnRetClosure_" & fnName)
var loopFnCall = newCall(fn)
let env = ident("ctt_tpSpawnRetEnv_") # typed pointer to env
# tasks have no return value.
# 1. The start of the task `data` buffer will store the return value for the flowvar and awaiter/sync
# 2. We create a wrapper async_fn without return value that send the return value in the channel
# 1. The start of the task `env` buffer will store the return value for the flowvar and awaiter/sync
# 2. We create a wrapper tpSpawn_closure without return value that send the return value in the channel
# 3. We package that wrapper function in a task
# We store the following in task.data:
# We store the following in task.env:
#
# | ptr Task | result | arg₀ | arg₁ | ... | argₙ
let fut = ident"fut"
let taskSelfReference = ident"taskSelfReference"
let retVal = ident"retVal"
let fut = ident"ctt_tpSpawnRetFut_"
let taskSelfReference = ident"ctt_taskSelfReference"
let retVal = ident"ctt_retVal"
var futArgs = nnkPar.newTree
var futArgsTy = nnkPar.newTree
futArgs.add taskSelfReference
futArgsTy.add nnkPtrTy.newTree(bindSym"Task")
futArgs.add retVal
futArgsTy.add retTy
var envParams = nnkPar.newTree
var envParamsTy = nnkPar.newTree
envParams.add taskSelfReference
envParamsTy.add nnkPtrTy.newTree(bindSym"Task")
envParams.add retVal
envParamsTy.add retTy
for i in 1 ..< funcCall.len:
futArgsTy.add getTypeInst(funcCall[i])
futArgs.add funcCall[i]
envParamsTy.add getTypeInst(funcCall[i])
envParams.add funcCall[i]
# data stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
# so arguments starts at data[2] in the wrapping funcCall functions
# env stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
# so arguments starts at env[2] in the wrapping funcCall functions
for i in 1 ..< funcCall.len:
fnCall.add nnkBracketExpr.newTree(
data,
newLit i+1
)
loopFnCall.add nnkBracketExpr.newTree(env, newLit i+1)
result.add quote do:
proc `async_fn`(param: pointer) {.nimcall.} =
let `data` = cast[ptr `futArgsTy`](param)
let res = `fnCall`
readyWith(`data`[0], res)
proc `tpSpawn_closure`(env: pointer) {.nimcall, gcsafe, raises: [].} =
let `env` = cast[ptr `envParamsTy`](env)
let res = `loopFnCall`
readyWith(`env`[0], res)
# Regenerate fresh ident, retTy has been tagged as a function call param
let retTy = ident($retTy)
let task = ident"ctt_tpSpawnRetTask_"
let scheduleBlock = newCall(schedule, workerContext, task)
# Create the task
result.add quote do:
@ -120,11 +136,11 @@ proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, sc
let `taskSelfReference` = cast[ptr Task](0xDEADBEEF)
let `retVal` = default(`retTy`)
let `task` = Task.new(
let `task` = Task.newSpawn(
parent = `workerContext`.currentTask,
fn = `tpSpawn_closure`,
env = `envParams`)
let `fut` = newFlowVar(`retTy`, `task`)
`scheduleBlock`
# Return the future
`fut`
@ -133,8 +149,8 @@ proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule:
funcCall.expectKind(nnkCall)
# Get the return type if any
let retTy = funcCall[0].getImpl[3][0]
let needFuture = retTy.kind != nnkEmpty
# Get a serialized type and data for all function arguments
# We use an ad-hoc tuple
@ -148,7 +164,628 @@ proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule:
if not needFuture:
result = spawnVoid(funcCall, args, argsTy, workerContext, schedule)
else:
result = spawnRet(funcCall, retTy, args, argsTy, workerContext, schedule)
# Wrap in a block for namespacing
result = nnkBlockStmt.newTree(newEmptyNode(), result)
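# Illustrative sketch of the generated code (assumed shape, not the literal macro
# output; `double` is a hypothetical user proc returning `int`):
#
#   let fv = tp.spawn double(21)
#
# expands roughly to:
#
#   proc ctt_tpSpawnRetClosure_double(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     let env = cast[ptr (ptr Task, int, int)](env)  # | ptr Task | result | arg₀ |
#     let res = double(env[2])
#     readyWith(env[0], res)
#
#   let taskSelfReference = cast[ptr Task](0xDEADBEEF) # sentinel, fixed up when the flowvar is created
#   let retVal = default(int)
#   let task = Task.newSpawn(parent = workerContext.currentTask,
#                            fn = ctt_tpSpawnRetClosure_double,
#                            env = (taskSelfReference, retVal, 21))
#   let fv = newFlowVar(int, task)
#   schedule(workerContext, task)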
# ############################################################
# #
# Data parallelism #
# #
# ############################################################
# Error message generation
# --------------------------------------------------------------------------------------------------
# This outputs nice syntax examples for the parallel reduction
# and parallel staged domain specific languages.
type Example = enum
Reduce
Staged
template parReduceExample() {.dirty.}=
# Used for a nice error message
proc parallelReduceExample(n: int): int64 =
tp.parallelFor i in 0 ..< n:
## Declare a parallelFor or parallelForStrided loop as usual
reduceInto(globalSum: int64):
## Indicate that the loop is a reduction and declare the global reduction variable to sync with
prologue:
## Declare your local reduction variable(s) here
## It should be initialized with the neutral element
## corresponding to your fold operation.
## (0 for addition, 1 for multiplication, -Inf for max, +Inf for min, ...)
##
## This is task-local (and thread-local), each task sets this section independently.
## Splitting into multiple tasks is done dynamically at the runtime's discretion
## depending on available parallelism and load.
var localSum = 0'i64
forLoop:
## This is the reduction loop
localSum += i
merge(remoteSum: FlowVar[int64]):
## Define how to merge with partial reductions from remote threads
## Remote threads' results come as Flowvars that need to be synced.
## Latency-hiding techniques can be used to overlap epilogue computations
## with other threads' syncs.
localSum += sync(remoteSum)
epilogue:
## Local task cleanup like memory allocated in prologue
## and returning the local accumulator
return localSum
## Await the parallel reduction
return sync(globalSum)
template parStagedExample() {.dirty.} =
# Used for a nice error message
proc parallelStagedSumExample(n: int): int =
## We will do a sum reduction to illustrate
## staged parallel for
## First take the address of the result
let res = result.addr
## Declare a parallelForStaged loop
tp.parallelForStaged i in 0 ..< n:
captures: {res}
prologue:
## Declare anything needed before the for-loop
## This will be thread-local, so each thread will run this section independently.
## The loop increment is not available here
var localSum = 0
forLoop:
## This is within the parallel loop
localSum += i
epilogue:
## Once the loop is finished, you have a final opportunity for processing.
## Thread-local cleanup should happen here as well
## Here we print the localSum and atomically increment the global sum
## before ending the task.
echo "localsum = ", localSum
res[].atomicInc(localSum)
## Await all tasks
tp.syncAll()
proc printReduceExample() =
let example = getAst(parReduceExample())
echo example.toStrLit()
proc printStagedExample() =
let example = getAst(parStagedExample())
echo example.toStrLit()
proc testKind(nn: NimNode, nnk: NimNodeKind, kind: Example) =
if nn.kind != nnk:
case kind
of Reduce: printReduceExample()
of Staged: printStagedExample()
nn.expectKind(nnk) # Gives nice line numbers
# Parallel Loop Domain Specific Language Descriptor
# --------------------------------------------------------------------------------------------------
type
LoopKind = enum
kForLoop
kReduction
kStaged
LoopDescriptor = object
## A loop descriptor fully describes a parallel loop
## before final code generation
##
## Fields are ordered by depth of the call stack:
## - The user defines the loop boundaries and captures
## - a closure with signature `proc MyFunctionName(env: pointer)`
## is generated
## - it gets packaged in a task
## - on task execution, the inner proc is reconstructed
## - That inner proc may have various sections depending on the loop kind
kind: LoopKind
# Loop bounds
# -----------
indexVariable: NimNode
start: NimNode
stopEx: NimNode
stride: NimNode
# Closure generation
# ------------------
envName: NimNode
closureName: NimNode
closureDef: NimNode
capturedVars: NimNode
capturedTypes: NimNode
# Task packaging and scheduling
# -----------------------------
taskName: NimNode
taskCreation: NimNode
workerContext: NimNode
scheduleFn: NimNode
# Parallel loop stages
# --------------------
# There are 3 call levels for loops:
# - closure(env: pointer) {.nimcall, gcsafe, raises: [].}
# - loopFn(args: ptr (argsTy₀, argsTy₁, ..., argsTyₙ)): returnType {.inline, nimcall, gcsafe, raises: [].}
# let (args₀, args₁, ..., argsₙ) = args[]
# loopTemplate(indexVar, prologue, loopBody, ...)
# - loopTemplate(indexVar, prologue, loopBody, ...: untyped)
#
# The last 2 levels are inline in the closure.
# - The closure deals with removing type erasure from an untyped environment and updating the future once the task is finished
# - The loopFn reinstalls the captured values
# - The loopTemplate reimplements the sections as well as runtime interaction
# for loop splitting checks and merging reduction accumulators with split tasks.
#
# A side-benefit of the loopFn is that it allows borrow-checking:
# - error if we capture a `var parameter`
# - error if we forget to capture a runtime variable (compile-time constants do not have to be captured)
loopFnName: NimNode # inner function called by the closure once environment is retyped
loopTemplate: NimNode # inner function implementation, defined in threadpool.nim
prologue: NimNode
forLoop: NimNode
epilogue: NimNode
# Futures - awaitable loops and reductions
# ----------------------------------------
globalAwaitable: NimNode
remoteTaskAwaitable: NimNode
awaitableType: NimNode
mergeLocalWithRemote: NimNode
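# Illustrative sketch of the three levels (assumed shapes, not the exact
# generated AST), for a loop like:
#   tp.parallelFor i in 0 ..< n:
#     captures: {a, b}
#     body
#
#   proc ctt_tpParForImpl_(env: ptr (typeof(a), typeof(b))) {.nimcall, gcsafe, inline, raises: [].} =
#     let (a, b) = env[]             # reinstall the captured values
#     parallelForWrapper(i):         # loop template defined in threadpool.nim
#       body
#
#   proc ctt_tpParForClosure_(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     ctt_tpParForImpl_(cast[ptr (typeof(a), typeof(b))](env))  # un-erase the environment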
# Parsing parallel loop DSL
# --------------------------------------------------------------------------------------------------
proc checkLoopBounds(loopBounds: NimNode) =
## Checks loop parameters
## --------------------------------------------------------
## loopBounds should have the form "i in 0..<10"
loopBounds.expectKind(nnkInfix)
assert loopBounds[0].eqIdent"in"
loopBounds[1].expectKind(nnkIdent)
loopBounds[2].expectKind(nnkInfix) # 0 ..< 10 / 0 .. 10, for now we don't support slice objects
assert loopBounds[2][0].eqIdent".." or loopBounds[2][0].eqIdent"..<"
proc parseLoopBounds(ld: var LoopDescriptor, loopBounds: NimNode) =
## Extract the index, start and stop of the loop
## Strides must be dealt with separately
let loopBounds = rebuildUntypedAst(loopBounds, dropRootStmtList = true)
checkLoopBounds(loopBounds)
ld.indexVariable = loopBounds[1]
ld.start = loopBounds[2][1]
ld.stopEx = loopBounds[2][2]
# We use exclusive bounds
if loopBounds[2][0].eqIdent"..":
ld.stopEx = newCall(ident"succ", ld.stopEx)
proc parseCaptures(ld: var LoopDescriptor, body: NimNode) =
## Extract captured variables from the for-loop body.
## Once extracted the section that declared those captures will be discarded.
##
## Returns the captured variable and the captured variable types
## in a tuple of nnkPar for easy use in tuple construction and destructuring.
# parallelFor i in 0 ..< 10:
# captures: a
# ...
#
# StmtList
# Call
# Ident "captures"
# StmtList
# Curly
# Ident "a"
# Rest of the body
for i in 0 ..< body.len:
if body[i].kind == nnkCall and body[i][0].eqIdent"captures":
ld.capturedVars = nnkPar.newTree()
ld.capturedTypes = nnkPar.newTree()
body[i][1].expectKind(nnkStmtList)
body[i][1][0].expectKind(nnkCurly)
for j in 0 ..< body[i][1][0].len:
ld.capturedVars.add body[i][1][0][j]
ld.capturedTypes.add newCall(ident"typeof", body[i][1][0][j])
# Remove the captures section
body[i] = nnkDiscardStmt.newTree(body[i].toStrLit)
return
proc extractSection(ldField: var NimNode, body: NimNode, sectionName: string) =
body.expectKind(nnkStmtList)
for i in 0 ..< body.len:
if body[i].kind == nnkCall and body[i][0].eqIdent(sectionName):
body[i][1].expectKind(nnkStmtList)
ldField = body[i][1]
# Remove the section
body[i] = nnkDiscardStmt.newTree(body[i].toStrLit)
return
# Code generation
# --------------------------------------------------------------------------------------------------
proc generateClosure(ld: LoopDescriptor): NimNode =
let env = ld.envName
let capturedTypes = ld.capturedTypes
let withCaptures = ld.capturedTypes.len > 0
let closureName = ld.closureName
var loopFnCall = newCall(ld.loopFnName)
if withCaptures:
loopFnCall.add(env)
case ld.kind
of kForLoop:
result = quote do:
proc `closureName`(env: pointer) {.nimcall, gcsafe, raises: [].} =
when bool(`withCaptures`):
let `env` = cast[ptr `capturedTypes`](env)
`loopFnCall`
of kReduction:
let retTy = ld.awaitableType
result = quote do:
proc `closureName`(env: pointer) {.nimcall, gcsafe, raises: [].} =
let taskSelfReference = cast[ptr ptr Task](env)
when bool(`withCaptures`):
let offset = cast[ByteAddress](env) +% sizeof((ptr Task, `retTy`))
let `env` = cast[ptr `capturedTypes`](offset)
let res = `loopFnCall`
readyWith(taskSelfReference[], res)
else:
error "Not Implemented"
proc generateAndScheduleLoopTask(ld: LoopDescriptor): NimNode =
result = newStmtList()
var withCaptures = false
if not ld.capturedVars.isNil:
withCaptures = true
# TODO: awaitable for loop
# Dependencies
# ---------------------------------------------------
var scheduleBlock: NimNode
let task = ident"ctt_tpLoopTask_"
# TODO: Dataflow parallelism / precise task dependencies
scheduleBlock = newCall(ld.scheduleFn, ld.workerContext, task)
# ---------------------------------------------------
let
(start, stopEx, stride) = (ld.start, ld.stopEx, ld.stride)
workerContext = ld.workerContext
(closureName, capturedVars) = (ld.closureName, ld.capturedVars)
(globalAwaitable, awaitableType) = (ld.globalAwaitable, ld.awaitableType)
if ld.awaitableType.isNil():
result = quote do:
block enq_deq_task: # block for namespacing
let start = `start` # Ensure single evaluation / side-effect
let stopEx = `stopEx`
if stopEx-start != 0:
when bool(`withCaptures`):
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`,
env = `capturedVars`)
else:
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`)
`scheduleBlock`
else:
result = quote do:
var `globalAwaitable`: FlowVar[`awaitableType`]
block enq_deq_task: # Block for namespacing
let start = `start` # Ensure single evaluation / side-effect
let stopEx = `stopEx`
if stopEx-start != 0:
let taskSelfReference = cast[ptr Task](0xDEADBEEF)
var retValBuffer = default(`awaitableType`)
when bool(`withCaptures`):
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`,
env = (taskSelfReference, retValBuffer, `capturedVars`))
else:
let `task` = Task.newLoop(
parent = `workerContext`.currentTask,
start, stopEx, `stride`,
isFirstIter = true,
fn = `closureName`,
env = (taskSelfReference, retValBuffer))
`globalAwaitable` = newFlowVar(`awaitableType`, `task`)
`scheduleBlock`
proc generateParallelLoop(ld: LoopDescriptor): NimNode =
# Package a parallel for loop into a proc
# Returns the statements that implements it.
let pragmas = nnkPragma.newTree(
ident"nimcall", ident"gcsafe", ident"inline",
nnkExprColonExpr.newTree(ident"raises", nnkBracket.newTree())) # raises: []
var params: seq[NimNode]
if ld.awaitableType.isNil:
params.add newEmptyNode()
else:
params.add ld.awaitableType
var procBody = newStmtList()
if ld.capturedVars.len > 0:
params.add newIdentDefs(ld.envName, nnkPtrTy.newTree(ld.capturedTypes))
let derefEnv = nnkBracketExpr.newTree(ld.envName)
if ld.capturedVars.len > 1:
# Unpack the variables captured from the environment
# let (a, b, c) = env[]
var unpacker = nnkVarTuple.newTree()
ld.capturedVars.copyChildrenTo(unpacker)
unpacker.add newEmptyNode()
unpacker.add derefEnv
procBody.add nnkLetSection.newTree(unpacker)
else:
procBody.add newLetStmt(ld.capturedVars[0], derefEnv)
case ld.kind
of kForLoop:
procBody.add newCall(ld.loopTemplate, ld.indexVariable, ld.forLoop)
of kReduction:
procBody.add newCall(
ld.loopTemplate, ld.indexVariable,
ld.prologue, ld.forLoop, ld.mergeLocalWithRemote, ld.epilogue,
ld.remoteTaskAwaitable, ld.awaitableType)
else:
error "Unimplemented"
result = newProc(
name = ld.loopFnName,
params = params,
body = procBody,
pragmas = pragmas)
# Parallel for
# --------------------------------------------------------------------------------------------------
proc parallelForImpl*(workerContext, scheduleFn, loopTemplate, loopBounds, body: NimNode): NimNode =
## Parallel for loop
## Syntax:
##
## parallelFor i in 0 ..< 10:
## echo(i)
##
## Variables from the external scope need to be explicitly captured
##
## var a = 100
## var b = 10
## parallelFor i in 0 ..< 10:
## captures: {a, b}
## echo a + b + i
result = newStmtList()
var ld = LoopDescriptor(kind: kForLoop, workerContext: workerContext, scheduleFn: scheduleFn)
# Parse the loop Domain-Specific Language
# --------------------------------------------------------
body.expectKind(nnkStmtList)
ld.parseLoopBounds(loopBounds)
ld.stride.extractSection(body, "stride")
if ld.stride.isNil:
ld.stride = newLit(1)
ld.parseCaptures(body)
ld.forLoop = body
# Code generation
# --------------------------------------------------------
ld.loopTemplate = loopTemplate
ld.loopFnName = ident("ctt_tpParForImpl_")
ld.envName = ident("ctt_tpParForEnv_")
result.add ld.generateParallelLoop()
ld.closureName = ident("ctt_tpParForClosure_")
result.add ld.generateClosure()
ld.taskName = ident("ctt_tpParForTask_")
result.add ld.generateAndScheduleLoopTask()
# Parallel reductions
# --------------------------------------------------------------------------------------------------
proc parseReductionSection(body: NimNode):
tuple[globalAwaitable, awaitableType, reductionBody: NimNode] =
for i in 0 ..< body.len:
# parallelFor i in 0 .. n:
# reduceInto(globalSum: int64):
# prologue:
# var localSum = 0'i64
#
# StmtList
# Call
# ObjConstr
# Ident "reduceInto"
# ExprColonExpr
# Ident "globalSum"
# Ident "int64"
# StmtList
# Call
# Ident "prologue"
# StmtList
# VarSection
# IdentDefs
# Ident "localSum"
# Empty
# Int64Lit 0
if body[i].kind == nnkCall and
body[i][0].kind == nnkObjConstr and
body[i][0][0].eqident"reduceInto":
body[i][0][1].testKind(nnkExprColonExpr, Reduce)
body[i][1].testKind(nnkStmtList, Reduce)
if body[i][1].len != 4:
printReduceExample()
error "A reduction should have 4 sections named:\n" &
" prologue, forLoop, merge and epilogue statements\n"
# (globalAwaitable, awaitableType, reductionBody)
return (body[i][0][1][0], body[i][0][1][1], body[i][1])
printReduceExample()
error "Missing section \"reduceInto(globalAwaitable: awaitableType):\""
proc extractRemoteTaskMerge(ld: var LoopDescriptor, body: NimNode) =
for i in 0 ..< body.len:
if body[i].kind == nnkCall and
body[i][0].kind == nnkObjConstr and
body[i][0][0].eqident"merge":
body[i][0][1].testKind(nnkExprColonExpr, Reduce)
body[i][1].testKind(nnkStmtList, Reduce)
ld.remoteTaskAwaitable = body[i][0][1][0]
ld.mergeLocalWithRemote = body[i][1]
return
printReduceExample()
error "Missing section \"merge(remoteThreadAccumulator: Flowvar[accumulatorType]):\""
proc parallelReduceImpl*(workerContext, scheduleFn, loopTemplate, loopBounds, body: NimNode): NimNode =
## Parallel reduce loop
## Syntax:
##
## parallelFor i in 0 ..< 100:
## reduceInto(globalSum: int64):
## prologue:
## ## Initialize before the loop
## var localSum = 0
## forLoop:
## ## Compute the partial reductions
## localSum += i
## merge(remoteSum: Flowvar[int64]):
## ## Merge our local reduction with reduction from remote threads
## localSum += sync(remoteSum)
## return localSum
##
## # Await our result
## let sum = sync(globalSum)
##
## The first element from the iterator (i) in the example is not available in the prologue.
## Depending on multithreaded scheduling it may start at 0 or halfway or close to completion.
## The accumulator set in the prologue should be set at the neutral element for your fold operation:
## - 0 for addition, 1 for multiplication, +Inf for min, -Inf for max, ...
##
## In the forLoop section the iterator i is available, the number of iterations is undefined.
## The runtime chooses dynamically how many iterations are done to maximize throughput.
## - This requires your operation to be associative, i.e. (a+b)+c = a+(b+c).
## - It does not require your operation to be commutative (a+b = b+a is not needed).
## - In particular floating-point addition is NOT associative due to rounding errors,
## and results may differ between runs.
## For inputs usually in [-1,1]
## the floating point addition error is within 1e-8 (float32) or 1e-15 (float64).
## For inputs beyond 1e9, please evaluate the acceptable precision.
## Note: the main benefit of "-ffast-math" is that it considers floating-point addition
## associative
##
## In the merge section, a tuple (identifier: Flowvar[MyType]) for a partial reduction from a remote core must be passed.
## The merge section may be executed multiple times if a loop was split between many threads.
## The local partial reduction must be returned.
##
## Variables from the external scope need to be explicitly captured.
## For example, to compute the variance of a seq in parallel
##
## var s = newSeqWith(1000, rand(100.0))
## let mean = mean(s)
##
## let ps = cast[ptr UncheckedArray[float64]](s[0].addr)
##
## parallelFor i in 0 ..< s.len:
## captures: {ps, mean}
## reduceInto(globalVariance: float64):
## prologue:
## var localVariance = 0.0
## forLoop:
## localVariance += (ps[i] - mean)^2
## merge(remoteVariance: Flowvar[float64]):
## localVariance += sync(remoteVariance)
## return localVariance
##
## # Await our result
## let variance = sync(globalVariance)
##
## Performance note:
## For trivial floating points operations like addition/sum reduction:
## before parallelizing reductions on multiple cores
## you might try to parallelize it on a single core by
## creating multiple accumulators (between 2 and 4)
## and unrolling the accumulation loop by that amount.
##
## The compiler is unable to do that (without -ffast-math)
## as floating point addition is NOT associative and changing
## order will change the result due to floating point rounding errors.
##
## The performance improvement is dramatic (2x-3x) as at a low-level
## there is no data dependency between the accumulators and
## the CPU can now use instruction-level parallelism instead
## of suffering from data-dependency latency (3 or 4 cycles)
## https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE&expand=158
## The reduction becomes memory-bound instead of CPU-latency-bound.
result = newStmtList()
var ld = LoopDescriptor(kind: kReduction, workerContext: workerContext, scheduleFn: scheduleFn)
# Parse the loop Domain-Specific Language
# --------------------------------------------------------
body.testKind(nnkStmtList, Reduce)
ld.parseLoopBounds(loopBounds)
ld.stride.extractSection(body, "stride")
if ld.stride.isNil:
ld.stride = newLit(1)
ld.parseCaptures(body)
var reductionBody: NimNode
(ld.globalAwaitable, ld.awaitableType, reductionBody) = parseReductionSection(body)
ld.extractRemoteTaskMerge(reductionBody)
ld.prologue.extractSection(reductionBody, "prologue")
ld.forLoop.extractSection(reductionBody, "forLoop")
ld.epilogue.extractSection(reductionBody, "epilogue")
# Code generation
# --------------------------------------------------------
ld.loopTemplate = loopTemplate
ld.loopFnName = ident("ctt_tpParReduceImpl_")
ld.envName = ident("ctt_tpParReduceEnv_")
result.add ld.generateParallelLoop()
ld.closureName = ident("ctt_tpParReduceClosure_")
result.add ld.generateClosure()
ld.taskName = ident("ctt_tpParReduceTask_")
result.add ld.generateAndScheduleLoopTask()
# ############################################################
# #
# Parallel For Dispatchers #
# #
# ############################################################
proc hasReduceSection*(body: NimNode): bool =
for i in 0 ..< body.len:
if body[i].kind == nnkCall:
for j in 0 ..< body[i].len:
if body[i][j].kind == nnkObjConstr and body[i][j][0].eqIdent"reduceInto":
return true
return false


@ -9,7 +9,7 @@
when not compileOption("threads"):
{.error: "This requires --threads:on compilation flag".}
{.push raises: [], checks: off.}
import
std/[cpuinfo, atomics, macros],
@ -27,8 +27,14 @@ export
# flowvars
Flowvar, isSpawned, isReady, sync
# ############################################################
# #
# Types #
# #
# ############################################################
type
WorkerID = int32
Signal = object
terminate {.align: 64.}: Atomic[bool]
@ -52,26 +58,33 @@ type
# Adaptative theft policy
stealHalf: bool
recentTasks: int32
recentThefts: int32
recentTheftsAdaptative: int32
recentLeaps: int32
Threadpool* = ptr object
barrier: SyncBarrier # Barrier for initialization and teardown
# -- align: 64
globalBackoff: EventCount # Multi-Producer Multi-Consumer backoff
reserveBackoff: EventCount
# -- align: 64
numThreads*{.align: 64.}: int32 # N regular workers + N reserve workers
workerQueues: ptr UncheckedArray[Taskqueue] # size 2N
workers: ptr UncheckedArray[Thread[(Threadpool, WorkerID)]] # size 2N
workerSignals: ptr UncheckedArray[Signal] # size 2N
# -- align: 64
numIdleThreadsAwaitingFutures*{.align: 64.}: Atomic[int32]
# ############################################################
# #
# Workers #
# #
# ############################################################
var workerContext {.threadvar.}: WorkerContext
## Thread-local Worker context
## We assume that a threadpool has exclusive ownership
proc setupWorker() =
## Initialize the thread-local context of a worker
@ -79,7 +92,7 @@ proc setupWorker() =
template ctx: untyped = workerContext
preCondition: not ctx.threadpool.isNil()
preCondition: 0 <= ctx.id and ctx.id < 2*ctx.threadpool.numThreads
preCondition: not ctx.threadpool.workerQueues.isNil()
preCondition: not ctx.threadpool.workerSignals.isNil()
@ -109,7 +122,8 @@ proc teardownWorker() =
workerContext.localBackoff.`=destroy`()
workerContext.taskqueue[].teardown()
proc eventLoopRegular(ctx: var WorkerContext) {.raises:[], gcsafe.}
proc eventLoopReserve(ctx: var WorkerContext) {.raises:[], gcsafe.}
proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises: [].} =
## On the start of the threadpool workers will execute this
@ -128,19 +142,24 @@ proc workerEntryFn(params: tuple[threadpool: Threadpool, id: WorkerID]) {.raises
# 1 matching barrier in Threadpool.new() for root thread
discard params.threadpool.barrier.wait()
{.cast(gcsafe).}: # Compiler does not consider that GC-safe by default when multi-threaded due to thread-local variables
if ctx.id < ctx.threadpool.numThreads:
ctx.eventLoopRegular()
else:
ctx.eventLoopReserve()
debugTermination:
log(">>> Worker %3d shutting down <<<\n", ctx.id)
# 1 matching barrier in threadpool.shutdown() for root thread
discard params.threadpool.barrier.wait()
teardownWorker()
# ############################################################
# #
# Tasks #
# #
# ############################################################
# Sentinel values
const ReadyFuture = cast[ptr EventNotifier](0xCA11AB1E)
@ -150,26 +169,26 @@ proc run*(ctx: var WorkerContext, task: ptr Task) {.raises:[].} =
## Run a task, frees it if it is not owned by a Flowvar
let suspendedTask = workerContext.currentTask
ctx.currentTask = task
debug: log("Worker %3d: running task 0x%.08x (previous: 0x%.08x, %d pending, thiefID %d)\n", ctx.id, task, suspendedTask, ctx.taskqueue[].peek(), task.thiefID)
task.fn(task.env.addr)
debug: log("Worker %3d: completed task 0x%.08x (%d pending)\n", ctx.id, task, ctx.taskqueue[].peek())
ctx.recentTasks += 1
ctx.currentTask = suspendedTask
if not task.hasFuture:
freeHeap(task)
return
# Sync with an awaiting thread in completeFuture that didn't find work
var expected = (ptr EventNotifier)(nil)
if not compareExchange(task.waiter, expected, desired = ReadyFuture, moAcquireRelease):
debug: log("Worker %3d: completed task 0x%.08x, notifying waiter 0x%.08x\n", ctx.id, task, expected)
expected[].notify()
proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.} =
## Schedule a task in the threadpool
## This wakes a sibling thread if our local queue is empty
## or forceWake is true.
debug: log("Worker %3d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
# Instead of notifying every time a task is scheduled, we notify
# only when the worker queue is empty. This is a good approximation
@ -183,10 +202,252 @@ proc schedule(ctx: var WorkerContext, tn: ptr Task, forceWake = false) {.inline.
if forceWake or wasEmpty:
ctx.threadpool.globalBackoff.wake()
# ############################################################
# #
# Parallel-loops load-balancing #
# #
# ############################################################
# Inspired by
# - Lazy binary-splitting: a run-time adaptive work-stealing scheduler.
# Tzannes, A., G. C. Caragea, R. Barua, and U. Vishkin.
# In PPoPP 10, Bangalore, India, January 2010. ACM, pp. 179190.
# https://user.eng.umd.edu/~barua/ppopp164.pdf
# - Embracing Explicit Communication in Work-Stealing Runtime Systems.
# Andreas Prell, 2016
# https://epub.uni-bayreuth.de/id/eprint/2990/
#
# Instead of splitting loops ahead of time depending on the number of cores,
# we split just-in-time depending on idle threads.
# This allows the code to lazily evaluate when it's profitable to split,
# making parallel-for performance portable to any CPU and any inner algorithm
# unlike OpenMP or TBB for example, see design.md for performance unportability benchmark numbers.
# This frees the developer from grain size / work splitting thresholds.
iterator splitUpperRanges(
ctx: WorkerContext, task: ptr Task,
curLoopIndex: int, numIdle: int32
): tuple[start, size: int] =
## Split the iteration range based on the number of idle threads
## returns chunks as (start, size) pairs
##
## - Chunks are balanced, their size differs by at most 1.
## Balanced workloads are scheduled with overhead similar to static scheduling.
## - Split is adaptative, unlike static scheduling or guided scheduling in OpenMP
## it is based on idle workers and not the number of cores.
## If enough parallelism is exposed, for example due to nested parallelism,
## there is no splitting overhead.
##
## - Updates the current task loopStop with the lower range
#
# Unbalanced example:
# Splitting 40 iterations on 12 threads
# A simple chunking algorithm with division + remainder
# will lead to a base chunk size of 40/12 = 3.
# 3*11 = 33, so last thread will do 7 iterations, more than twice the work.
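# The splitting scheme below avoids this: baseChunkSize = 40 div 12 = 3 and
# cutoff = 40 mod 12 = 4, so the first 4 chunks get 4 iterations and the
# remaining 8 get 3. Chunk sizes differ by at most 1.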
#
# Note: Each division costs 55 cycles, 55x more than addition/subtraction/bit operations
# and they can't use instruction-level parallelism.
# When dealing with loop ranges and strides we need to carefully craft our code to
# only use division where unavoidable: dividing by the number of idle threads.
# Loop metadata should allow us to avoid loop-bounds-related divisions completely.
preCondition: task.loopStepsLeft > 1
preCondition: curLoopIndex mod task.loopStride == 0
debugSplit:
log("Worker %3d: task 0x%.08x - %8d step(s) left (current: %3d, start: %3d, stop: %3d, stride: %3d, %3d idle worker(s))\n",
ctx.id, task, task.loopStepsLeft, curLoopIndex, task.loopStart, task.loopStop, task.loopStride, numIdle)
# Send a chunk of work to all idle workers + ourselves
let availableWorkers = numIdle + 1
let baseChunkSize = task.loopStepsLeft div availableWorkers
let cutoff = task.loopStepsLeft mod availableWorkers
block: # chunkID 0 is ours! My precious!!!
task.loopStepsLeft = baseChunkSize + int(0 < cutoff)
task.loopStop = min(task.loopStop, curLoopIndex + task.loopStepsLeft*task.loopStride)
debugSplit:
log("Worker %3d: task 0x%.08x - %8d step(s) kept locally (current: %3d, start: %3d, stop: %3d, stride: %3d)\n",
ctx.id, task, task.loopStepsLeft, curLoopIndex, task.loopStart, task.loopStop, task.loopStride)
for chunkID in 1 ..< availableWorkers:
# As the iterator callsite is copy-pasted, we want a single yield point.
var chunkSize = baseChunkSize
var offset = curLoopIndex
if chunkID < cutoff:
chunkSize += 1
offset += task.loopStride*chunkSize*chunkID
else:
offset += task.loopStride*(baseChunkSize*chunkID + cutoff)
yield (offset, chunkSize)
type BalancerBackoff = object
## We want to dynamically split parallel loops depending on the number of idle threads.
## However checking an atomic variable require synchronization which at the very least means
## reloading its value in all caches, a guaranteed cache miss. In a tight loop,
## this might be a significant cost, especially given that memory is often the bottleneck.
##
## There is no synchronization possible with thieves, unlike in Prell's PhD thesis.
## We want to avoid the worst-case scenario in Tzannes' paper, a tight loop with too many available cores,
## so the producer deque is always empty, leading to it spending all its CPU time splitting loops.
## For this we split depending on the number of idle CPUs. This also prevents splitting unnecessarily.
##
## Tzannes et al mention that checking the thread's own deque emptiness is a good approximation of system load
## with low overhead except in very fine-grained parallelism.
## With a better approximation, by checking the number of idle threads we can instead
## directly do the correct number of splits or avoid splitting. But this check is costly.
##
## To minimize checking cost while keeping latency low, even in bursty cases,
## we use log-log iterated backoff.
## - Adversarial Contention Resolution for Simple Channels
## Bender, Farach-Colton, He, Kuszmaul, Leiserson, 2005
## https://people.csail.mit.edu/bradley/papers/BenderFaHe05.pdf
nextCheck: int
windowLogSize: uint32 # while loopIndex < lastCheck + 2^windowLogSize, don't recheck.
round: uint32 # windowLogSize += 1 after log2(windowLogSize) rounds
func increase(backoff: var BalancerBackoff) {.inline.} =
# On failure, we use log-log iterated backoff, an optimal backoff strategy
# suitable for bursts and adversarial conditions.
backoff.round += 1
if backoff.round >= log2_vartime(backoff.windowLogSize):
backoff.round = 0
backoff.windowLogSize += 1
func decrease(backoff: var BalancerBackoff) {.inline.} =
# On success, we exponentially reduce the check window.
# Note: the thieves will start contributing as well.
# windowLogSize is unsigned: guard before decrementing instead of checking for < 0 afterwards.
backoff.round = 0
if backoff.windowLogSize > 0:
backoff.windowLogSize -= 1
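# Worked example (illustrative numbers): with stride 1 and windowLogSize = 3,
# the next check happens 2^3 = 8 iterations later (nextCheck += stride shl windowLogSize).
# A successful split shrinks the window to 4 iterations, while repeated failed
# checks eventually grow it to 16, bounding both polling cost and split latency.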
proc splitAndDispatchLoop(ctx: var WorkerContext, task: ptr Task, curLoopIndex: int, numIdle: int32) =
# The iterator mutates the task with the first chunk metadata
let stop = task.loopStop
for (offset, numSteps) in ctx.splitUpperRanges(task, curLoopIndex, numIdle):
if numSteps == 0:
break
let upperSplit = allocHeapUnchecked(Task, sizeof(Task) + task.envSize)
copyMem(upperSplit, task, sizeof(Task) + task.envSize)
upperSplit.parent = task
upperSplit.thiefID.store(SentinelThief, moRelaxed)
upperSplit.waiter.store(nil, moRelaxed)
upperSplit.isFirstIter = false
upperSplit.loopStart = offset
upperSplit.loopStop = min(stop, offset + numSteps*upperSplit.loopStride)
upperSplit.loopStepsLeft = numSteps
if upperSplit.hasFuture:
# Update self-reference
cast[ptr ptr Task](upperSplit.env.addr)[] = upperSplit
# Create a private task-local linked-list of awaited tasks
task.reductionDAG = newReductionDagNode(task = upperSplit, next = task.reductionDAG)
upperSplit.reductionDAG = nil
debugSplit:
log("Worker %3d: task 0x%.08x - %8d step(s) sent in task 0x%.08x (start: %3d, stop: %3d, stride: %3d)\n",
ctx.id, task, upperSplit.loopStepsLeft, upperSplit, upperSplit.loopStart, upperSplit.loopStop, upperSplit.loopStride)
ctx.taskqueue[].push(upperSplit)
ctx.threadpool.globalBackoff.wakeAll()
proc loadBalanceLoop(ctx: var WorkerContext, task: ptr Task, curLoopIndex: int, backoff: var BalancerBackoff) =
## Split a parallel loop when necessary
# We might want to make this inline to cheapen the first check
# but it is 10% faster not inline on the transpose benchmark (memory-bandwidth bound)
if task.loopStepsLeft > 1 and curLoopIndex == backoff.nextCheck:
if ctx.taskqueue[].peek() == 0:
let waiters = ctx.threadpool.globalBackoff.getNumWaiters()
# We assume that the worker that scheduled the task will work on it. I.e. idleness is underestimated.
let numIdle = waiters.preSleep + waiters.committedSleep + int32(task.isFirstIter)
if numIdle > 0:
ctx.splitAndDispatchLoop(task, curLoopIndex, numIdle)
backoff.decrease()
else:
backoff.increase()
else:
backoff.increase()
backoff.nextCheck += task.loopStride shl backoff.windowLogSize
template parallelForWrapper(idx: untyped{ident}, loopBody: untyped): untyped =
## To be called within a loop task
## Gets the loop bounds and iterates over them
## Also polls runtime status for dynamic loop splitting
##
## Loop prologue, epilogue,
## remoteAccum, resultTy and returnStmt
## are unused
block:
let this = workerContext.currentTask
var backoff = BalancerBackoff(
nextCheck: this.loopStart,
windowLogSize: 0,
round: 0)
if not this.isFirstIter:
# Task was just stolen, no need to check runtime status, do one loop iteration first
backoff.nextCheck += this.loopStride
var idx = this.loopStart
while idx < this.loopStop:
loadBalanceLoop(workerContext, this, idx, backoff)
loopBody
idx += this.loopStride
this.loopStepsLeft -= 1
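# Illustrative trace (assumed numbers): a first-iteration task covering 0 ..< 1000
# with stride 1 starts with nextCheck = 0. At idx 0 the worker polls for idle
# threads, keeps the lowest chunk for itself and pushes the upper ranges, then
# streams through its remaining iterations, re-polling only when idx reaches the
# backed-off nextCheck.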
template parallelReduceWrapper(
idx: untyped{ident},
prologue, loopBody, mergeLocalWithRemote, epilogue,
remoteTaskAwaitable, awaitableType: untyped): untyped =
## To be called within a loop task
## Gets the loop bounds and iterates over them
## Also polls runtime status for dynamic loop splitting
block:
let this = workerContext.currentTask
var backoff = BalancerBackoff(
nextCheck: this.loopStart,
windowLogSize: 0,
round: 0
)
if not this.isFirstIter:
# Task was just stolen, no need to check runtime status, do one loop iteration first
backoff.nextCheck += this.loopStride
prologue
block: # loop body
var idx = this.loopStart
while idx < this.loopStop:
loadBalanceLoop(workerContext, this, idx, backoff)
loopBody
idx += this.loopStride
this.loopStepsLeft -= 1
block: # Merging with flowvars from remote threads
while not this.reductionDAG.isNil:
let reductionDagNode = this.reductionDAG
let remoteTaskAwaitable = cast[Flowvar[awaitableType]](reductionDagNode.task)
this.reductionDAG = reductionDagNode.next
mergeLocalWithRemote
# In `merge` there should be a sync which frees `reductionDagNode.task`
freeHeap(reductionDagNode)
epilogue
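# Illustrative merge order (assuming the loop was split twice): the task-local
# reductionDAG then holds two nodes, one per upper split, prepended in order of
# creation. The merge block syncs each split's Flowvar in LIFO order, frees the
# DAG node, and only then runs the epilogue on the fully merged local accumulator.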
# ############################################################
# #
# Scheduler #
# #
# ############################################################
iterator pseudoRandomPermutation(randomSeed: uint32, maxExclusive: int32): int32 =
## Create a (low-quality) pseudo-random permutation from [0, max)
# Design considerations and randomness constraint for work-stealing, see docs/random_permutations.md
#
@ -216,6 +477,7 @@ iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
# - a != 1, so we now have a multiplicative factor, which makes output more "random looking".
# n and (m-1) <=> n mod m, if m is a power of 2
let maxExclusive = cast[uint32](maxExclusive)
let M = maxExclusive.nextPowerOfTwo_vartime()
let c = (randomSeed and ((M shr 1) - 1)) * 2 + 1 # c odd and c ∈ [0, M)
let a = (randomSeed and ((M shr 2) - 1)) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4
@ -226,7 +488,7 @@ iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
var x = start
while true:
if x < maxExclusive:
yield cast[int32](x)
x = (a*x + c) and mask # ax + c (mod M), with M power of 2
if x == start:
break
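# Worked example (illustrative): maxExclusive = 6 gives M = 8; an odd c and
# a ≡ 1 (mod 4) satisfy the Hull-Dobell theorem, so x -> a*x + c (mod 8) visits
# all 8 residues exactly once per cycle. Values >= 6 are skipped, hence every
# ID in [0, 6) is yielded exactly once, in a seed-dependent order.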
@ -234,7 +496,7 @@ iterator pseudoRandomPermutation(randomSeed, maxExclusive: uint32): uint32 =
proc tryStealOne(ctx: var WorkerContext): ptr Task =
## Try to steal a task.
let seed = ctx.rng.next().uint32
for targetId in seed.pseudoRandomPermutation(2*ctx.threadpool.numThreads):
if targetId == ctx.id:
continue
@ -279,7 +541,7 @@ proc tryStealAdaptative(ctx: var WorkerContext): ptr Task =
# ctx.updateStealStrategy()
let seed = ctx.rng.next().uint32
for targetId in seed.pseudoRandomPermutation(2*ctx.threadpool.numThreads):
if targetId == ctx.id:
continue
@ -308,14 +570,14 @@ proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
var thiefID = SentinelThief
while true:
debug: log("Worker %3d: leapfrogging - waiting for thief of task 0x%.08x to publish their ID\n", ctx.id, awaitedTask)
thiefID = awaitedTask.thiefID.load(moAcquire)
if thiefID != SentinelThief:
break
cpuRelax()
ascertain: 0 <= thiefID and thiefID < 2*ctx.threadpool.numThreads
# Leapfrogging is used when completing a future, so steal only one task
# and don't leave tasks stranded in our queue.
let leapTask = ctx.id.steal(ctx.threadpool.workerQueues[thiefID])
if not leapTask.isNil():
@ -325,37 +587,100 @@ proc tryLeapfrog(ctx: var WorkerContext, awaitedTask: ptr Task): ptr Task =
return leapTask
return nil
proc eventLoopRegular(ctx: var WorkerContext) {.raises:[], gcsafe.} =
## Each worker thread executes this loop over and over.
while true:
# 1. Pick from local queue
debug: log("Worker %3d: eventLoopRegular 1 - searching task from local queue\n", ctx.id)
while (var task = ctx.taskqueue[].pop(); not task.isNil):
debug: log("Worker %3d: eventLoopRegular 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
# 2. Run out of tasks, become a thief
debug: log("Worker %3d: eventLoopRegular 2 - becoming a thief\n", ctx.id)
let ticket = ctx.threadpool.globalBackoff.sleepy()
if (var stolenTask = ctx.tryStealAdaptative(); not stolenTask.isNil):
# We manage to steal a task, cancel sleep
ctx.threadpool.globalBackoff.cancelSleep()
# 2.a Run task
debug: log("Worker %3d: eventLoopRegular 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
ctx.run(stolenTask)
elif ctx.signal.terminate.load(moAcquire):
# 2.b Threadpool has no more tasks and we were signaled to terminate
ctx.threadpool.globalBackoff.cancelSleep()
debugTermination: log("Worker %3d: eventLoopRegular 2.b - terminated\n", ctx.id)
break
else:
# 2.c Park the thread until a new task enters the threadpool
debug: log("Worker %3d: eventLoopRegular 2.c - sleeping\n", ctx.id)
ctx.threadpool.globalBackoff.sleep(ticket)
debug: log("Worker %3d: eventLoopRegular 2.c - waking\n", ctx.id)
# Sync
# ---------------------------------------------
proc eventLoopReserve(ctx: var WorkerContext) {.raises:[], gcsafe.} =
## A reserve worker is a relay when a thread is stuck awaiting a future completion.
## This ensures those threads are available as soon as the future completes, minimizing latency
## while ensuring the runtime uses all available hardware resources, maximizing throughput.
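##
## Illustrative scenario (assuming a 4-core machine, i.e. 4 regular + 4 reserve workers):
## when a regular worker blocks in completeFuture it increments
## numIdleThreadsAwaitingFutures and wakes a reservist, keeping about 4 threads
## runnable. Once the future completes, the surplus reservist observes
## numActiveReservists > numIdleThreadsAwaitingFutures and goes back to sleep.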
template reserveSleepCheck: untyped =
let ticket = ctx.threadpool.reserveBackoff.sleepy()
let (reservePlanningSleep, reserveCommittedSleep) = ctx.threadpool.reserveBackoff.getNumWaiters()
let numActiveReservists = ctx.threadpool.numThreads - (reservePlanningSleep-1 + reserveCommittedSleep) # -1 we don't want to count ourselves
if ctx.signal.terminate.load(moAcquire): # If terminated, we leave everything as-is, the regular workers will finish
ctx.threadpool.reserveBackoff.cancelSleep()
debugTermination: log("Worker %3d: reserveSleepCheck - terminated\n", ctx.id)
return
elif numActiveReservists > ctx.threadpool.numIdleThreadsAwaitingFutures.load(moAcquire):
ctx.threadpool.globalBackoff.wake() # In case we were just woken up for a task or we have tasks in our queue, pass the torch
debug: log("Worker %3d: reserveSleepCheck - going to sleep on reserve backoff\n", ctx.id)
ctx.threadpool.reserveBackoff.sleep(ticket)
debug: log("Worker %3d: reserveSleepCheck - waking on reserve backoff\n", ctx.id)
else:
ctx.threadpool.reserveBackoff.cancelSleep()
while true:
# 1. Pick from local queue
debug: log("Worker %3d: eventLoopReserve 1 - searching task from local queue\n", ctx.id)
while true:
reserveSleepCheck()
var task = ctx.taskqueue[].pop()
if task.isNil():
break
debug: log("Worker %3d: eventLoopReserve 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
# 2. Run out of tasks, become a thief
debug: log("Worker %3d: eventLoopReserve 2 - becoming a thief\n", ctx.id)
let ticket = ctx.threadpool.globalBackoff.sleepy() # If using a reserve worker was necessary, sleep on the backoff for active threads
if (var stolenTask = ctx.tryStealAdaptative(); not stolenTask.isNil):
# We manage to steal a task, cancel sleep
ctx.threadpool.globalBackoff.cancelSleep()
# 2.a Run task
debug: log("Worker %3d: eventLoopReserve 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
ctx.run(stolenTask)
elif ctx.signal.terminate.load(moAcquire):
# 2.b Threadpool has no more tasks and we were signaled to terminate
ctx.threadpool.globalBackoff.cancelSleep()
debugTermination: log("Worker %3d: eventLoopReserve 2.b - terminated\n", ctx.id)
break
else:
# 2.c Park the thread until a new task enters the threadpool.
# It is intentionally parked with all active threads as long as a reservist is needed
let (reservePlanningSleep, reserveCommittedSleep) = ctx.threadpool.reserveBackoff.getNumWaiters()
let numActiveReservists = ctx.threadpool.numThreads - (reservePlanningSleep-1 + reserveCommittedSleep) # -1 we don't want to count ourselves
if numActiveReservists > ctx.threadpool.numIdleThreadsAwaitingFutures.load(moAcquire):
ctx.threadpool.globalBackoff.cancelSleep()
continue
debug: log("Worker %3d: eventLoopReserve 2.c - sleeping on active threads backoff\n", ctx.id)
ctx.threadpool.globalBackoff.sleep(ticket)
debug: log("Worker %3d: eventLoopReserve 2.c - waking on active threads backoff\n", ctx.id)
# ############################################################
# #
# Futures & Synchronization #
# #
# ############################################################
template isRootTask(task: ptr Task): bool =
task == RootTask
@ -367,7 +692,7 @@ proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[].} =
template isFutReady(): untyped =
let isReady = fv.task.completed.load(moAcquire)
if isReady:
parentResult = cast[ptr (ptr Task, T)](fv.task.env.addr)[1]
isReady
if isFutReady():
@ -375,55 +700,55 @@ proc completeFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[].} =
## 1. Process all the children of the current tasks.
## This ensures that we can give control back ASAP.
debug: log("Worker %3d: sync 1 - searching task from local queue\n", ctx.id)
while (let task = ctx.taskqueue[].pop(); not task.isNil):
if task.parent != ctx.currentTask:
debug: log("Worker %3d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.schedule(task, forceWake = true) # reschedule task and wake a sibling to take it over.
break
debug: log("Worker %3d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
if isFutReady():
debug: log("Worker %3d: sync 1 - future ready, exiting\n", ctx.id)
return
## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
## So the task is bottlenecked by dependencies in other threads,
## hence we abandon our enqueued work and steal.
##
## See also
## - Proactive work-stealing for futures
## Kyle Singer, Yifan Xu, I-Ting Angelina Lee, 2019
## https://dl.acm.org/doi/10.1145/3293883.3295735
debug: log("Worker %3d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
while not isFutReady():
if (let leapTask = ctx.tryLeapfrog(fv.task); not leapTask.isNil):
# Leapfrogging, the thief had an empty queue, hence if there are tasks in its queue, it's generated by our blocked task.
# Help the thief clear those, as if it did not finish, it's likely blocked on those child tasks.
debug: log("Worker %3d: sync 2.1 - leapfrog task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, leapTask, leapTask.parent, ctx.currentTask)
ctx.run(leapTask)
else:
# At this point, we have significant design decisions:
# - Do we steal from other workers in hope we advance our awaited task?
# - Do we advance our own queue for tasks that are not child of our awaited tasks?
# - Do we park instead of working on unrelated tasks? With hyperthreading, the core would actually still be busy enough.
#
# - If we work, we maximize throughput, but we increase latency to handle the future's continuation.
# If that future creates more parallel work, we would actually have restricted parallelism.
# - If we park, we minimize latency, but we don't use the full hardware resources, and there are CPUs without hyperthreading (on ARM for example)
# Furthermore, a work-stealing scheduler is within 2x an optimal scheduler if it is greedy, i.e., as long as there is enough work, all cores are used.
#
# The solution chosen is to wake a reserve thread, keeping offered hardware/throughput constant, and to put the awaiting thread to sleep.
ctx.localBackoff.prepareToPark()
discard ctx.threadpool.numIdleThreadsAwaitingFutures.fetchAdd(1, moRelease)
ctx.threadpool.reserveBackoff.wake()
var expected = (ptr EventNotifier)(nil)
if compareExchange(fv.task.waiter, expected, desired = ctx.localBackoff.addr, moAcquireRelease):
ctx.localBackoff.park()
discard ctx.threadpool.numIdleThreadsAwaitingFutures.fetchSub(1, moRelease)
proc syncAll*(tp: Threadpool) {.raises: [].} =
## Blocks until all pending tasks are completed
## This MUST only be called from
@ -431,64 +756,76 @@ proc syncAll*(tp: Threadpool) {.raises: [].} =
template ctx: untyped = workerContext
debugTermination:
log(">>> Worker %3d enters barrier <<<\n", ctx.id)
preCondition: ctx.id == 0
preCondition: ctx.currentTask.isRootTask()
# Empty all tasks
tp.globalBackoff.wakeAll()
while true:
# 1. Empty local tasks
debug: log("Worker %3d: syncAll 1 - searching task from local queue\n", ctx.id)
while (let task = ctx.taskqueue[].pop(); not task.isNil):
debug: log("Worker %3d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, task, task.parent, ctx.currentTask)
ctx.run(task)
if tp.numThreads == 1:
break
# 2. Help other threads
debugTermination:
let regular = tp.globalBackoff.getNumWaiters()
let reserve = tp.reserveBackoff.getNumWaiters()
log("Worker %3d: syncAll 2 - becoming a thief - (preSleep: %d, sleeping %d) regular and (preSleep: %d, sleeping %d) reserve workers\n",
ctx.id, regular.preSleep, regular.committedSleep, reserve.preSleep, reserve.committedSleep)
if (var stolenTask = ctx.tryStealAdaptative(); not stolenTask.isNil):
# 2.a We stole some task
debug: log("Worker %3d: syncAll 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
ctx.run(stolenTask)
elif tp.reserveBackoff.getNumWaiters() == (0'i32, tp.numThreads) and
tp.globalBackoff.getNumWaiters() == (0'i32, tp.numThreads-1): # Don't count ourselves
# 2.b all threads besides the current are parked
debugTermination: log("Worker %3d: syncAll 2.b - termination, all other threads sleeping\n", ctx.id)
break
else:
# 2.c We don't park as there is no notif for task completion
cpuRelax()
debugTermination:
log(">>> Worker %3d leaves barrier <<<\n", ctx.id)
# ############################################################
# #
# Runtime API #
# #
# ############################################################
proc new*(T: type Threadpool, numThreads = countProcessors()): T {.raises: [ResourceExhaustedError].} =
## Initialize a threadpool that manages `numThreads` threads.
## Default to the number of logical processors available.
##
## A Constantine threadpool cannot be instantiated
## on a thread managed by another Constantine threadpool,
## including the root thread.
##
## Mixing with other libraries' threadpools and runtime
## will not impact correctness but may impact performance.
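##
## Illustrative usage sketch (`fib` is a hypothetical user proc):
##
##   var tp = Threadpool.new()    # 2*countProcessors() threads: N regular + N reserve
##   let fut = tp.spawn fib(30)
##   echo sync(fut)
##   tp.shutdown()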
type TpObj = typeof(default(Threadpool)[]) # due to C import, we need a dynamic sizeof
var tp = allocHeapUncheckedAlignedPtr(Threadpool, sizeof(TpObj), alignment = 64)
tp.barrier.init(2*numThreads.uint32)
tp.globalBackoff.initialize()
tp.reserveBackoff.initialize()
tp.numThreads = numThreads.int32
tp.numIdleThreadsAwaitingFutures.store(0, moRelaxed)
# Allocate for `numThreads` regular workers and `numThreads` reserve workers
tp.workerQueues = allocHeapArrayAligned(Taskqueue, 2*numThreads, alignment = 64)
tp.workers = allocHeapArrayAligned(Thread[(Threadpool, WorkerID)], 2*numThreads, alignment = 64)
tp.workerSignals = allocHeapArrayAligned(Signal, 2*numThreads, alignment = 64)
# Setup master thread
workerContext.id = 0
workerContext.threadpool = tp
# Start worker threads
for i in 1 ..< 2*numThreads:
createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
# Root worker
@ -505,7 +842,7 @@ proc cleanup(tp: var Threadpool) {.raises: [].} =
## Cleanup all resources allocated by the threadpool
preCondition: workerContext.currentTask.isRootTask()
for i in 1 ..< 2*tp.numThreads:
joinThread(tp.workers[i])
tp.workerSignals.freeHeapAligned()
@ -522,10 +859,11 @@ proc shutdown*(tp: var Threadpool) {.raises:[].} =
tp.syncAll()
# Signal termination to all threads
for i in 0 ..< 2*tp.numThreads:
tp.workerSignals[i].terminate.store(true, moRelease)
tp.globalBackoff.wakeAll()
tp.reserveBackoff.wakeAll()
# 1 matching barrier in workerEntryFn
discard tp.barrier.wait()
@ -538,6 +876,18 @@ proc shutdown*(tp: var Threadpool) {.raises:[].} =
{.pop.} # raises:[]
# ############################################################
# #
# Parallel API #
# #
# ############################################################
proc getThreadID(tp: Threadpool): int {.inline, used.} =
## Returns the worker local ID.
## This is a debug proc for logging purposes
## The threadpool needs to be imported with {.all.} pragma
workerContext.id
# Task parallel API
# ---------------------------------------------
@ -551,3 +901,44 @@ macro spawn*(tp: Threadpool, fnCall: typed): untyped =
##
## Tasks are processed approximately in Last-In-First-Out (LIFO) order
result = spawnImpl(tp, fnCall, bindSym"workerContext", bindSym"schedule")
# Data parallel API
# ---------------------------------------------
# TODO: we can fuse parallelFor and parallelForStrided
# in a single macro once {.experimental: "flexibleOptionalParams".}
# is no longer experimental
macro parallelFor*(tp: Threadpool, loopParams: untyped, body: untyped): untyped =
## Parallel for loop.
## Syntax:
##
## tp.parallelFor i in 0 ..< 10:
## echo(i)
##
## Variables from the external scope need to be explicitly captured:
##
## var a = 100
## var b = 10
## tp.parallelFor i in 0 ..< 10:
## captures: {a, b}
## echo a + b + i
##
result = newStmtList()
result.add quote do:
# Avoid integer overflow checks in the tight loop
# and ensure no exceptions are raised in this code.
{.push checks:off.}
if body.hasReduceSection():
result.add parallelReduceImpl(
bindSym"workerContext", bindSym"schedule",
bindSym"parallelReduceWrapper",
loopParams, body)
else:
result.add parallelForImpl(
bindSym"workerContext", bindSym"schedule",
bindSym"parallelForWrapper",
loopParams, body)
result.add quote do:
{.pop.}
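The hasReduceSection branch routes loops with a reduction into parallelReduceImpl. A sketch of such a loop; the section names below follow the Weave-style reduction DSL this threadpool descends from and should be treated as assumptions:

var waitableSum: Flowvar[int64]

tp.parallelFor i in 0 ..< 1000:
  reduceInto(waitableSum: int64):
    prologue:
      var localSum = 0'i64
    forLoop:
      localSum += int64(i)
    merge(remoteSum: Flowvar[int64]):
      localSum += sync(remoteSum)
    epilogue:
      return localSum

echo sync(waitableSum) # 0 + 1 + ... + 999 == 499500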

View File

@ -247,7 +247,7 @@ func random_long01Seq(rng: var RngState, a: var BigInt) =
## Initialize a bigint
## It is skewed towards producing strings of 1111... and 0000
## to trigger edge cases
var buf: array[(a.bits + 7) div 8, byte]
var buf: array[a.bits.ceilDiv_vartime(8), byte]
rng.random_long01Seq(buf)
let order = rng.sample_unsafe([bigEndian, littleEndian])
if order == bigEndian:
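This and the following hunks all replace the open-coded (bits + 7) div 8 with ceilDiv_vartime, a ceiling division whose timing may depend on its operands (hence _vartime). The underlying identity, ceil(a/b) == (a + b - 1) div b for positive integers, in two worked instances:

doAssert (381 + 7) div 8 == 48 # a 381-bit BLS12-381 element fits in 48 bytes
doAssert (256 + 7) div 8 == 32 # a 256-bit integer fits in 32 bytes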

View File

@ -12,7 +12,8 @@ import
std/[unittest, times],
# Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir],
../../constantine/math/config/[curves, type_bigint],
../../constantine/platforms/static_for,
../../constantine/math/config/curves,
../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia,
@ -34,8 +35,7 @@ proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Curve, wordS
fpBits = uint32 curve.getCurveBitwidth(),
fpMod = curve.Mod().toHex(),
frBits = uint32 curve.getCurveOrderBitwidth(),
frMod = curve.getCurveOrder().toHex()
)
frMod = curve.getCurveOrder().toHex())
proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
let fpAdd = asy.field_add_gen(cm, fp)

View File

@ -107,8 +107,8 @@ proc main() =
#########################################################
# Conversion to GMP
const aLen = (aBits + 7) div 8
const mLen = (mBits + 7) div 8
const aLen = aBits.ceilDiv_vartime(8)
const mLen = mBits.ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var mBuf: array[mLen, byte]

View File

@ -78,8 +78,8 @@ proc main() =
#########################################################
# Conversion to GMP
const aLen = (aBits + 7) div 8
const bLen = (bBits + 7) div 8
const aLen = aBits.ceilDiv_vartime(8)
const bLen = bBits.ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var bBuf: array[bLen, byte]

View File

@ -71,8 +71,8 @@ proc main() =
#########################################################
# Conversion to GMP
const aLen = (aBits + 7) div 8
const bLen = (bBits + 7) div 8
const aLen = aBits.ceilDiv_vartime(8)
const bLen = bBits.ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var bBuf: array[bLen, byte]

View File

@ -62,8 +62,8 @@ proc binary_prologue[C: static Curve, N: static int](
#########################################################
# Conversion to GMP
const aLen = (C.getCurveBitwidth() + 7) div 8
const bLen = (C.getCurveBitwidth() + 7) div 8
const aLen = C.getCurveBitwidth().ceilDiv_vartime(8)
const bLen = C.getCurveBitwidth().ceilDiv_vartime(8)
var aBuf: array[aLen, byte]
var bBuf: array[bLen, byte]
@ -118,7 +118,7 @@ proc addTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]
@ -141,7 +141,7 @@ proc subTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]
@ -169,7 +169,7 @@ proc mulTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]
@ -193,7 +193,7 @@ proc invTests(rng: var RngState, a, b, p, r: var mpz_t, C: static Curve) =
const
bits = C.getCurveBitwidth()
bufLen = (bits + 7) div 8
bufLen = bits.ceilDiv_vartime(8)
var
aTest, bTest{.noInit.}: Fp[C]
aBuf, bBuf: array[bufLen, byte]

View File

@ -42,7 +42,7 @@ proc testRoundtrip(curve: static Curve, gen: static RandomGen) =
const bits = curve.getCurveBitwidth()
const Excess = 2
const UnsatBitwidth = WordBitWidth - Excess
const N = (bits + UnsatBitwidth-1) div UnsatBitwidth
const N = bits.ceilDiv_vartime(UnsatBitwidth)
let a = rng.random_bigint(curve, gen)
var u: LimbsUnsaturated[N, Excess]
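To make the constants concrete, assuming WordBitWidth = 64: with Excess = 2, each unsaturated limb carries UnsatBitwidth = 62 data bits, so a 381-bit value needs N = ceilDiv(381, 62) limbs:

doAssert (381 + 61) div 62 == 7 # 7 limbs x 62 bits = 434 bits >= 381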