# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  std/macros,
  ./crossthread/tasks_flowvars,
  ../ast_rebuilder

# Parallel offloading API
# -----------------------

# This file implements all the macros necessary
# to provide a comprehensive and hopefully intuitive API
# for all the parallelism paradigms supported:
#
# - Task parallelism
# - Data parallelism / parallel for
#   - parallel-for with thread-local prologue and epilogue
#   - parallel-reduction without atomics or locks
# - Dataflow parallelism
#   - also known as:
#     - Graph parallelism
#     - Stream parallelism
#     - Pipeline parallelism
#     - Data-driven (task) parallelism
#   with precise input/output dependencies

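# A minimal surface-syntax sketch of the APIs whose macros live in this file
# (illustrative only; it assumes a `tp: Threadpool` instantiated from
# threadpool.nim, and a user proc `f` and pointer `myPtr`, none of which are
# defined here):
#
#   proc f(x: int): int = 2*x
#
#   let fut = tp.spawn f(21)        # task parallelism, returns a Flowvar[int]
#   tp.parallelFor i in 0 ..< 100:  # data parallelism
#     captures: {myPtr}
#     myPtr[i] += 1
#   echo sync(fut)                  # await the spawned task
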
# ############################################################
#                                                            #
#                      Task parallelism                      #
#                                                            #
# ############################################################

proc spawnVoid(funcCall: NimNode, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
  # Create the async function
  let fn = funcCall[0]
  let fnName = $fn
  let withArgs = args.len > 0
  let tpSpawn_closure = ident("ctt_tpSpawnVoidClosure_" & fnName)
  var loopFnCall = newCall(fn)
  let env = ident("ctt_tpSpawnVoidEnv_") # typed pointer to env

  # Schedule
  let task = ident"ctt_tpSpawnVoidTask_"
  let scheduleBlock = newCall(schedule, workerContext, task)

  result = newStmtList()

  if funcCall.len == 2:
    # With only 1 arg, the tuple syntax doesn't construct a tuple
    # let env = (123) # is an int
    loopFnCall.add nnkDerefExpr.newTree(env)
  else: # This handles the 0 arg case as well
    for i in 1 ..< funcCall.len:
      loopFnCall.add nnkBracketExpr.newTree(
        env,
        newLit i-1)

  # Create the async call
  result.add quote do:
    proc `tpSpawn_closure`(env: pointer) {.nimcall, gcsafe, raises: [].} =
      when bool(`withArgs`):
        let `env` = cast[ptr `argsTy`](env)
      `loopFnCall`

  # Create the task
  result.add quote do:
    block enq_deq_task:
      when bool(`withArgs`):
        let `task` = Task.newSpawn(
          parent = `workerContext`.currentTask,
          fn = `tpSpawn_closure`,
          env = `args`)
      else:
        let `task` = Task.newSpawn(
          parent = `workerContext`.currentTask,
          fn = `tpSpawn_closure`)
      `scheduleBlock`

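# For reference, a sketch of what `spawnVoid` emits for a call like
# `tp.spawn display(a, b)` (hypothetical user proc, generated names abridged):
#
#   proc ctt_tpSpawnVoidClosure_display(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     let env = cast[ptr (typeof(a), typeof(b))](env)
#     display(env[0], env[1])
#
#   block enq_deq_task:
#     let task = Task.newSpawn(
#       parent = ctx.currentTask,
#       fn = ctt_tpSpawnVoidClosure_display,
#       env = (a, b))
#     schedule(ctx, task)
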
proc spawnRet(funcCall: NimNode, retTy, args, argsTy: NimNode, workerContext, schedule: NimNode): NimNode =
  # Create the async function
  result = newStmtList()

  let fn = funcCall[0]
  let fnName = $fn
  let tpSpawn_closure = ident("ctt_tpSpawnRetClosure_" & fnName)
  var loopFnCall = newCall(fn)
  let env = ident("ctt_tpSpawnRetEnv_") # typed pointer to env

  # Tasks have no return value:
  # 1. The start of the task `env` buffer will store the return value for the flowvar and awaiter/sync
  # 2. We create a wrapper tpSpawn_closure without return value that sends the return value in the channel
  # 3. We package that wrapper function in a task

  # We store the following in task.env:
  #
  # | ptr Task | result | arg₀ | arg₁ | ... | argₙ
  let fut = ident"ctt_tpSpawnRetFut_"
  let taskSelfReference = ident"ctt_taskSelfReference"
  let retVal = ident"ctt_retVal"

  var envParams = nnkPar.newTree
  var envParamsTy = nnkPar.newTree
  envParams.add taskSelfReference
  envParamsTy.add nnkPtrTy.newTree(bindSym"Task")
  envParams.add retVal
  envParamsTy.add retTy

  for i in 1 ..< funcCall.len:
    envParamsTy.add getTypeInst(funcCall[i])
    envParams.add funcCall[i]

  # env stores | ptr Task | result | arg₀ | arg₁ | ... | argₙ
  # so arguments start at env[2] in the wrapping funcCall functions
  for i in 1 ..< funcCall.len:
    loopFnCall.add nnkBracketExpr.newTree(env, newLit i+1)

  result.add quote do:
    proc `tpSpawn_closure`(env: pointer) {.nimcall, gcsafe, raises: [].} =
      let `env` = cast[ptr `envParamsTy`](env)
      let res = `loopFnCall`
      readyWith(`env`[0], res)

  # Regenerate fresh ident, retTy has been tagged as a function call param
  let retTy = ident($retTy)
  let task = ident"ctt_tpSpawnRetTask_"
  let scheduleBlock = newCall(schedule, workerContext, task)

  # Create the task
  result.add quote do:
    block enq_deq_task:
      let `taskSelfReference` = cast[ptr Task](0xDEADBEEF)
      let `retVal` = default(`retTy`)

      let `task` = Task.newSpawn(
        parent = `workerContext`.currentTask,
        fn = `tpSpawn_closure`,
        env = `envParams`)
      let `fut` = newFlowVar(`retTy`, `task`)
      `scheduleBlock`
      # Return the future
      `fut`

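# Sketch of the corresponding expansion for a spawn with a return value,
# e.g. `tp.spawn sum(a, b)` (hypothetical user proc returning int, names
# abridged). The env layout is | ptr Task | result | arg₀ | arg₁ |,
# hence the arguments start at env[2]:
#
#   proc ctt_tpSpawnRetClosure_sum(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     let env = cast[ptr (ptr Task, int, typeof(a), typeof(b))](env)
#     let res = sum(env[2], env[3])
#     readyWith(env[0], res)   # publish the result for the Flowvar
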
proc spawnImpl*(tp: NimNode{nkSym}, funcCall: NimNode, workerContext, schedule: NimNode): NimNode =
  funcCall.expectKind(nnkCall)

  # Get the return type if any
  let retTy = funcCall[0].getImpl[3][0]
  let needFuture = retTy.kind != nnkEmpty

  # Get a serialized type and data for all function arguments.
  # We use an ad-hoc tuple.
  var argsTy = nnkPar.newTree()
  var args = nnkPar.newTree()
  for i in 1 ..< funcCall.len:
    argsTy.add getTypeInst(funcCall[i])
    args.add funcCall[i]

  # Package in a task
  if not needFuture:
    result = spawnVoid(funcCall, args, argsTy, workerContext, schedule)
  else:
    result = spawnRet(funcCall, retTy, args, argsTy, workerContext, schedule)

  # Wrap in a block for namespacing
  result = nnkBlockStmt.newTree(newEmptyNode(), result)

# ############################################################
#                                                            #
#                      Data parallelism                      #
#                                                            #
# ############################################################

# Error messages generation
# --------------------------------------------------------------------------------------------------
# This outputs nice syntax examples for the parallel reduction
# and parallel staged domain-specific languages.

type Example = enum
  Reduce
  Staged

template parReduceExample() {.dirty.} =
  # Used for a nice error message

  proc parallelReduceExample(n: int): int =
    tp.parallelFor i in 0 ..< n:
      ## Declare a parallelFor or parallelForStrided loop as usual
      reduceInto(globalSum: int64):
        ## Indicate that the loop is a reduction and declare the global reduction variable to sync with
        prologue:
          ## Declare your local reduction variable(s) here.
          ## It should be initialized with the neutral element
          ## corresponding to your fold operation
          ## (0 for addition, 1 for multiplication, -Inf for max, +Inf for min, ...).
          ##
          ## This is task-local (and thread-local); each task sets up this section independently.
          ## Splitting into multiple tasks is done dynamically at the runtime's discretion,
          ## depending on available parallelism and load.
          var localSum = 0
        forLoop:
          ## This is the reduction loop
          localSum += i
        merge(remoteSum: FlowVar[int64]):
          ## Define how to merge with partial reductions from remote threads.
          ## Remote threads' results come as Flowvars that need to be synced.
          ## Latency-hiding techniques can be used to overlap epilogue computations
          ## with the sync of other threads.
          localSum += sync(remoteSum)
        epilogue:
          ## Local task cleanup, like freeing memory allocated in the prologue,
          ## and returning the local accumulator
          return localSum

    ## Await the parallel reduction
    return sync(globalSum)

template parStagedExample() {.dirty.} =
  # Used for a nice error message

  proc parallelStagedSumExample(n: int): int =
    ## We will do a sum reduction to illustrate
    ## staged parallel for

    ## First take the address of the result
    let res = result.addr

    ## Declare a parallelForStaged loop
    tp.parallelForStaged i in 0 ..< n:
      captures: {res}
      prologue:
        ## Declare anything needed before the for-loop.
        ## This will be thread-local, so each thread will run this section independently.
        ## The loop increment is not available here.
        var localSum = 0
      forLoop:
        ## This is within the parallel loop
        localSum += i
      epilogue:
        ## Once the loop is finished, you have a final opportunity for processing.
        ## Thread-local cleanup should happen here as well.
        ## Here we print the localSum and atomically increment the global sum
        ## before ending the task.
        echo "localsum = ", localSum
        res[].atomicInc(localSum)

    ## Await all tasks
    tp.syncAll()

proc printReduceExample() =
  let example = getAst(parReduceExample())
  echo example.toStrLit()
proc printStagedExample() =
  let example = getAst(parStagedExample())
  echo example.toStrLit()

proc testKind(nn: NimNode, nnk: NimNodeKind, kind: Example) =
  if nn.kind != nnk:
    case kind
    of Reduce: printReduceExample()
    of Staged: printStagedExample()
    nn.expectKind(nnk) # Gives nice line numbers

# Parallel Loop Domain-Specific Language Descriptor
# --------------------------------------------------------------------------------------------------

type
  LoopKind = enum
    kForLoop
    kReduction
    kStaged

  LoopDescriptor = object
    ## A loop descriptor fully describes a parallel loop
    ## before final code generation.
    ##
    ## Fields are ordered by depth of the call stack:
    ## - the user defines the loop boundaries and captures
    ## - a closure with signature `proc MyFunctionName(env: pointer)`
    ##   is generated
    ## - it gets packaged in a task
    ## - on task execution, the inner proc is reconstructed
    ## - that inner proc may have various sections depending on the loop kind

    kind: LoopKind

    # Loop bounds
    # -----------
    indexVariable: NimNode
    start: NimNode
    stopEx: NimNode
    stride: NimNode

    # Closure generation
    # ------------------
    envName: NimNode
    closureName: NimNode
    closureDef: NimNode
    capturedVars: NimNode
    capturedTypes: NimNode

    # Task packaging and scheduling
    # -----------------------------
    taskName: NimNode
    taskCreation: NimNode
    workerContext: NimNode
    scheduleFn: NimNode

    # Parallel loop stages
    # --------------------
    # There are 3 call levels for loops (see the sketch after this type):
    # - closure(env: pointer) {.nimcall, gcsafe, raises: [].}
    # - loopFn(args: ptr (argsTy₀, argsTy₁, ..., argsTyₙ)): returnType {.inline, nimcall, gcsafe, raises: [].}
    #     let (args₀, args₁, ..., argsₙ) = args[]
    #     loopTemplate(indexVar, prologue, loopBody, ...)
    # - loopTemplate(indexVar, prologue, loopBody, ...: untyped)
    #
    # The last 2 levels are inlined in the closure.
    # - The closure deals with removing type erasure from an untyped environment and updating the future once the task is finished.
    # - The loopFn reinstalls the captured values.
    # - The loopTemplate reimplements the sections as well as the runtime interaction
    #   for loop-splitting checks and merging reduction accumulators with split tasks.
    #
    # A side benefit of the loopFn is that it allows borrow-checking:
    # - error if we capture a `var parameter`
    # - error if we forget to capture a runtime variable (compile-time constants do not have to be captured)
    loopFnName: NimNode   # inner function called by the closure once the environment is retyped
    loopTemplate: NimNode # inner function implementation, defined in threadpool.nim
    prologue: NimNode
    forLoop: NimNode
    epilogue: NimNode

    # Futures - awaitable loops and reductions
    # ----------------------------------------
    globalAwaitable: NimNode
    remoteTaskAwaitable: NimNode
    awaitableType: NimNode
    mergeLocalWithRemote: NimNode

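# To make the 3 call levels above concrete, here is an illustrative expansion
# for `parallelFor i in 0 ..< n: captures: {a, b}` (identifiers abridged; the
# real generated names carry ctt_ prefixes and the loop template lives in
# threadpool.nim):
#
#   proc loopFn(env: ptr (typeof(a), typeof(b))) {.nimcall, gcsafe, inline, raises: [].} =
#     let (a, b) = env[]           # level 2: reinstall the captures
#     loopTemplate(i, loopBody)    # level 3: sections + runtime loop splitting
#
#   proc closure(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     let env = cast[ptr (typeof(a), typeof(b))](env)  # level 1: remove type erasure
#     loopFn(env)
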
# Parsing parallel loop DSL
# --------------------------------------------------------------------------------------------------

proc checkLoopBounds(loopBounds: NimNode) =
  ## Checks loop parameters
  ## --------------------------------------------------------
  ## loopBounds should have the form "i in 0..<10"
  loopBounds.expectKind(nnkInfix)
  assert loopBounds[0].eqIdent"in"
  loopBounds[1].expectKind(nnkIdent)
  loopBounds[2].expectKind(nnkInfix) # 0 ..< 10 / 0 .. 10, for now we don't support slice objects
  assert loopBounds[2][0].eqIdent".." or loopBounds[2][0].eqIdent"..<"

proc parseLoopBounds(ld: var LoopDescriptor, loopBounds: NimNode) =
  ## Extract the index, start and stop of the loop.
  ## Strides must be dealt with separately.
  let loopBounds = rebuildUntypedAst(loopBounds, dropRootStmtList = true)
  checkLoopBounds(loopBounds)
  ld.indexVariable = loopBounds[1]
  ld.start = loopBounds[2][1]
  ld.stopEx = loopBounds[2][2]
  # We use exclusive bounds
  if loopBounds[2][0].eqIdent"..":
    ld.stopEx = newCall(ident"succ", ld.stopEx)

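# Bound normalization example: for an inclusive range the stop expression is
# rewritten to `succ(stop)` so that downstream codegen only ever deals with
# exclusive bounds (illustrative values):
#
#   # "i in 0 ..< 10"  =>  indexVariable = i, start = 0, stopEx = 10
#   # "i in 0 .. 10"   =>  indexVariable = i, start = 0, stopEx = succ(10), i.e. 11
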
proc parseCaptures(ld: var LoopDescriptor, body: NimNode) =
  ## Extract captured variables from the for-loop body.
  ## Once extracted, the section that declared those captures is discarded.
  ##
  ## Returns the captured variables and the captured variable types
  ## in a tuple of nnkPar for easy use in tuple construction and destructuring.
  # parallelFor i in 0 ..< 10:
  #   captures: {a}
  #   ...
  #
  # StmtList
  #   Call
  #     Ident "captures"
  #     StmtList
  #       Curly
  #         Ident "a"
  #   Rest of the body
  for i in 0 ..< body.len:
    if body[i].kind == nnkCall and body[i][0].eqIdent"captures":
      ld.capturedVars = nnkPar.newTree()
      ld.capturedTypes = nnkPar.newTree()

      body[i][1].expectKind(nnkStmtList)
      body[i][1][0].expectKind(nnkCurly)
      for j in 0 ..< body[i][1][0].len:
        ld.capturedVars.add body[i][1][0][j]
        ld.capturedTypes.add newCall(ident"typeof", body[i][1][0][j])

      # Remove the captures section
      body[i] = nnkDiscardStmt.newTree(body[i].toStrLit)
      return

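# For `captures: {a, b}` this fills the descriptor with two parallel nnkPar
# trees that later codegen can splice directly (illustrative):
#
#   capturedVars  = (a, b)
#   capturedTypes = (typeof(a), typeof(b))
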
proc extractSection(ldField: var NimNode, body: NimNode, sectionName: string) =
  body.expectKind(nnkStmtList)
  for i in 0 ..< body.len:
    if body[i].kind == nnkCall and body[i][0].eqIdent(sectionName):
      body[i][1].expectKind(nnkStmtList)
      ldField = body[i][1]
      # Remove the section
      body[i] = nnkDiscardStmt.newTree(body[i].toStrLit)
      return

# Code generation
# --------------------------------------------------------------------------------------------------

proc generateClosure(ld: LoopDescriptor): NimNode =

  let env = ld.envName
  let capturedTypes = ld.capturedTypes
  let withCaptures = ld.capturedTypes.len > 0

  let closureName = ld.closureName
  var loopFnCall = newCall(ld.loopFnName)
  if withCaptures:
    loopFnCall.add(env)

  case ld.kind
  of kForLoop:
    result = quote do:
      proc `closureName`(env: pointer) {.nimcall, gcsafe, raises: [].} =
        when bool(`withCaptures`):
          let `env` = cast[ptr `capturedTypes`](env)
        `loopFnCall`
  of kReduction:
    let retTy = ld.awaitableType

    result = quote do:
      proc `closureName`(env: pointer) {.nimcall, gcsafe, raises: [].} =
        let taskSelfReference = cast[ptr ptr Task](env)
        when bool(`withCaptures`):
          let offset = cast[ByteAddress](env) +% sizeof((ptr Task, `retTy`))
          let `env` = cast[ptr `capturedTypes`](offset)
        let res = `loopFnCall`
        readyWith(taskSelfReference[], res)
  else:
    error "Not Implemented"

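# Env layout note for the kReduction closure above: the task environment is
#   | ptr Task (self reference) | reduction result | captured variables... |
# so the captures live `sizeof((ptr Task, retTy))` bytes past the env base.
# Illustrative expansion for captures {a, b} and an int64 accumulator:
#
#   proc closure(env: pointer) {.nimcall, gcsafe, raises: [].} =
#     let taskSelfReference = cast[ptr ptr Task](env)
#     let offset = cast[ByteAddress](env) +% sizeof((ptr Task, int64))
#     let env = cast[ptr (typeof(a), typeof(b))](offset)
#     let res = loopFn(env)
#     readyWith(taskSelfReference[], res)   # fulfill the loop's Flowvar
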
proc generateAndScheduleLoopTask(ld: LoopDescriptor): NimNode =
  result = newStmtList()

  var withCaptures = false
  if not ld.capturedVars.isNil:
    withCaptures = true

  # TODO: awaitable for loop

  # Dependencies
  # ---------------------------------------------------
  var scheduleBlock: NimNode
  let task = ident"ctt_tpLoopTask_"
  # TODO: Dataflow parallelism / precise task dependencies
  scheduleBlock = newCall(ld.scheduleFn, ld.workerContext, task)

  # ---------------------------------------------------
  let
    (start, stopEx, stride) = (ld.start, ld.stopEx, ld.stride)
    workerContext = ld.workerContext
    (closureName, capturedVars) = (ld.closureName, ld.capturedVars)
    (globalAwaitable, awaitableType) = (ld.globalAwaitable, ld.awaitableType)
  if ld.awaitableType.isNil():
    result = quote do:
      block enq_deq_task: # block for namespacing
        let start = `start` # Ensure single evaluation / side-effect
        let stopEx = `stopEx`
        if stopEx-start != 0:
          when bool(`withCaptures`):
            let `task` = Task.newLoop(
              parent = `workerContext`.currentTask,
              start, stopEx, `stride`,
              isFirstIter = true,
              fn = `closureName`,
              env = `capturedVars`)
          else:
            let `task` = Task.newLoop(
              parent = `workerContext`.currentTask,
              start, stopEx, `stride`,
              isFirstIter = true,
              fn = `closureName`)
          `scheduleBlock`
  else:
    result = quote do:
      var `globalAwaitable`: FlowVar[`awaitableType`]
      block enq_deq_task: # Block for namespacing
        let start = `start` # Ensure single evaluation / side-effect
        let stopEx = `stopEx`
        if stopEx-start != 0:
          let taskSelfReference = cast[ptr Task](0xDEADBEEF)
          var retValBuffer = default(`awaitableType`)

          when bool(`withCaptures`):
            let `task` = Task.newLoop(
              parent = `workerContext`.currentTask,
              start, stopEx, `stride`,
              isFirstIter = true,
              fn = `closureName`,
              env = (taskSelfReference, retValBuffer, `capturedVars`))
          else:
            let `task` = Task.newLoop(
              parent = `workerContext`.currentTask,
              start, stopEx, `stride`,
              isFirstIter = true,
              fn = `closureName`,
              env = (taskSelfReference, retValBuffer))
          `globalAwaitable` = newFlowVar(`awaitableType`, `task`)
          `scheduleBlock`

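# Packing note: for awaitable loops the env tuple built above,
#   (taskSelfReference, retValBuffer, capturedVars...),
# mirrors the | ptr Task | result | captures | layout that the kReduction
# closure unpacks. The 0xDEADBEEF value is only a placeholder for the task's
# own address, which cannot be known before the Task is allocated; it is
# presumably fixed up once the Task/Flowvar exist (see Task.newLoop and
# newFlowVar in the threadpool implementation):
#
#   let taskSelfReference = cast[ptr Task](0xDEADBEEF)  # placeholder self-pointer
#   var retValBuffer = default(int64)                   # reduction result slot
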
proc generateParallelLoop(ld: LoopDescriptor): NimNode =
  # Package a parallel for loop into a proc.
  # Returns the statements that implement it.
  let pragmas = nnkPragma.newTree(
    ident"nimcall", ident"gcsafe", ident"inline",
    nnkExprColonExpr.newTree(ident"raises", nnkBracket.newTree())) # raises: []

  var params: seq[NimNode]
  if ld.awaitableType.isNil:
    params.add newEmptyNode()
  else:
    params.add ld.awaitableType

  var procBody = newStmtList()

  if ld.capturedVars.len > 0:
    params.add newIdentDefs(ld.envName, nnkPtrTy.newTree(ld.capturedTypes))

    let derefEnv = nnkBracketExpr.newTree(ld.envName)
    if ld.capturedVars.len > 1:
      # Unpack the variables captured from the environment
      # let (a, b, c) = env[]
      var unpacker = nnkVarTuple.newTree()
      ld.capturedVars.copyChildrenTo(unpacker)
      unpacker.add newEmptyNode()
      unpacker.add derefEnv

      procBody.add nnkLetSection.newTree(unpacker)
    else:
      procBody.add newLetStmt(ld.capturedVars[0], derefEnv)

  case ld.kind
  of kForLoop:
    procBody.add newCall(ld.loopTemplate, ld.indexVariable, ld.forLoop)
  of kReduction:
    procBody.add newCall(
      ld.loopTemplate, ld.indexVariable,
      ld.prologue, ld.forLoop, ld.mergeLocalWithRemote, ld.epilogue,
      ld.remoteTaskAwaitable, ld.awaitableType)
  else:
    error "Unimplemented"

  result = newProc(
    name = ld.loopFnName,
    params = params,
    body = procBody,
    pragmas = pragmas)

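# Illustrative output of `generateParallelLoop` for a reduction with
# captures {a, b} and an int64 accumulator (abridged identifiers; the
# loop template itself is defined in threadpool.nim):
#
#   proc loopFn(env: ptr (typeof(a), typeof(b))): int64 {.nimcall, gcsafe, inline, raises: [].} =
#     let (a, b) = env[]
#     loopTemplate(i, prologueSection, forLoopSection,
#                  mergeSection, epilogueSection,
#                  remoteTaskAwaitable, int64)
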
# Parallel for
# --------------------------------------------------------------------------------------------------

proc parallelForImpl*(workerContext, scheduleFn, loopTemplate, loopBounds, body: NimNode): NimNode =
  ## Parallel for loop.
  ## Syntax:
  ##
  ##   parallelFor i in 0 ..< 10:
  ##     echo(i)
  ##
  ## Variables from the external scope need to be explicitly captured:
  ##
  ##   var a = 100
  ##   var b = 10
  ##   parallelFor i in 0 ..< 10:
  ##     captures: {a, b}
  ##     echo a + b + i

  result = newStmtList()
  var ld = LoopDescriptor(kind: kForLoop, workerContext: workerContext, scheduleFn: scheduleFn)

  # Parse the loop Domain-Specific Language
  # --------------------------------------------------------
  body.expectKind(nnkStmtList)
  ld.parseLoopBounds(loopBounds)
  ld.stride.extractSection(body, "stride")
  if ld.stride.isNil:
    ld.stride = newLit(1)
  ld.parseCaptures(body)
  ld.forLoop = body

  # Code generation
  # --------------------------------------------------------
  ld.loopTemplate = loopTemplate
  ld.loopFnName = ident("ctt_tpParForImpl_")
  ld.envName = ident("ctt_tpParForEnv_")
  result.add ld.generateParallelLoop()

  ld.closureName = ident("ctt_tpParForClosure_")
  result.add ld.generateClosure()

  ld.taskName = ident("ctt_tpParForTask_")
  result.add ld.generateAndScheduleLoopTask()

# Parallel reductions
# --------------------------------------------------------------------------------------------------

proc parseReductionSection(body: NimNode):
       tuple[globalAwaitable, awaitableType, reductionBody: NimNode] =
  for i in 0 ..< body.len:
    # parallelFor i in 0 .. n:
    #   reduceInto(globalSum: int64):
    #     prologue:
    #       var localSum = 0'i64
    #
    # StmtList
    #   Call
    #     ObjConstr
    #       Ident "reduceInto"
    #       ExprColonExpr
    #         Ident "globalSum"
    #         Ident "int64"
    #     StmtList
    #       Call
    #         Ident "prologue"
    #         StmtList
    #           VarSection
    #             IdentDefs
    #               Ident "localSum"
    #               Empty
    #               Int64Lit 0
    if body[i].kind == nnkCall and
       body[i][0].kind == nnkObjConstr and
       body[i][0][0].eqIdent"reduceInto":
      body[i][0][1].testKind(nnkExprColonExpr, Reduce)
      body[i][1].testKind(nnkStmtList, Reduce)

      if body[i][1].len != 4:
        printReduceExample()
        error "A reduction should have 4 sections named:\n" &
              "  prologue, forLoop, merge and epilogue statements\n"
      # (globalAwaitable, awaitableType, reductionBody)
      return (body[i][0][1][0], body[i][0][1][1], body[i][1])

  printReduceExample()
  error "Missing section \"reduceInto(globalAwaitable: awaitableType):\""

proc extractRemoteTaskMerge(ld: var LoopDescriptor, body: NimNode) =
  for i in 0 ..< body.len:
    if body[i].kind == nnkCall and
       body[i][0].kind == nnkObjConstr and
       body[i][0][0].eqIdent"merge":
      body[i][0][1].testKind(nnkExprColonExpr, Reduce)
      body[i][1].testKind(nnkStmtList, Reduce)

      ld.remoteTaskAwaitable = body[i][0][1][0]
      ld.mergeLocalWithRemote = body[i][1]
      return

  printReduceExample()
  error "Missing section \"merge(remoteThreadAccumulator: Flowvar[accumulatorType]):\""

proc parallelReduceImpl*(workerContext, scheduleFn, loopTemplate, loopBounds, body: NimNode): NimNode =
  ## Parallel reduce loop.
  ## Syntax:
  ##
  ##   parallelFor i in 0 ..< 100:
  ##     reduceInto(globalSum: int64):
  ##       prologue:
  ##         ## Initialize before the loop
  ##         var localSum = 0
  ##       forLoop:
  ##         ## Compute the partial reductions
  ##         localSum += i
  ##       merge(remoteSum: Flowvar[int64]):
  ##         ## Merge our local reduction with reductions from remote threads
  ##         localSum += sync(remoteSum)
  ##         return localSum
  ##
  ##   # Await our result
  ##   let sum = sync(globalSum)
  ##
  ## The first element from the iterator (i) in the example is not available in the prologue.
  ## Depending on multithreaded scheduling it may start at 0, halfway through, or close to completion.
  ## The accumulator set in the prologue should be initialized to the neutral element of your fold operation:
  ## - 0 for addition, 1 for multiplication, +Inf for min, -Inf for max, ...
  ##
  ## In the forLoop section the iterator i is available; the number of iterations per task is undefined.
  ## The runtime chooses dynamically how many iterations are done to maximize throughput.
  ## - This requires your operation to be associative, i.e. (a+b)+c = a+(b+c).
  ## - It does not require your operation to be commutative (a+b = b+a is not needed).
  ## - In particular, floating-point addition is NOT associative due to rounding errors,
  ##   and results may differ between runs.
  ##   For inputs usually in [-1,1]
  ##   the floating-point addition error is within 1e-8 (float32) or 1e-15 (float64).
  ##   For inputs beyond 1e9 please evaluate the acceptable precision.
  ##   Note that one of the main benefits of "-ffast-math" is treating floating-point
  ##   addition as associative.
  ##
  ## In the merge section, a tuple (identifier: Flowvar[MyType]) for a partial reduction from a remote core must be passed.
  ## The merge section may be executed multiple times if a loop was split between many threads.
  ## The local partial reduction must be returned.
  ##
  ## Variables from the external scope need to be explicitly captured.
  ## For example, to compute the variance of a seq in parallel:
  ##
  ##   var s = newSeqWith(1000, rand(100.0))
  ##   let mean = mean(s)
  ##
  ##   let ps = cast[ptr UncheckedArray[float64]](s[0].addr)
  ##
  ##   parallelFor i in 0 ..< s.len:
  ##     captures: {ps, mean}
  ##     reduceInto(globalVariance: float64):
  ##       prologue:
  ##         var localVariance = 0.0
  ##       forLoop:
  ##         localVariance += (ps[i] - mean)^2
  ##       merge(remoteVariance: Flowvar[float64]):
  ##         localVariance += sync(remoteVariance)
  ##         return localVariance
  ##
  ##   # Await our result
  ##   let variance = sync(globalVariance)
  ##
  ## Performance note:
  ##   For trivial floating-point operations like addition/sum reductions,
  ##   before parallelizing a reduction over multiple cores
  ##   you might try to parallelize it on a single core by
  ##   creating multiple accumulators (between 2 and 4)
  ##   and unrolling the accumulation loop by that amount
  ##   (see the sketch below).
  ##
  ##   The compiler is unable to do that (without -ffast-math)
  ##   as floating-point addition is NOT associative, and changing
  ##   the order will change the result due to floating-point rounding errors.
  ##
  ##   The performance improvement is dramatic (2x-3x): at a low level
  ##   there is no data dependency between the accumulators, so
  ##   the CPU can use instruction-level parallelism instead
  ##   of suffering from data-dependency latency (3 or 4 cycles).
  ##   https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE&expand=158
  ##   The reduction becomes memory-bound instead of CPU-latency-bound.

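  # Sketch of the single-core multi-accumulator trick mentioned in the
  # performance note above (plain serial Nim, not part of this macro's output;
  # `data` is a hypothetical openArray[float64]):
  #
  #   var acc0, acc1 = 0.0
  #   var i = 0
  #   while i + 2 <= data.len:    # unroll by 2: independent dependency chains
  #     acc0 += data[i]
  #     acc1 += data[i+1]
  #     i += 2
  #   while i < data.len:         # handle the tail
  #     acc0 += data[i]
  #     inc i
  #   let total = acc0 + acc1     # merge the accumulators once at the end
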
  result = newStmtList()
  var ld = LoopDescriptor(kind: kReduction, workerContext: workerContext, scheduleFn: scheduleFn)

  # Parse the loop Domain-Specific Language
  # --------------------------------------------------------
  body.testKind(nnkStmtList, Reduce)
  ld.parseLoopBounds(loopBounds)
  ld.stride.extractSection(body, "stride")
  if ld.stride.isNil:
    ld.stride = newLit(1)
  ld.parseCaptures(body)

  var reductionBody: NimNode
  (ld.globalAwaitable, ld.awaitableType, reductionBody) = parseReductionSection(body)
  ld.extractRemoteTaskMerge(reductionBody)

  ld.prologue.extractSection(reductionBody, "prologue")
  ld.forLoop.extractSection(reductionBody, "forLoop")
  ld.epilogue.extractSection(reductionBody, "epilogue")

  # Code generation
  # --------------------------------------------------------
  ld.loopTemplate = loopTemplate
  ld.loopFnName = ident("ctt_tpParReduceImpl_")
  ld.envName = ident("ctt_tpParReduceEnv_")
  result.add ld.generateParallelLoop()

  ld.closureName = ident("ctt_tpParReduceClosure_")
  result.add ld.generateClosure()

  ld.taskName = ident("ctt_tpParReduceTask_")
  result.add ld.generateAndScheduleLoopTask()

# ############################################################
#                                                            #
#                 Parallel For Dispatchers                   #
#                                                            #
# ############################################################

proc hasReduceSection*(body: NimNode): bool =
  for i in 0 ..< body.len:
    if body[i].kind == nnkCall:
      for j in 0 ..< body[i].len:
        if body[i][j].kind == nnkObjConstr and body[i][j][0].eqIdent"reduceInto":
          return true
  return false
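
# Dispatch sketch: a `parallelFor` front-end macro can use this predicate to
# route the body to the right implementation (illustrative; the actual
# dispatcher and the loop templates live in threadpool.nim):
#
#   if body.hasReduceSection():
#     result = parallelReduceImpl(ctx, scheduleFn, reduceTemplate, loopBounds, body)
#   else:
#     result = parallelForImpl(ctx, scheduleFn, forTemplate, loopBounds, body)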