Mamy Ratsimbazafy bf32c2d408
Parallel for (#222)
* introduce reserve threads to minimize latency and maximize throughput when awaiting a future

* introduce a ceilDiv proc

* threadpool: implement parallel-for loops

* 10x perf improvement by not waking reserveBackoff on syncAll

* bench overhead: new reserve system might introduce too much wakeup latency, 2x slower, for fine-grained parallelism

* add parallelForStrided

* Threadpool: Implement parallel reductions

* refactor parallel loop codegen: introduce descriptor, parsing and codegen stages

* parallel strided, test transpose bench

* tight loop is faster when backoff is not inline

* no POSIX stuff on windows, larger types for histogram bench

* fix tests

* max RSS overflow?

* missed an undefined var

* exit histogram on 32-bit

* forgot to return early for 32-bit
2023-02-24 09:47:36 +01:00

250 lines
8.1 KiB
Nim

# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/atomics,
../instrumentation,
../../allocs, ../../primitives,
./backoff
# Tasks have an efficient design so that a single heap allocation
# is required per `spawn`.
# This greatly reduces overhead and potential memory fragmentation for long-running applications.
#
# This is done by tasks:
# - being an intrusive linked list
# - integrating the channel to send results
#
# Flowvar is the public type created when spawning a task,
# and it can be synced to receive the task result.
# Flowvars are also called futures interchangeably.
# (The name future is already used for IO scheduling)
type
  Task* = object
    ## Task descriptor: a fixed header followed by the inline,
    ## variable-sized `env` byte buffer.
    ## The `newSpawn` / `newLoop` constructors allocate it as
    ## `sizeof(Task)` plus, when an environment is passed, `sizeof(env)`.
    # Intrusive metadata
    # ------------------
    parent*: ptr Task # When a task is awaiting, a thread can quickly prioritize the direct child of a task
    thiefID*: Atomic[int32] # ID of the worker that stole and ran the task. For leapfrogging.
                            # The constructors initialize it to `SentinelThief`.
    # Result sync
    # ------------------
    hasFuture*: bool # Ownership: if a task has a future, the future deallocates it. Otherwise the worker thread does.
    completed*: Atomic[bool] # Set to true (release store) by `readyWith` once the result has been written.
    waiter*: Atomic[ptr EventNotifier] # Initialized to nil by the constructors.
    # Data parallelism
    # ------------------
    isFirstIter*: bool # Awaitable for-loops return true for first iter. Loops are split before first iter.
    loopStart*: int
    loopStop*: int
    loopStride*: int
    loopStepsLeft*: int # Set by `newLoop` to `ceilDiv_vartime(stop-start, stride)`; 0 for plain spawns.
    reductionDAG*: ptr ReductionDagNode # For parallel loop reduction, merge with other range result
    # Dataflow parallelism
    # --------------------
    dependsOnEvent: bool # We cannot leapfrog a task triggered by an event
    # Execution
    # ------------------
    fn*: proc (env: pointer) {.nimcall, gcsafe, raises: [].} # Task entry point; it only receives a pointer to the environment.
    # destroy*: proc (env: pointer) {.nimcall, gcsafe.} # Constantine only deals with plain old data
    envSize*: int32 # Size in bytes of the captured environment (0 when none was passed).
    env*{.align:sizeof(int).}: UncheckedArray[byte] # Inline storage for the captured environment. Must stay the last field.
  Flowvar*[T] = object
    ## A Flowvar is a placeholder for a future result that may be computed in parallel
    task: ptr Task # nil when the Flowvar was not spawned (see `isSpawned`) or after `cleanup`.
  ReductionDagNode* = object
    ## In a parallel reduction, when a loop is split the worker
    ## keeps track of the tasks to gather results from in a private task-local linked-list.
    ## Those form a global computation directed acyclic graph
    ## with the initial parallel reduction task as root.
    # Note: While this requires an extra allocation per split
    # the alternative, making an intrusive linked-list of reduction tasks
    # require synchronization between threads.
    task*: ptr Task # Task whose result has to be gathered.
    next*: ptr ReductionDagNode # Next node of the task-local list, nil at the end.
# Tasks
# -------------------------------------------------------------------------
const SentinelThief*: int32 = 0xFACADE
  ## Placeholder value stored in `Task.thiefID` by the task constructors,
  ## i.e. before any worker has recorded its ID there.
proc newSpawn*(
       T: typedesc[Task],
       parent: ptr Task,
       fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].}): ptr Task =
  ## Allocates and initializes a task that captures no environment.
  ##
  ## - `parent`: task that spawns this one.
  ## - `fn`: entry point executed for the task.
  ##
  ## Only the task header is allocated (`envSize` is 0), the loop
  ## fields are zeroed, and the task starts without a future,
  ## not completed and with no waiter.
  const headerBytes = sizeof(T)
  let task = allocHeapUnchecked(T, headerBytes)

  # Execution
  task.fn = fn
  task.envSize = 0

  # Intrusive metadata
  task.parent = parent
  task.thiefID.store(SentinelThief, moRelaxed)

  # Result sync
  task.hasFuture = false
  task.completed.store(false, moRelaxed)
  task.waiter.store(nil, moRelaxed)

  # Data parallelism: unused for a plain spawn
  task.isFirstIter = false
  task.loopStart = 0
  task.loopStop = 0
  task.loopStride = 0
  task.loopStepsLeft = 0
  task.reductionDAG = nil

  # Dataflow parallelism
  task.dependsOnEvent = false

  task
proc newSpawn*(
       T: typedesc[Task],
       parent: ptr Task,
       fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].},
       env: auto): ptr Task =
  ## Allocates and initializes a task that carries a copy of `env`.
  ##
  ## - `parent`: task that spawns this one.
  ## - `fn`: entry point executed for the task.
  ## - `env`: captured environment, copied by value into the
  ##   task's trailing `env` buffer.
  ##
  ## The header and the environment share a single allocation of
  ## `sizeof(T) + sizeof(env)` bytes. The loop fields are zeroed, and
  ## the task starts without a future, not completed and with no waiter.
  const
    envBytes = sizeof(env)
    totalBytes = sizeof(T) + envBytes # sizeof(T) does not count the unchecked array
  let task = allocHeapUnchecked(T, totalBytes)

  # Execution
  task.fn = fn
  task.envSize = int32(envBytes)
  cast[ptr typeof(env)](task.env)[] = env

  # Intrusive metadata
  task.parent = parent
  task.thiefID.store(SentinelThief, moRelaxed)

  # Result sync
  task.hasFuture = false
  task.completed.store(false, moRelaxed)
  task.waiter.store(nil, moRelaxed)

  # Data parallelism: unused for a plain spawn
  task.isFirstIter = false
  task.loopStart = 0
  task.loopStop = 0
  task.loopStride = 0
  task.loopStepsLeft = 0
  task.reductionDAG = nil

  # Dataflow parallelism
  task.dependsOnEvent = false

  task
proc newLoop*(
       T: typedesc[Task],
       parent: ptr Task,
       start, stop, stride: int,
       isFirstIter: bool,
       fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].}): ptr Task =
  ## Allocates and initializes a parallel-loop task that captures
  ## no environment.
  ##
  ## - `parent`: task that spawns this one.
  ## - `start`, `stop`, `stride`: loop range description; `start < stop`
  ##   is enforced by a precondition.
  ## - `isFirstIter`: stored as-is in the task.
  ## - `fn`: entry point executed for the task.
  ##
  ## `loopStepsLeft` is initialized to
  ## `ceilDiv_vartime(stop - start, stride)`. The task starts without
  ## a future, not completed and with no waiter.
  const headerBytes = sizeof(T)
  preCondition: start < stop

  let task = allocHeapUnchecked(T, headerBytes)

  # Execution
  task.fn = fn
  task.envSize = 0

  # Intrusive metadata
  task.parent = parent
  task.thiefID.store(SentinelThief, moRelaxed)

  # Result sync
  task.hasFuture = false
  task.completed.store(false, moRelaxed)
  task.waiter.store(nil, moRelaxed)

  # Data parallelism
  task.isFirstIter = isFirstIter
  task.loopStart = start
  task.loopStop = stop
  task.loopStride = stride
  task.loopStepsLeft = ceilDiv_vartime(stop - start, stride)
  task.reductionDAG = nil

  # Dataflow parallelism
  task.dependsOnEvent = false

  task
proc newLoop*(
       T: typedesc[Task],
       parent: ptr Task,
       start, stop, stride: int,
       isFirstIter: bool,
       fn: proc (env: pointer) {.nimcall, gcsafe, raises: [].},
       env: auto): ptr Task =
  ## Allocates and initializes a parallel-loop task that carries
  ## a copy of `env`.
  ##
  ## - `parent`: task that spawns this one.
  ## - `start`, `stop`, `stride`: loop range description; `start < stop`
  ##   is enforced by a precondition.
  ## - `isFirstIter`: stored as-is in the task.
  ## - `fn`: entry point executed for the task.
  ## - `env`: captured environment, copied by value into the
  ##   task's trailing `env` buffer.
  ##
  ## The header and the environment share a single allocation of
  ## `sizeof(T) + sizeof(env)` bytes. `loopStepsLeft` is initialized to
  ## `ceilDiv_vartime(stop - start, stride)`. The task starts without
  ## a future, not completed and with no waiter.
  const
    envBytes = sizeof(env)
    totalBytes = sizeof(T) + envBytes # sizeof(T) does not count the unchecked array
  preCondition: start < stop

  let task = allocHeapUnchecked(T, totalBytes)

  # Execution
  task.fn = fn
  task.envSize = int32(envBytes)
  cast[ptr typeof(env)](task.env)[] = env

  # Intrusive metadata
  task.parent = parent
  task.thiefID.store(SentinelThief, moRelaxed)

  # Result sync
  task.hasFuture = false
  task.completed.store(false, moRelaxed)
  task.waiter.store(nil, moRelaxed)

  # Data parallelism
  task.isFirstIter = isFirstIter
  task.loopStart = start
  task.loopStop = stop
  task.loopStride = stride
  task.loopStepsLeft = ceilDiv_vartime(stop - start, stride)
  task.reductionDAG = nil

  # Dataflow parallelism
  task.dependsOnEvent = false

  task
# Flowvars
# -------------------------------------------------------------------------
# proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
proc newFlowVar*(T: typedesc, task: ptr Task): Flowvar[T] {.inline.} =
  ## Wraps `task` in a Flowvar and marks the task as having a future.
  ##
  ## The task's own address is written at the start of its `env`
  ## buffer. The generated
  ##   proc threadpoolSpawn_fn(env: pointer) {.nimcall.}
  ## can only access `env`, so this self-reference is what lets it
  ## call `readyWith` on the task.
  # NOTE(review): assumes `task.env` was allocated with room for at
  # least one pointer - confirm against the spawn codegen.
  task.hasFuture = true
  let selfSlot = cast[ptr ptr Task](task.env.addr)
  selfSlot[] = task
  result.task = task
proc cleanup*(fv: var Flowvar) {.inline.} =
  ## Frees the task held by `fv` and resets the handle to nil,
  ## so that `isSpawned` returns false afterwards.
  freeHeap(fv.task)
  fv.task = nil
func isSpawned*(fv: Flowvar): bool {.inline.} =
  ## Tells whether `fv` is backed by a task, i.e. whether it was spawned.
  ## Recursive algorithms that only spawn under some condition
  ## can use this the way they would query an Option or Maybe type.
  fv.task != nil
func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
  ## Returns true if the result of a Flowvar is ready,
  ## in which case `sync` will not block.
  ## Otherwise the current thread will block in `sync` and help
  ## with the pending tasks until the Flowvar is ready.
  result = load(fv.task.completed, moAcquire)
func readyWith*[T](task: ptr Task, childResult: T) {.inline.} =
  ## Publishes the result of `task`: sends the Flowvar result from
  ## the child thread processing the task to its parent thread.
  ##
  ## The environment is viewed as a `(ptr Task, T)` tuple. The result
  ## goes in its second slot, then `completed` is set with a release store.
  ## The task must not already be completed.
  preCondition: not load(task.completed, moAcquire)

  let payload = cast[ptr (ptr Task, T)](task.env.addr)
  payload[][1] = childResult

  store(task.completed, true, moRelease)
proc sync*[T](fv: sink Flowvar[T]): T {.noInit, inline, gcsafe.} =
  ## Blocks the current thread until the flowvar result is available
  ## and returns it.
  ## The thread is not idle while waiting: it completes pending tasks.
  ##
  ## A flowvar that was never spawned yields a zeroed `T`.
  ## Otherwise `completeFuture` fills the result and the flowvar's task
  ## is freed through `cleanup`.
  mixin completeFuture

  if not fv.task.isNil:
    completeFuture(fv, result)
    cleanup(fv)
  else:
    # `result` is {.noInit.}, so it has to be cleared explicitly.
    zeroMem(addr result, sizeof(T))
# ReductionDagNodes
# -------------------------------------------------------------------------
proc newReductionDagNode*(task: ptr Task, next: ptr ReductionDagNode): ptr ReductionDagNode {.inline.} =
  ## Allocates a reduction DAG node that refers to `task`
  ## and links to `next` (which may be nil).
  let node = allocHeap(ReductionDagNode)
  node.task = task
  node.next = next
  node