nim-taskpools/taskpools/taskpools.nim
# Nim-Taskpools
# Copyright (c) 2021 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Taskpools
#
# This file implements a taskpool
#
# Implementation:
#
# It is a simple shared-memory-based work-stealing threadpool.
# The primary focus is:
# - Delegate compute-intensive tasks to the threadpool.
# - Simple to audit, by staying close to foundational papers
# and using simple data structures otherwise.
# - Low energy consumption:
# threads should be put to sleep ASAP
# instead of polling/spinning (energy vs latency tradeoff)
# - Decent performance:
# Work-stealing has optimal asymptotic parallel speedup.
# Work-stealing has significantly reduced contention
# when many tasks are created,
# for example by divide-and-conquer algorithms, compared to a global task queue
#
# Not a priority:
# - Handling trillions of very short tasks (less than 100µs).
# - Advanced task dependencies or events API.
# - Unbalanced parallel-for loops.
# - Handling services that should run for the lifetime of the program.
#
# Doing IO on a compute threadpool should be avoided:
# if a thread is blocked on IO, other threads can steal the pending tasks of that thread,
# but if all threads are blocked on IO, the threadpool makes no progress and is soft-locked.
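#
# Example usage (a minimal sketch of the API defined and exported below —
# Taskpool.new, spawn, sync, syncAll, shutdown; `double` is a hypothetical user proc):
#
#   proc double(x: int): int = 2*x
#
#   proc main() =
#     var tp = Taskpool.new(numThreads = 4)
#     let fv = tp.spawn double(21)   # schedule the call, get a Flowvar[int]
#     echo fv.sync()                 # block until the result (42) is available
#     tp.syncAll()                   # wait for any remaining tasks
#     tp.shutdown()
#
#   main()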
{.push raises: [].}
import
system/ansi_c,
std/[random, cpuinfo, atomics, macros],
./channels_spsc_single,
./chase_lev_deques,
./event_notifiers,
./primitives/[barriers, allocs],
./instrumentation/[contracts, loggers],
./sparsesets,
./flowvars,
./ast_utils
export
# flowvars
Flowvar, isSpawned, isReady, sync
when defined(windows):
import ./primitives/affinity_windows
else:
import ./primitives/affinity_posix
when (NimMajor,NimMinor,NimPatch) >= (1,6,0):
import std/[isolation, tasks]
export isolation
else:
import ./shims_pre_1_6/tasks
type
WorkerID = int32
TaskNode = ptr object
# Linked list of tasks
parent: TaskNode
task: Task
Signal = object
terminate {.align: 64.}: Atomic[bool]
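# Note: the 64-byte alignment (one cache line on most targets) is presumably there
# so that each worker's Signal gets its own cache line in the workerSignals array,
# avoiding false sharing between workers.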
WorkerContext = object
## Thread-local worker context
# Params
id: WorkerID
taskpool: Taskpool
# Tasks
taskDeque: ptr ChaseLevDeque[TaskNode] # owned task deque
currentTask: TaskNode
# Synchronization
eventNotifier: ptr EventNotifier # shared event notifier
signal: ptr Signal # owned signal
# Thefts
rng: Rand # RNG state to select victims
numThreads: int
otherDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
victims: SparseSet
Taskpool* = ptr object
barrier: SyncBarrier
## Barrier for initialization and teardown
# --- Align: 64
eventNotifier: EventNotifier
## Puts thread to sleep
numThreads*{.align: 64.}: int
workerDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
## Direct access for task stealing
workers: ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]
workerSignals: ptr UncheckedArray[Signal]
## Access to each worker's termination signal (`Signal.terminate`)
# Thread-local config
# ---------------------------------------------
var workerContext {.threadvar.}: WorkerContext
## Thread-local Worker context
proc setupWorker() =
## Initialize the thread-local context of a worker
## Requires the ID and taskpool fields to be initialized
template ctx: untyped = workerContext
preCondition: not ctx.taskpool.isNil()
preCondition: 0 <= ctx.id and ctx.id < ctx.taskpool.numThreads
preCondition: not ctx.taskpool.workerDeques.isNil()
preCondition: not ctx.taskpool.workerSignals.isNil()
# Thefts
ctx.rng = initRand(0xEFFACED + ctx.id)
ctx.numThreads = ctx.taskpool.numThreads
ctx.otherDeques = ctx.taskpool.workerDeques
ctx.victims.allocate(ctx.taskpool.numThreads)
# Synchronization
ctx.eventNotifier = addr ctx.taskpool.eventNotifier
ctx.signal = addr ctx.taskpool.workerSignals[ctx.id]
ctx.signal.terminate.store(false, moRelaxed)
# Tasks
ctx.taskDeque = addr ctx.taskpool.workerDeques[ctx.id]
ctx.currentTask = nil
# Init
ctx.taskDeque[].init()
proc teardownWorker() =
## Clean up the thread-local context of a worker
template ctx: untyped = workerContext
ctx.taskDeque[].teardown()
ctx.victims.delete()
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].}
proc workerEntryFn(params: tuple[taskpool: Taskpool, id: WorkerID])
{.raises: [Exception].} =
## Entry point of the worker threads: each worker executes this
## until it receives a termination signal
# We assume that thread-local variables all start at their binary zero value
preCondition: workerContext == default(WorkerContext)
template ctx: untyped = workerContext
# If the following crashes, you need --tlsEmulation:off
ctx.id = params.id
ctx.taskpool = params.taskpool
setupWorker()
# 1 matching barrier in Taskpool.new() for root thread
discard params.taskpool.barrier.wait()
{.gcsafe.}: # Not GC-safe when multi-threaded due to thread-local variables
ctx.eventLoop()
debugTermination:
log(">>> Worker %2d shutting down <<<\n", ctx.id)
# 1 matching barrier in taskpool.shutdown() for root thread
discard params.taskpool.barrier.wait()
teardownWorker()
# Tasks
# ---------------------------------------------
proc new(T: type TaskNode, parent: TaskNode, task: sink Task): T =
type TaskNodeObj = typeof(default(T)[])
var tn = cast[TaskNode](c_calloc(1, csize_t sizeof(TaskNodeObj)))
tn.parent = parent
tn.task = task
return tn
proc runTask(tn: var TaskNode) {.raises:[Exception], inline.} =
## Run a task and consume the taskNode
tn.task.invoke()
when (NimMajor,NimMinor,NimPatch) >= (1,6,0):
{.gcsafe.}: # Upstream is missing a `gcsafe` tag on `=destroy`
tn.task.`=destroy`()
else:
tn.task.shim_destroy()
tn.c_free()
proc schedule(ctx: WorkerContext, tn: sink TaskNode) {.inline.} =
## Schedule a task in the taskpool
debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
ctx.taskDeque[].push(tn)
ctx.taskpool.eventNotifier.notify()
# Scheduler
# ---------------------------------------------
proc trySteal(ctx: var WorkerContext): TaskNode =
## Try to steal a task.
ctx.victims.refill()
ctx.victims.excl(ctx.id)
while not ctx.victims.isEmpty():
let target = ctx.victims.randomPick(ctx.rng)
let stolenTask = ctx.otherDeques[target].steal()
if not stolenTask.isNil:
return stolenTask
ctx.victims.excl(target)
return nil
proc eventLoop(ctx: var WorkerContext) {.raises:[Exception].} =
## Each worker thread executes this loop over and over.
while not ctx.signal.terminate.load(moRelaxed):
# 1. Pick from local deque
debug: log("Worker %2d: eventLoop 1 - searching task from local deque\n", ctx.id)
while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
taskNode.runTask()
# 2. Run out of tasks, become a thief
debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
var stolenTask = ctx.trySteal()
if not stolenTask.isNil:
# 2.a Run task
debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
stolenTask.runTask()
else:
# 2.b Park the thread until a new task enters the taskpool
debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
ctx.eventNotifier[].park()
debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
# Tasking
# ---------------------------------------------
const RootTask = default(Task) # TODO: sentinel value different from null task
template isRootTask(task: Task): bool =
task == RootTask
proc forceFuture*[T](fv: Flowvar[T], parentResult: var T) {.raises:[Exception].} =
## Eagerly complete an awaited FlowVar
template ctx: untyped = workerContext
template isFutReady(): untyped =
fv.chan[].tryRecv(parentResult)
if isFutReady():
return
## 1. Process all the children of the current task.
## This ensures that we can give control back ASAP.
debug: log("Worker %2d: sync 1 - searching task from local deque\n", ctx.id)
while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
if taskNode.parent != ctx.currentTask:
debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
ctx.schedule(taskNode)
break
debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
taskNode.runTask()
if isFutReady():
debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
return
## 2. We ran out of tasks, or out of direct children of the currently awaited task.
## The awaited task is therefore bottlenecked by dependencies in other threads,
## so we abandon our enqueued work and steal from the other threads' queues
## in the hope of advancing our awaited task. This prioritizes latency over throughput.
debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
while not isFutReady():
var taskNode = ctx.trySteal()
if not taskNode.isNil:
# We stole some task, we hope we advance our awaited task
debug: log("Worker %2d: sync 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
taskNode.runTask()
# elif (taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
# # We advance our own queue, this increases throughput but may impact latency on the awaited task
# debug: log("Worker %2d: sync 2.2 - couldn't steal, running own task\n", ctx.id)
# taskNode.runTask()
else:
# We don't park, as there is no notification for task completion
cpuRelax()
proc syncAll*(pool: Taskpool) {.raises: [Exception].} =
## Blocks until all pending tasks are completed
## This MUST only be called from
## the root scope that created the taskpool
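##
## Example (a sketch; assumes a Taskpool `tp` and a hypothetical proc `doWork`):
##   tp.spawn doWork(1)
##   tp.spawn doWork(2)
##   tp.syncAll()   # returns once all pending tasks have completed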
template ctx: untyped = workerContext
debugTermination:
log(">>> Worker %2d enters barrier <<<\n", ctx.id)
preCondition: ctx.id == 0
preCondition: ctx.currentTask.task.isRootTask()
# Empty all tasks
var foreignThreadsParked = false
while not foreignThreadsParked:
# 1. Empty local tasks
debug: log("Worker %2d: syncAll 1 - searching task from local deque\n", ctx.id)
while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
taskNode.runTask()
if ctx.numThreads == 1 or foreignThreadsParked:
break
# 2. Help other threads
debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
var taskNode = ctx.trySteal()
if not taskNode.isNil:
# 2.1 We stole some task
debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
taskNode.runTask()
else:
# 2.2 No task to steal
if pool.eventNotifier.getParked() == pool.numThreads - 1:
# 2.2.1 all threads besides the current are parked
debugTermination:
log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
foreignThreadsParked = true
else:
# 2.2.2 We don't park, as there is no notification for task completion
cpuRelax()
debugTermination:
log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
# Runtime
# ---------------------------------------------
proc new*(T: type Taskpool, numThreads = countProcessors(), pinThreadsToCores = false): T {.raises: [Exception].} =
## Initialize a threadpool that manages `numThreads` threads.
## Default to the number of logical processors available.
##
## If `pinThreadsToCores` is set, each spawned thread is pinned to the core matching its worker ID.
## This improves the performance of memory-intensive workloads by avoiding
## thrashing and reloading of core caches when a thread moves around.
##
## The `pinThreadsToCores` option is ignored:
## - when compiling as C++ with the Microsoft Visual Studio Compiler
## - on macOS
## - on Android
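##
## Example (a sketch):
##   var tp = Taskpool.new(numThreads = 8, pinThreadsToCores = true)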
#
# pinThreadsToCores Status:
# - C++ MSVC: implementation missing (need to wrap reinterpret_cast)
# - Android: API missing and unclear benefits due to Big.Little architecture
# - MacOS: API missing
type TpObj = typeof(default(Taskpool)[])
# Event notifier requires an extra 64 bytes for alignment
var tp = wv_allocAligned(TpObj, sizeof(TpObj) + 64, 64)
tp.barrier.init(numThreads.int32)
tp.eventNotifier.initialize()
tp.numThreads = numThreads
tp.workerDeques = wv_allocArrayAligned(ChaseLevDeque[TaskNode], numThreads, alignment = 64)
tp.workers = wv_allocArrayAligned(Thread[(Taskpool, WorkerID)], numThreads, alignment = 64)
tp.workerSignals = wv_allocArrayAligned(Signal, numThreads, alignment = 64)
# Set up the root thread (worker id 0)
workerContext.id = 0
workerContext.taskpool = tp
if pinThreadsToCores:
when not(defined(cpp) and defined(vcc)):
# TODO: Nim casts between Windows Handles but that requires reinterpret cast for C++
pinToCpu(0)
# Start worker threads
for i in 1 ..< numThreads:
createThread(tp.workers[i], worker_entry_fn, (tp, WorkerID(i)))
if pinThreadsToCores:
# TODO: we might want to take into account Hyper-Threading (HT)
# and allow spawning tasks and pinning to cores that are not HT-siblings.
# This is important for memory-bound workloads (like copy, addition, ...)
# where both sibling cores will compete for L1 and L2 cache, effectively
# halving the memory bandwidth or worse, flushing what the other put in cache.
# Note that while 2x siblings is common, Xeon Phi has 4x Hyper-Threading.
when not(defined(cpp) and defined(vcc)):
# TODO: Nim casts between Windows Handles but that requires reinterpret cast for C++
pinToCpu(tp.workers[i], i)
# Root worker
setupWorker()
# Root task, this is a sentinel task that is never called.
workerContext.currentTask = TaskNode.new(
parent = nil,
task = default(Task) # TODO RootTask, somehow this uses `=copy`
)
# Wait for the child threads
discard tp.barrier.wait()
return tp
proc cleanup(tp: var TaskPool) {.raises: [OSError].} =
## Clean up all resources allocated by the taskpool
preCondition: workerContext.currentTask.task.isRootTask()
for i in 1 ..< tp.numThreads:
joinThread(tp.workers[i])
tp.workerSignals.wv_freeAligned()
tp.workers.wv_freeAligned()
tp.workerDeques.wv_freeAligned()
`=destroy`(tp.eventNotifier)
tp.barrier.delete()
tp.wv_freeAligned()
proc shutdown*(tp: var TaskPool) {.raises:[Exception].} =
## Wait until all tasks are processed and then shut down the taskpool
preCondition: workerContext.currentTask.task.isRootTask()
tp.syncAll()
# Signal termination to all threads
for i in 0 ..< tp.numThreads:
tp.workerSignals[i].terminate.store(true, moRelaxed)
let parked = tp.eventNotifier.getParked()
for i in 0 ..< parked:
tp.eventNotifier.notify()
# 1 matching barrier in worker_entry_fn
discard tp.barrier.wait()
teardownWorker()
tp.cleanup()
# Dealloc dummy task
workerContext.currentTask.c_free()
# Task parallelism
# ---------------------------------------------
{.pop.} # raises:[]
macro spawn*(tp: TaskPool, fnCall: typed): untyped =
## Spawns the input function call asynchronously, potentially on another thread of execution.
##
## If the function call returns a result, spawn will wrap it in a Flowvar.
## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
## You can use `isReady` to check if the result is available, i.e. whether a subsequent
## `sync` would return immediately or block.
##
## Tasks are processed approximately in Last-In-First-Out (LIFO) order
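##
## Example (a minimal sketch; `fib` is a hypothetical user proc and `tp` an existing Taskpool):
##   proc fib(n: int): int =
##     if n < 2: n else: fib(n-1) + fib(n-2)
##
##   let fv = tp.spawn fib(20)
##   echo fv.sync()   # prints 6765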
result = newStmtList()
let fn = fnCall[0]
let fnName = $fn
# Get the return type if any
let retType = fnCall[0].getImpl[3][0]
let needFuture = retType.kind != nnkEmpty
# Package in a task
let taskNode = ident("taskNode")
let task = ident("task")
if not needFuture:
result.add quote do:
let `task` = toTask(`fnCall`)
let `taskNode` = TaskNode.new(workerContext.currentTask, `task`)
schedule(workerContext, `taskNode`)
else:
# `Task`s have no return value, so:
# 1. We create a channel/flowvar to transmit the return value to the awaiter/sync
# 2. We create a wrapper async_fn without a return value that sends the return value over the channel
# 3. We package that wrapper function in a task
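#
# For example, `tp.spawn foo(x)` for a hypothetical `proc foo(x: int): int` expands to roughly:
#   let fut = newFlowVar(int)
#   proc taskpool_foo(x: int, fut: Flowvar[int]) {.nimcall.} =
#     let res = foo(x)
#     readyWith(fut, res)
#   let task = toTask(taskpool_foo(x, fut))
#   let taskNode = TaskNode.new(workerContext.currentTask, task)
#   schedule(workerContext, taskNode)
#   fut   # the Flowvar is the value of the `spawn` expression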
# 1. Create the channel
let fut = ident("fut")
let futTy = nnkBracketExpr.newTree(
bindSym"FlowVar",
retType
)
result.add quote do:
let `fut` = newFlowVar(type `retType`)
# 2. Create a wrapper function that sends result to the channel
# TODO, upstream "getImpl" doesn't return the generic params
let genericParams = fn.getImpl()[2].replaceSymsByIdents()
let formalParams = fn.getImpl()[3].replaceSymsByIdents()
var asyncParams = nnkFormalParams.newTree(
newEmptyNode()
)
var fnCallIdents = nnkCall.newTree(
fnCall[0]
)
for i in 1 ..< formalParams.len:
let ident = formalParams[i].replaceSymsByIdents()
asyncParams.add ident
for j in 0 ..< ident.len - 2:
# Handle "a, b: int"
fnCallIdents.add ident[j]
let futFnParam = ident("fut")
asyncParams.add newIdentDefs(futFnParam, futTy)
let asyncBody = quote do:
# XXX: can't test that when the RootTask is default(Task) instead of a sentinel value
# preCondition: not isRootTask(workerContext.currentTask.task)
let res = `fnCallIdents`
readyWith(`futFnParam`, res)
let asyncFn = ident("taskpool_" & fnName)
result.add nnkProcDef.newTree(
asyncFn,
newEmptyNode(),
genericParams,
asyncParams,
nnkPragma.newTree(ident("nimcall")),
newEmptyNode(),
asyncBody
)
var asyncCall = newCall(asyncFn)
for i in 1 ..< fnCall.len:
asyncCall.add fnCall[i].replaceSymsByIdents()
asyncCall.add fut
result.add quote do:
let `task` = toTask(`asyncCall`)
let `taskNode` = TaskNode.new(workerContext.currentTask, `task`)
schedule(workerContext, `taskNode`)
# Return the future / flowvar
`fut`
# Wrap in a block for namespacing
result = nnkBlockStmt.newTree(newEmptyNode(), result)
# echo result.toStrLit()