Mamy Ratsimbazafy d996ccd5d8
Path reorgs (#240)
* move tests

* move threadpool to root path

* fix hints and warnings, print nim versions for tests for debugging the new strange issue in CI

* print nim version

* mixup on branches

* mixup on branches reloaded
2023-05-29 20:14:30 +02:00

319 lines
12 KiB
Nim

# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# Original transposition codes from Laser project
# (c) Mamy André Ratsimbazafy, Apache License version 2
import
# Stdlib
system/ansi_c, std/[strformat, os, strutils, math, cpuinfo],
# Constantine
../../threadpool
when not defined(windows):
# bench
import ../wtime, ../resources
# Memory
# ---------------------------------------------------
proc wv_alloc*(T: typedesc): ptr T {.inline.} =
  ## Allocates storage for a single value of type `T` and returns
  ## a typed pointer to it.
  ##
  ## By default the storage comes from C `malloc`. Compiling with
  ## `-d:WV_useNimAlloc` switches to Nim's shared allocator, which can be
  ## used to measure the overhead of its lock.
  ##
  ## The memory is not zeroed, and the pointer is handed back exactly as
  ## the underlying allocator returned it.
  when not defined(WV_useNimAlloc):
    cast[ptr T](c_malloc(csize_t(sizeof(T))))
  else:
    createSharedU(T)
proc wv_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Allocates one contiguous chunk able to hold `len` elements of type `T`
  ## and returns it as an unchecked array pointer.
  ##
  ## By default the storage comes from C `malloc`. Compiling with
  ## `-d:WV_useNimAlloc` switches to Nim's shared allocator, which can be
  ## used to measure the overhead of its lock.
  ##
  ## The memory is not zeroed.
  when not defined(WV_useNimAlloc):
    let raw = c_malloc(csize_t(len * sizeof(T)))
  else:
    let raw = createSharedU(T, len)
  result = cast[ptr UncheckedArray[T]](raw)
proc wv_free*[T: ptr](p: T) {.inline.} =
  ## Releases memory through the deallocator that matches the build:
  ## C `free` by default, Nim's `freeShared` with `-d:WV_useNimAlloc`.
  when not defined(WV_useNimAlloc):
    c_free(p)
  else:
    freeShared(p)
# Transpose implementations
# ---------------------------------------------------
type TransposeStrategy = enum ## Transpose implementation selected for the benchmark run.
  Sequential   ## dispatched to `sequentialTranspose` (plain nested loops, no threadpool)
  Naive        ## dispatched to `cttNaiveTranspose` (`parallelFor` on the outer loop)
  Nested       ## dispatched to `cttNestedTranspose` (`parallelFor` on both loops)
  TiledNested  ## dispatched to `ctt2DTiledNestedTranspose` (nested `parallelFor` with a stride of 64)
# Question: do we need `__restrict` on the buffers so that the compiler
# does not generate defensive, aliasing-robust code?
proc sequentialTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
  ## Single-threaded baseline: reads `bufIn` as an MxN matrix
  ## (element (i, j) at `i*N + j`) and writes its transpose into `bufOut`
  ## (element (j, i) at `j*M + i`).
  ##
  ## `tp` is not used; the parameter keeps the signature identical to the
  ## parallel strategies.
  for col in 0 ..< N:
    let dstRow = col * M
    for row in 0 ..< M:
      bufOut[dstRow + row] = bufIn[row * N + col]
proc cttNaiveTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
  ## Transposes the MxN matrix in `bufIn` into the NxM matrix in `bufOut`,
  ## handing the outer loop to the threadpool's `parallelFor`.
  # Writes cost more than reads, so the inner index walks `bufOut` linearly.
  tp.parallelFor col in 0 ..< N:
    captures: {M, N, bufIn, bufOut}
    for row in 0 ..< M:
      bufOut[col*M + row] = bufIn[row*N + col]
proc cttNestedTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
  ## Transposes the MxN matrix in `bufIn` into the NxM matrix in `bufOut`
  ## using two nested `parallelFor` loops, one per matrix dimension.
  tp.parallelFor col in 0 ..< N:
    # `tp` is captured because the inner loop is scheduled on the same pool.
    captures: {tp, M, N, bufIn, bufOut}
    tp.parallelFor row in 0 ..< M:
      captures: {col, M, N, bufIn, bufOut}
      bufOut[col*M + row] = bufIn[row*N + col]
proc ctt2DTiledNestedTranspose(tp: Threadpool, M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
  ## Transposes the MxN matrix in `bufIn` into the NxM matrix in `bufOut`
  ## with nested `parallelFor` loops that each declare a stride of `tile`;
  ## the loop body then copies one (at most) `tile` x `tile` block.
  # NOTE(review): presumably `stride` makes the loop index advance by `tile`
  # per iteration, so each body invocation starts a fresh block - confirm
  # against the threadpool's `parallelFor` documentation.
  const tile = 64 # a const does not need to be captured
  tp.parallelFor colStart in 0 ..< N:
    stride: tile
    captures: {tp, M, N, bufIn, bufOut}
    tp.parallelFor rowStart in 0 ..< M:
      stride: tile
      captures: {colStart, M, N, bufIn, bufOut}
      # The upper bounds are clamped so a partial block at the edge stays in range.
      for col in colStart ..< min(colStart+tile, N):
        for row in rowStart ..< min(rowStart+tile, M):
          bufOut[col*M + row] = bufIn[row*N + col]
# Meta
# ---------------------------------------------------
func computeMeta(height, width: int): tuple[reqOps, reqBytes, bufSize: int] =
  ## Derives the benchmark figures for a `height` x `width` float32 matrix:
  ## - `reqOps`:   one operation per element
  ## - `reqBytes`: size of the matrix in bytes
  ## - `bufSize`:  number of elements a buffer must hold
  let elems = height * width
  (reqOps: elems, reqBytes: sizeof(float32) * height * width, bufSize: elems)
func initialize(buffer: ptr UncheckedArray[float32], len: int) =
  ## Fills the first `len` slots of `buffer` with their own index
  ## converted to float32 (0.0, 1.0, 2.0, ...).
  var idx = 0
  while idx < len:
    buffer[idx] = float32(idx)
    inc idx
# Bench
# ---------------------------------------------------
template memUsage(maxRSS, runtimeRSS, pageFaults: untyped{ident}, body: untyped) =
  ## Declares three `int32` variables in the caller's scope, named after the
  ## identifiers passed as `maxRSS`, `runtimeRSS` and `pageFaults`, then runs
  ## `body` between two `getrusage(RusageSelf, ...)` snapshots.
  ##
  ## After `body` (non-Windows builds only):
  ## - `runtimeRSS` holds the change in `ru_maxrss` across `body`
  ## - `pageFaults` holds the change in `ru_minflt` across `body`
  ## - `maxRSS`     holds `ru_maxrss` as read after `body`
  ##
  ## On Windows no snapshot is taken and the three variables stay at zero.
  var
    maxRSS: int32
    runtimeRSS: int32
    pageFaults: int32
  block:
    when not defined(windows):
      var usage: Rusage
      getrusage(RusageSelf, usage)
      # Stash the "before" readings in the output variables themselves.
      runtimeRSS = usage.ru_maxrss
      pageFaults = usage.ru_minflt
    body
    when not defined(windows):
      getrusage(RusageSelf, usage)
      runtimeRSS = usage.ru_maxrss - runtimeRSS
      pageFaults = usage.ru_minflt - pageFaults
      maxRSS = usage.ru_maxrss
proc reportPass(
    rows, cols: int,
    elapsedMs: float64, maxRss, runtimeRss, pageFaults: int32,
    perf: float
  ) =
  ## Prints the figures of one transposition pass (`rows`x`cols` --> `cols`x`rows`).
  ## Keeping a pass's dimensions, timing, memory counters and throughput in a
  ## single call guarantees they are always printed together.
  echo "Transposition: ", rows, 'x', cols, " --> ", cols, 'x', rows
  when not defined(windows):
    # Timing and resource counters are only collected on non-Windows builds;
    # the throughput is derived from the timing, so it is skipped as well.
    echo "Time(ms): ", round(elapsedMs, 3)
    echo "Max RSS (KB): ", maxRss
    echo "Runtime RSS (KB): ", runtimeRss
    echo "# of page faults: ", pageFaults
    echo "Perf (GMEMOPs/s ~ GigaMemory Operations/s) ", round(perf, 3)

proc report(
    M, N: int, nthreads: int32, nrounds: int, reordered: bool,
    transposeStrategy: TransposeStrategy, reqOps, reqBytes: int,
    mxnTime: float64, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults: int32,
    nxmTime: float64, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults: int32,
  ) =
  ## Prints the benchmark summary followed by the results of both passes.
  ##
  ## - `M`, `N`: matrix dimensions; the MxN pass transposes MxN --> NxM,
  ##   the NxM pass transposes NxM --> MxN.
  ## - `nthreads`, `nrounds`, `transposeStrategy`: echoed as run parameters.
  ## - `reqOps`, `reqBytes`: operations and bytes per transposition, used for
  ##   the arithmetic intensity and the throughput.
  ## - `mxnTime` / `nxmTime`: elapsed time in milliseconds over all `nrounds`
  ##   rounds of the respective pass.
  ## - `*MaxRSS`, `*RuntimeRss`, `*PageFaults`: resource counters of the
  ##   respective pass.
  ## - `reordered`: when true the NxM pass is printed first.
  ##
  ## Each pass is always printed with its own throughput, whatever the order.
  const separator = "--------------------------------------------------------------------------"

  let arithIntensity = reqOps.float / reqBytes.float
  # ops / (seconds per round), scaled to giga-operations per second
  let mxnPerf = reqOps.float/(mxnTime*1e-3 / nrounds.float) * 1e-9
  let nxmPerf = reqOps.float/(nxmTime*1e-3 / nrounds.float) * 1e-9

  echo separator
  echo "Scheduler: Constantine's threadpool"
  echo "Benchmark: Transpose - ", $transposeStrategy
  echo "Threads: ", nthreads
  echo "# of rounds: ", nrounds
  echo "# of operations: ", reqOps
  echo "# of bytes: ", reqBytes
  echo "Arithmetic Intensity: ", round(arithIntensity, 3)
  echo separator

  if not reordered:
    reportPass(M, N, mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults, mxnPerf)
    echo separator
    reportPass(N, M, nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults, nxmPerf)
  else:
    reportPass(N, M, nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults, nxmPerf)
    echo separator
    reportPass(M, N, mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults, mxnPerf)
template runBench(tp: Threadpool, transposeName: typed, reorderCompute, isSequential: bool): untyped =
  ## Runs the benchmark for one transpose implementation and prints the report.
  ##
  ## - `tp`: threadpool variable; assigned a fresh pool here unless
  ##   `isSequential` is true, and shut down before reporting.
  ## - `transposeName`: transpose proc, invoked as
  ##   `tp.transposeName(rows, cols, bufIn, bufOut)`.
  ## - `reorderCompute`: when true the NxM pass is measured before the MxN pass.
  ## - `isSequential`: when true no pool is created, synced or shut down.
  ##
  ## The template reads these names from the instantiation scope: `M`, `N`,
  ## `nrounds`, `nthreads`, `bufIn`, `bufOut`, `transposeStrat`, `reqOps`,
  ## `reqBytes`, and assigns `mxnTime` / `nxmTime` (non-Windows builds only).
  ##
  ## Each pass runs `nrounds` rounds inside `memUsage`, then waits for the
  ## pool with `syncAll` before the clock is stopped.
  if not isSequential:
    # Size the pool with the thread count that `report` prints, so the
    # reported number of threads is the one actually used.
    tp = Threadpool.new(numThreads = int(nthreads))

  if not reorderCompute:
    memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
      when not defined(windows):
        let start = wtime_msec()
      for _ in 0 ..< nrounds:
        tp.transposeName(M, N, bufIn, bufOut)
      if not isSequential:
        tp.syncAll()
      when not defined(windows):
        let stop = wtime_msec()
        mxnTime = stop - start

    memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
      when not defined(windows):
        let start = wtime_msec()
      for _ in 0 ..< nrounds:
        tp.transposeName(N, M, bufIn, bufOut)
      if not isSequential:
        tp.syncAll()
      when not defined(windows):
        let stop = wtime_msec()
        nxmTime = stop - start

    if not isSequential:
      tp.shutdown()

    # `memUsage` declared the counters in this branch's scope, so the report
    # has to be issued from within the branch.
    report(M, N, nthreads, nrounds, reorderCompute,
      transposeStrat, reqOps, reqBytes,
      mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
      nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
    )
  else:
    memUsage(nxmMaxRss, nxmRuntimeRss, nxmPageFaults):
      when not defined(windows):
        let start = wtime_msec()
      for _ in 0 ..< nrounds:
        tp.transposeName(N, M, bufIn, bufOut)
      if not isSequential:
        tp.syncAll()
      when not defined(windows):
        let stop = wtime_msec()
        nxmTime = stop - start

    memUsage(mxnMaxRss, mxnRuntimeRss, mxnPageFaults):
      when not defined(windows):
        let start = wtime_msec()
      for _ in 0 ..< nrounds:
        tp.transposeName(M, N, bufIn, bufOut)
      if not isSequential:
        tp.syncAll()
      when not defined(windows):
        let stop = wtime_msec()
        mxnTime = stop - start

    if not isSequential:
      tp.shutdown()

    report(M, N, nthreads, nrounds, reorderCompute,
      transposeStrat, reqOps, reqBytes,
      mxnTime, mxnMaxRSS, mxnRuntimeRss, mxnPageFaults,
      nxmTime, nxmMaxRSS, nxmRuntimeRss, nxmPageFaults
    )
# Interface
# ---------------------------------------------------
proc main() =
  ## Command-line entry point of the transpose benchmark.
  ##
  ## With no argument the defaults below are used; with exactly five
  ## arguments they are parsed as `M N rounds transposeStrategy reorderCompute`;
  ## any other argument count prints the usage and exits with status 1.
  ##
  ## The locals `M`, `N`, `nrounds`, `nthreads`, `transposeStrat`, `reqOps`,
  ## `reqBytes`, `bufIn`, `bufOut`, `mxnTime` and `nxmTime` are picked up by
  ## name inside the `runBench` template, so they must keep these names.
  var
    M = 400
    N = 4000
    nrounds = 1000
    transposeStrat = TiledNested
    reorderCompute = false

  case paramCount()
  of 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
    echo &"Running with default M={M}, N={N}, rounds={nrounds}, transposeStrategy={transposeStrat}, reorderCompute={reorderCompute}"
  of 5:
    M = parseInt(paramStr(1))
    N = parseInt(paramStr(2))
    nrounds = parseInt(paramStr(3))
    transposeStrat = parseEnum[TransposeStrategy](paramStr(4))
    reorderCompute = parseBool(paramStr(5))
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <M: int> <N: int> <rounds: int> <transposeStrategy: Sequential|Naive|Nested|TiledNested> <reorderCompute: bool>"
    echo &"Default \"{exeName} {M} {N} {nrounds} {transposeStrat} {reorderCompute}\""
    quit(1)

  echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"

  let isSequential = transposeStrat == Sequential

  # Thread count shown in the report: 1 for the sequential baseline,
  # otherwise CTT_NUM_THREADS when set, else the number of processors.
  let nthreads: int32 =
    if isSequential: 1'i32
    elif existsEnv("CTT_NUM_THREADS"): int32(parseInt(getEnv("CTT_NUM_THREADS")))
    else: int32(countProcessors())

  let (reqOps, reqBytes, bufSize) = computeMeta(M, N)

  let bufOut = wv_alloc(float32, bufSize)
  let bufIn = wv_alloc(float32, bufSize)
  # Only the input needs known contents; the output is fully overwritten.
  bufIn.initialize(bufSize)

  var mxnTime, nxmTime: float64
  var tp: Threadpool

  case transposeStrat
  of Sequential:
    tp.runBench(sequentialTranspose, reorderCompute, isSequential)
  of Naive:
    tp.runBench(cttNaiveTranspose, reorderCompute, isSequential)
  of Nested:
    tp.runBench(cttNestedTranspose, reorderCompute, isSequential)
  of TiledNested:
    tp.runBench(ctt2DTiledNestedTranspose, reorderCompute, isSequential)

  wv_free(bufOut)
  wv_free(bufIn)

main()