nim-taskpools/benchmarks/heat/stdnim_heat.nim

# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# From fibril
#
# Original license
#
# /*
#  * Heat diffusion (Jacobi-type iteration)
#  *
#  * Volker Strumpen, Boston                                 August 1996
#  *
#  * Copyright (c) 1996 Massachusetts Institute of Technology
#  *
#  * This program is free software; you can redistribute it and/or modify
#  * it under the terms of the GNU General Public License as published by
#  * the Free Software Foundation; either version 2 of the License, or
#  * (at your option) any later version.
#  *
#  * This program is distributed in the hope that it will be useful,
#  * but WITHOUT ANY WARRANTY; without even the implied warranty of
#  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  * GNU General Public License for more details.
#  *
#  * You should have received a copy of the GNU General Public License
#  * along with this program; if not, write to the Free Software
#  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#  */

import
  # Stdlib
  strformat, os, strutils, math, system/ansi_c,
  cpuinfo, threadpool,
  # bench
  ../wtime, ../resources

# This deadlocks :/

# Helpers
# -------------------------------------------------------

# We need a thin wrapper around raw pointers for matrices,
# we can't pass "var seq[seq[float64]]" to other threads
# nor "var" for that matter
type
  Matrix[T] = object
    buffer: ptr UncheckedArray[T]
    m, n: int

  Row[T] = object
    buffer: ptr UncheckedArray[T]
    len: int

func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
  result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
  result.m = m
  result.n = n

template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  # row-major storage
  assert row < mat.m
  assert col < mat.n
  mat.buffer[row * mat.n + col]

template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
  assert row < mat.m
  assert col < mat.n
  mat.buffer[row * mat.n + col] = value

func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
  # row-major storage, there are n columns in between each rows
  assert rowIdx < mat.m
  result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
  result.len = mat.m

template `[]`[T](row: Row[T], idx: Natural): T =
  assert idx < row.len
  row.buffer[idx]

template `[]=`[T](row: Row[T], idx: Natural, value: T) =
  assert idx < row.len
  row.buffer[idx] = value

func delete[T](mat: sink Matrix[T]) =
  c_free(mat.buffer)

# And an auto converter for int32 -> float64 so we don't have to convert
# all i, j indices manually

converter i32toF64(x: int32): float64 {.inline.} =
  float64(x)

# -------------------------------------------------------

template f(x, y: SomeFloat): SomeFloat =
  sin(x) * sin(y)

template randa[T: SomeFloat](x, t: T): T =
  T(0.0)

proc randb(x, t: SomeFloat): SomeFloat {.inline.} =
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  exp(-2 * t) * sin(x)

template randc[T: SomeFloat](y, t: T): T =
  T(0.0)

proc randd(y, t: SomeFloat): SomeFloat {.inline.} =
  # proc instead of template to avoid Nim constant folding bug:
  # https://github.com/nim-lang/Nim/issues/12783
  exp(-2 * t) * sin(y)

template solu(x, y, t: SomeFloat): SomeFloat =
  exp(-2 * t) * sin(x) * sin(y)

const n = 4096'i32

var
  nx, ny, nt: int32
  xu, xo, yu, yo, tu, to: float64

  dx, dy, dt: float64
  dtdxsq, dtdysq: float64

  odd: Matrix[float64]
  even: Matrix[float64]

proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
  # TODO to allow awaiting `heat` we return a dummy bool
  # The parallel spawns are updating the same matrix cells otherwise
  if iu - il > 1:
    let im = (il + iu) div 2

    let h = spawn heat(m, il, im)
    heat(m, im, iu)
    discard ^h
    return true
  # ------------------------

  let i = il
  let row = m.getRow(i)

  if i == 0:
    for j in 0 ..< ny:
      row[j] = randc(yu + j*dy, 0)
  elif i == nx - 1:
    for j in 0 ..< ny:
      row[j] = randd(yu + j*dy, 0)
  else:
    row[0] = randa(xu + i*dx, 0)
    for j in 1 ..< ny - 1:
      row[j] = f(xu + i*dx, yu + j*dy)
    row[ny - 1] = randb(xu + i*dx, 0)

proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
  # TODO to allow awaiting `diffuse` we return a dummy bool
  # The parallel spawns are updating the same matrix cells otherwise
  if iu - il > 1:
    let im = (il + iu) div 2

    let d = spawn diffuse(output, input, il, im, t)
    diffuse(output, input, im, iu, t)
    discard ^d
    return true
  # ------------------------

  let i = il
  let row = output.getRow(i)

  if i == 0:
    for j in 0 ..< ny:
      row[j] = randc(yu + j*dy, t)
  elif i == nx - 1:
    for j in 0 ..< ny:
      row[j] = randd(yu + j*dy, t)
  else:
    row[0] = randa(xu + i*dx, t)
    for j in 1 ..< ny - 1:
      row[j] = input[i, j] + # The use of nested sequences here is a bad idea ...
               dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
               dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
    row[ny - 1] = randb(xu + i*dx, t)

proc initTest() =
  nx = n
  ny = 1024
  nt = 100
  xu = 0.0
  xo = 1.570796326794896558
  yu = 0.0
  yo = 1.570796326794896558
  tu = 0.0
  to = 0.0000001

  dx = (xo - xu) / float64(nx - 1)
  dy = (yo - yu) / float64(ny - 1)
  dt = (to - tu) / float64(nt)

  dtdxsq = dt / (dx * dx)
  dtdysq = dt / (dy * dy)

  even = newMatrix[float64](nx, ny)
  odd = newMatrix[float64](nx, ny)

proc prep() =
  heat(even, 0, nx)

proc test() =
  var t = tu

  for _ in countup(1, nt.int, 2):
    # nt included
    t += dt
    diffuse(odd, even, 0, nx, t)
    t += dt
    diffuse(even, odd, 0, nx, t)

  if nt mod 2 != 0:
    t += dt
    diffuse(odd, even, 0, nx, t)

proc verify() =
  var
    mat: Matrix[float64]
    mae: float64
    mre: float64
    me:  float64

  mat = if nt mod 2 != 0: odd else: even

  for a in 0 ..< nx:
    for b in 0 ..< ny:
      var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
      if tmp > 1e-3:
        echo "nx: ", nx, " - ny: ", ny
        echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
        quit 1

      me += tmp
      if tmp > mae: mae = tmp
      if mat[a, b] != 0.0: tmp /= mat[a, b]
      if tmp > mre: mre = tmp

  me /= nx * ny

  if mae > 1e-12:
    echo &"Local maximal absolute error {mae:1.3e}"
    quit 1
  if mre > 1e-12:
    echo &"Local maximal relative error {mre:1.3e}"
    quit 1
  if me > 1e-12:
    echo &"Global mean absolute error {me:1.3e}"
    quit 1

  echo "Verification successful"

proc main() =
  var nthreads: int
  nthreads = countProcessors()

  var ru: Rusage
  getrusage(RusageSelf, ru)
  var
    rss = ru.ru_maxrss
    flt = ru.ru_minflt

  initTest()

  prep()
  let start = wtime_usec()
  test()
  let stop = wtime_usec()

  getrusage(RusageSelf, ru)
  rss = ru.ru_maxrss - rss
  flt = ru.ru_minflt - flt

  sync()

  verify()
  delete(even)
  delete(odd)

  echo "Scheduler:  Nim threadpool (standard lib)"
  echo "Benchmark:        heat"
  echo "Threads:          ", nthreads
  echo "Time(us)          ", stop - start
  echo "Max RSS (KB):     ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt

  quit 0

main()