nimbus-eth1/nimbus/transaction/host_call_nested.nim
Jacek Sieka b3cb51e89e
Speed up evm stack (#2881)
The EVM stack is a hot spot in EVM execution and we end up paying a nim
seq tax in several ways, adding up to ~5% of execution time:

* on initial allocation, all bytes get zeroed - this means we have to
choose between allocating a full stack or just a partial one and then
growing it
* pushing and popping introduce additional zeroing
* reallocations on growth copy + zero - expensive again!
* redundant range checking on every operation reducing inlining etc

Here a custom stack using C memory is introduced:

* no zeroing on allocation
* full stack allocated on EVM startup -> no reallocation during
execution
* fast push/pop - no zeroing again
* 32-byte alignment - this makes it easier for the compiler to use
vector instructions
* no stack allocated for precompiles (these never use it anyway)

Of course, this change also means we have to manage memory manually -
for the EVM, this turns out to be not too bad because we already manage
database transactions the same way (they have to be freed "manually") so
we can simply latch on to this mechanism.

While we're at it, this PR also skips database lookup for known
precompiles by resolving such addresses earlier.
2024-11-30 10:07:10 +01:00

165 lines
7.0 KiB
Nim

# Nimbus - Services available to EVM code that is run for a transaction
#
# Copyright (c) 2019-2024 Status Research & Development GmbH
# Licensed under either of
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises: [].}
import
eth/common/eth_types,
stew/ptrops,
stew/saturation_arith,
stint,
../evm/[types, precompiles],
../evm/interpreter_dispatch,
../utils/utils,
"."/[host_types, host_trace]
import ../evm/computation except fromEvmc, toEvmc
proc evmcResultRelease(res: var EvmcResult) {.cdecl, gcsafe.} =
  ## Release callback installed in `EvmcResult.release` by the `afterExec*`
  ## procs of this module: frees the output buffer they obtained from `alloc`.
  res.output_data.dealloc()
proc beforeExecCreateEvmcNested(host: TransactionHost,
                                m: EvmcMessage): Computation =
  ## Builds the child `Computation` for a nested create request.
  ##
  ## The fields of the EVMC message `m` are converted into a `Message`; the
  ## input bytes are copied into a fresh `seq`. The computation is created on
  ## `host.vmState` as a non-precompile, with `m.create2_salt` reinterpreted
  ## as the `ContractSalt`.
  # TODO: use evmc_message to avoid copy
  let createMsg = Message(
    kind: CallKind(ord(m.kind)),
    depth: m.depth,
    gas: GasInt(m.gas),
    sender: fromEvmc(m.sender),
    value: fromEvmc(m.value),
    data: @(makeOpenArray(m.input_data, m.input_size.int))
  )
  newComputation(host.vmState, false, createMsg, isPrecompile = false,
                 keepStack = false,
                 cast[ContractSalt](m.create2_salt))
# Translates the outcome of a finished nested create computation `child` into
# the EVMC result `res`. `host` is not referenced in the body.
#
# What gets written to `res`:
# * `gas_left`, only when `child.shouldBurnGas` is false;
# * on success: `gas_refund`, `EVMC_SUCCESS` and the created contract address;
# * otherwise: the status code reported by `child.evmcStatus`;
# * a non-empty `child.output` is copied into an `alloc`ed buffer, and
#   `res.release` is set to `evmcResultRelease` so the buffer can be freed.
#
# NOTE(review): leading indentation is not visible in this view, so it cannot
# be told from here whether the output copy belongs to the `else:` branch only
# or runs for both outcomes - confirm against the properly indented file.
proc afterExecCreateEvmcNested(host: TransactionHost, child: Computation,
res: var EvmcResult) {.inline.} =
if not child.shouldBurnGas:
# `gasRemaining` is converted to `int64` through `saturate`.
res.gas_left = int64.saturate(child.gasMeter.gasRemaining)
if child.isSuccess:
res.gas_refund = child.gasMeter.gasRefunded
res.status_code = EVMC_SUCCESS
res.create_address = child.msg.contractAddress.toEvmc
else:
res.status_code = child.evmcStatus
if child.output.len > 0:
# TODO: can we move the ownership of seq to raw pointer?
res.output_size = child.output.len.uint
# The bytes are copied out of the `seq` into a manually allocated buffer.
res.output_data = cast[ptr byte](alloc(child.output.len))
copyMem(res.output_data, child.output[0].addr, child.output.len)
# The release callback `dealloc`s `output_data`.
res.release = evmcResultRelease
proc beforeExecCallEvmcNested(host: TransactionHost,
                              m: EvmcMessage): Computation {.inline.} =
  ## Builds the child `Computation` for a nested (non-create) call request.
  ##
  ## The fields of the EVMC message `m` are converted into a `Message`; the
  ## input bytes are copied into a fresh `seq`. Whether the computation is
  ## flagged as a precompile is decided by looking up the code address with
  ## `getPrecompile` for the current fork.

  # Only a plain `EVMC_CALL` targets the message recipient; every other kind
  # keeps the contract address of the computation that is currently running.
  let targetAddress =
    if m.kind == EVMC_CALL:
      fromEvmc(m.recipient)
    else:
      host.computation.msg.contractAddress

  let callMsg = Message(
    kind: CallKind(ord(m.kind)),
    depth: m.depth,
    gas: GasInt(m.gas),
    sender: fromEvmc(m.sender),
    codeAddress: fromEvmc(m.code_address),
    contractAddress: targetAddress,
    value: fromEvmc(m.value),
    data: @(makeOpenArray(m.input_data, m.input_size.int)),
    flags: m.flags,
  )

  let targetsPrecompile =
    getPrecompile(host.vmState.fork, callMsg.codeAddress).isSome()
  newComputation(host.vmState, false, callMsg,
                 isPrecompile = targetsPrecompile, keepStack = false)
# Translates the outcome of a finished nested call computation `child` into
# the EVMC result `res`. `host` is not referenced in the body.
#
# What gets written to `res`:
# * `gas_left`, only when `child.shouldBurnGas` is false;
# * on success: `gas_refund` and `EVMC_SUCCESS`;
# * otherwise: the status code reported by `child.evmcStatus`;
# * a non-empty `child.output` is copied into an `alloc`ed buffer, and
#   `res.release` is set to `evmcResultRelease` so the buffer can be freed.
#
# NOTE(review): leading indentation is not visible in this view, so it cannot
# be told from here whether the output copy belongs to the `else:` branch only
# or runs for both outcomes - confirm against the properly indented file.
proc afterExecCallEvmcNested(host: TransactionHost, child: Computation,
res: var EvmcResult) {.inline.} =
if not child.shouldBurnGas:
# `gasRemaining` is converted to `int64` through `saturate`.
res.gas_left = int64.saturate(child.gasMeter.gasRemaining)
if child.isSuccess:
res.gas_refund = child.gasMeter.gasRefunded
res.status_code = EVMC_SUCCESS
else:
res.status_code = child.evmcStatus
if child.output.len > 0:
# TODO: can we move the ownership of seq to raw pointer?
res.output_size = child.output.len.uint
# The bytes are copied out of the `seq` into a manually allocated buffer.
res.output_data = cast[ptr byte](alloc(child.output.len))
copyMem(res.output_data, child.output[0].addr, child.output.len)
# The release callback `dealloc`s `output_data`.
res.release = evmcResultRelease
# The next three functions are designed so `callEvmcNested` uses very small C
# stack usage for each level of nested EVM calls.
#
# To keep the C stack usage small when there are deeply nested EVM calls,
# `callEvmcNested` must use as little stack as possible, going from the EVM
# which calls it to the nested EVM which it calls.
#
# First, `callEvmcNested` itself is `template` so it is inlined to the caller
# at Nim level, not C level. Only at Nim level is inlining guaranteed across
# `import`. This saves a C stack frame, which matters because some C compilers
# reserve space for 1-3 copies of the large `EvmcResult` return value.
#
# Second, the complicated parts of preparation and return are done in
# out-of-line functions `beforeExecEvmcNested` and `afterExecEvmcNested`. They
# are annotated with `{.noinline.}` to make sure they are out-of-line. The
# annotation ensures they don't contribute to the stack frame of
# `callEvmcNested`, because otherwise the compiler can optimistically inline.
# (Even across modules when using `-flto`).
#
# The functions `beforeExecEvmcNested` and `afterExecEvmcNested` can use as
# much stack as they like.
proc beforeExecEvmcNested(host: TransactionHost,
                          msg: EvmcMessage): Computation {.noinline.} =
  ## Prepares a nested EVM call or create: builds the child computation,
  ## saves the host's current computation on `host.saveComputation`, makes
  ## the child current, and returns it.
  ##
  ## This proc must stay `{.noinline.}` so that it does not contribute to the
  ## stack frame of `callEvmcNested`.

  # `call` is special. Other host functions do `flip256` in `evmc_host_glue`
  # and `show` in `host_services`, but for `call` the C stack consumed per
  # nested EVM level has to stay minimal. Doing `flip256` in glue's `call`
  # adds a lot of stack (+65% in tests, enough to blow the 750kiB test stack
  # target and crash), so both steps are done out-of-line here instead.
  var nestedMsg = msg # Local copy that is ok to modify.
  nestedMsg.value = flip256(nestedMsg.value)
  host.showCallEntry(nestedMsg)

  let creating =
    nestedMsg.kind == EVMC_CREATE or nestedMsg.kind == EVMC_CREATE2
  let child =
    if creating:
      beforeExecCreateEvmcNested(host, nestedMsg)
    else:
      beforeExecCallEvmcNested(host, nestedMsg)

  when defined(evmc_enabled):
    child.host.init(cast[ptr nimbus_host_interface](host.hostInterface),
                    cast[typeof(child.host.context)](host))

  # Remember the caller's computation; `afterExecEvmcNested` restores it.
  host.saveComputation.add(host.computation)
  host.computation = child
  child
proc afterExecEvmcNested(host: TransactionHost, child: Computation,
                         kind: EvmcCallKind): EvmcResult {.noinline.} =
  ## Finishes a nested EVM call or create: restores the computation saved by
  ## `beforeExecEvmcNested` and fills the returned `EvmcResult` from `child`.
  ##
  ## This proc must stay `{.noinline.}` so that it does not contribute to the
  ## stack frame of `callEvmcNested`.

  # Pop the saved computation. The slot is cleared before the seq is
  # shortened, as in the push/pop protocol used by `beforeExecEvmcNested`.
  let top = host.saveComputation.high
  host.computation = host.saveComputation[top]
  host.saveComputation[top] = nil
  host.saveComputation.setLen(top)

  let creating = kind == EVMC_CREATE or kind == EVMC_CREATE2
  if creating:
    afterExecCreateEvmcNested(host, child, result)
  else:
    afterExecCallEvmcNested(host, child, result)

  host.showCallReturn(result, kind.isCreate)
template callEvmcNested*(host: TransactionHost, msg: EvmcMessage): EvmcResult =
# Runs one nested EVM call or create and evaluates to its `EvmcResult`:
# the child computation is prepared by `beforeExecEvmcNested`, executed with
# `execCallOrCreate`, and converted to a result by `afterExecEvmcNested`.
#
# `call` is special. The C stack usage must be kept small for deeply nested
# EVM calls. To ensure small stack, this function must use `template` to
# inline at Nim level (same for `host.call(msg)`). `{.inline.}` is not good
# enough. Due to object return it ends up using a lot more stack. (Note
# that template parameters `host` and `msg` are multiple-evaluated here;
# simple expressions must be used when calling.)
let child = beforeExecEvmcNested(host, msg)
child.execCallOrCreate()
# The last expression is the value of the template; `msg.kind` selects the
# create or call flavour of result handling.
afterExecEvmcNested(host, child, msg.kind)