From 605bf993448bc91620b9d40134d1b13c2e3b081e Mon Sep 17 00:00:00 2001 From: tersec Date: Tue, 26 Mar 2024 23:05:49 +0000 Subject: [PATCH] remove macOS/aarch64 workaround from proposeBlockAux (#6138) --- beacon_chain/validators/beacon_validators.nim | 131 ------------------ 1 file changed, 131 deletions(-) diff --git a/beacon_chain/validators/beacon_validators.nim b/beacon_chain/validators/beacon_validators.nim index 076df7f27..964d8ce42 100644 --- a/beacon_chain/validators/beacon_validators.nim +++ b/beacon_chain/validators/beacon_validators.nim @@ -1147,137 +1147,6 @@ proc proposeBlockAux( blobsBundle.proofs, blobsBundle.blobs)) else: Opt.none(seq[BlobSidecar]) - - # BIG BUG SOURCE: The `let` below cannot be combined with the others above! - # If combined, there are sometimes `SIGSEGV` during `test_keymanager_api`. - # This has only been observed on macOS (aarch64) in Jenkins, not on GitHub. - # - # - macOS 14.2.1 (23C71) - # - Xcode 15.1 (15C65) - # - Nim v1.6.18 (a749a8b742bd0a4272c26a65517275db4720e58a) - # - # Issue has started occuring around 12 Jan 2024, in a CI run for PR #5731. - # The PR did not change anything related to this, suggesting an environment - # or hardware change. The issue is flaky; could have been introduced earlier - # before surfacing in the aforementioned PR. About 30% to hit bug. - # - # [2024-01-12T11:54:21.011Z] Wrote test_keymanager_api/bootstrap_node.enr - # [2024-01-12T11:54:29.294Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s) - # [2024-01-12T11:54:29.294Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s) - # [2024-01-12T11:54:34.870Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override) - # [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(1016) main - # [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(1006) NimMain - # [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(997) PreMain - # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1502) atmtest_keymanager_apidotnim_Init000 - # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1475) main - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue - # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1481) main - # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(307) startBeaconNode - # [2024-01-12T11:54:34.870Z] beacon_chain/nimbus_beacon_node.nim(1900) start - # [2024-01-12T11:54:34.870Z] beacon_chain/nimbus_beacon_node.nim(1847) run - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue - # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1465) delayedTests - # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(392) runTests - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue - # [2024-01-12T11:54:34.870Z] vendor/nim-unittest2/unittest2.nim(1147) runTests - # [2024-01-12T11:54:34.870Z] vendor/nim-unittest2/unittest2.nim(1086) runDirect - # [2024-01-12T11:54:34.870Z] vendor/nim-testutils/testutils/unittests.nim(16) runTestX60gensym2933 - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(656) waitFor - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(631) pollFor - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll - # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue - # [2024-01-12T11:54:34.870Z] beacon_chain/validators/beacon_validators.nim(82) proposeBlockAux - # [2024-01-12T11:54:34.870Z] vendor/nimbus-build-system/vendor/Nim/lib/system/excpt.nim(631) signalHandler - # [2024-01-12T11:54:34.870Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?) - # - # This appeared again around 25 Feb 2024, in a CI run for PR #5959, - # despite the extra `let` having been applied -- once more observed on - # macOS (aarch64) in Jenkins, and much rarer than before. - # - # [2024-02-25T23:21:24.533Z] Wrote test_keymanager_api/bootstrap_node.enr - # [2024-02-25T23:21:32.756Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s) - # [2024-02-25T23:21:32.756Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s) - # [2024-02-25T23:21:37.219Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override) - # [2024-02-25T23:21:37.219Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1068) main - # [2024-02-25T23:21:37.219Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1058) NimMain - # [2024-02-25T23:21:37.219Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1049) PreMain - # [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1501) atmtest_keymanager_apidotnim_Init000 - # [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1474) main - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue - # [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1480) main - # [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(307) startBeaconNode - # [2024-02-25T23:21:37.219Z] beacon_chain/nimbus_beacon_node.nim(1916) start - # [2024-02-25T23:21:37.219Z] beacon_chain/nimbus_beacon_node.nim(1863) run - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue - # [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1464) delayedTests - # [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(391) runTests - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue - # [2024-02-25T23:21:37.219Z] vendor/nim-unittest2/unittest2.nim(1151) runTests - # [2024-02-25T23:21:37.219Z] vendor/nim-unittest2/unittest2.nim(1086) runDirect - # [2024-02-25T23:21:37.219Z] vendor/nim-testutils/testutils/unittests.nim(16) runTestX60gensym3188 - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(660) waitFor - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(635) pollFor - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll - # [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue - # [2024-02-25T23:21:37.219Z] vendor/nim-chronicles/chronicles.nim(251) proposeBlockAux - # [2024-02-25T23:21:37.219Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?) - # - # One theory is that PR #5946 may increase the frequency, as there were - # times where the Jenkins CI failed almost every time using a shorter trace. - # However, the problem was once more flaky, with some passes in-between. - # For now, PR #5946 was reverted (low priority), and the problem is gone, - # whether related or not. - # - # [2024-02-23T23:11:47.700Z] Wrote test_keymanager_api/bootstrap_node.enr - # [2024-02-23T23:11:54.728Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s) - # [2024-02-23T23:11:54.728Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s) - # [2024-02-23T23:11:59.523Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override) - # [2024-02-23T23:11:59.523Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1067) main - # [2024-02-23T23:11:59.523Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1057) NimMain - # [2024-02-23T23:11:59.523Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll - # [2024-02-23T23:11:59.523Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll - # [2024-02-23T23:11:59.523Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?) - # - # The generated `nimcache` differs slightly if the `let` are separated from - # a single block; separation introduces an additional state in closure iter. - # This change, maybe combined with some macOS specific compiler specifics, - # could this trigger the `SIGSEGV`? Maybe the extra state adds just enough - # complexity to the function to disable certain problematic optimizations? - # The change in size of the environment changes a number of things such as - # alignment and which parts of an environment contain pointers and so on, - # which in turn may have surprising behavioural effects, ie most likely this - # extra state masks some underlying issue. Furthermore, the combination of - # `(await xyz).valueOr: return` is not very commonly used with other `await` - # in the same `let` block, which could explain this not being more common. - # - # Note that when compiling for Wasm, there are similar bugs with `results` - # when inlining unwraps, e.g., in `eth2_rest_serialization.nim`. - # These have not been investigated thoroughly so far as that project uses - # Nim 2.0 with --mm:orc and is just a prototype for Wasm, no production use. - # But maybe there is something weird going on with `results` related to the - # random `SIGSEGV` that we are now observing here, related to doing too much - # inline logic without defining intermediate isolated `let` statements. - # - # if mediaType == ApplicationJsonMediaType: - # try: - # - ok RestJson.decode(value, T, - # - requireAllFields = true, - # - allowUnknownFields = true) - # + let r = RestJson.decode(value, T, - # + requireAllFields = true, - # + allowUnknownFields = true) - # + ok r - # except SerializationError as exc: - # debug "Failed to deserialize REST JSON data", - # err = exc.formatMsg(""), - # - # At this time we can only speculate about the trigger of these issues. - # Until a shared pattern can be identified, it is better to apply - # workarounds that at least avoid the known to be reachable triggers. - # The solution is hacky and far from desirable; it is what it is. - let newBlockRef = ( await node.router.routeSignedBeaconBlock(signedBlock, blobsOpt) ).valueOr: