diff --git a/waku/waku_archive_legacy/driver/postgres_driver/postgres_driver.nim b/waku/waku_archive_legacy/driver/postgres_driver/postgres_driver.nim index c2f83e36a..4f3532622 100644 --- a/waku/waku_archive_legacy/driver/postgres_driver/postgres_driver.nim +++ b/waku/waku_archive_legacy/driver/postgres_driver/postgres_driver.nim @@ -15,6 +15,7 @@ import ../../../waku_core, ../../common, ../../driver, + ./postgres_healthcheck, ../../../common/databases/db_postgres as waku_postgres type PostgresDriver* = ref object of ArchiveDriver @@ -132,6 +133,12 @@ proc new*( let writeConnPool = PgAsyncPool.new(dbUrl, maxNumConnOnEachPool).valueOr: return err("error creating write conn pool PgAsyncPool") + if not isNil(onFatalErrorAction): + asyncSpawn checkConnectivity(readConnPool, onFatalErrorAction) + + if not isNil(onFatalErrorAction): + asyncSpawn checkConnectivity(writeConnPool, onFatalErrorAction) + let driver = PostgresDriver(writeConnPool: writeConnPool, readConnPool: readConnPool) return ok(driver) diff --git a/waku/waku_archive_legacy/driver/postgres_driver/postgres_healthcheck.nim b/waku/waku_archive_legacy/driver/postgres_driver/postgres_healthcheck.nim new file mode 100644 index 000000000..4c9f170c9 --- /dev/null +++ b/waku/waku_archive_legacy/driver/postgres_driver/postgres_healthcheck.nim @@ -0,0 +1,38 @@ +{.push raises: [].} + +import chronos, chronicles, results +import ../../../common/databases/db_postgres, ../../../common/error_handling + +## Simple query to validate that the postgres is working and attending requests +const HealthCheckQuery = "SELECT version();" +const CheckConnectivityInterval = 60.seconds +const MaxNumTrials = 20 +const TrialInterval = 1.seconds + +proc checkConnectivity*( + connPool: PgAsyncPool, onFatalErrorAction: OnFatalErrorHandler +) {.async.} = + while true: + (await connPool.pgQuery(HealthCheckQuery)).isOkOr: + ## The connection failed once. Let's try reconnecting for a while. + ## Notice that the 'pgQuery' proc tries to establish a new connection. + + block errorBlock: + ## Force close all the opened connections. No need to close gracefully. + (await connPool.resetConnPool()).isOkOr: + onFatalErrorAction("checkConnectivity legacy resetConnPool error: " & error) + + var numTrial = 0 + while numTrial < MaxNumTrials: + let res = await connPool.pgQuery(HealthCheckQuery) + if res.isOk(): + ## Connection resumed. Let's go back to the normal healthcheck. + break errorBlock + + await sleepAsync(TrialInterval) + numTrial.inc() + + ## The connection couldn't be resumed. Let's inform the upper layers. + onFatalErrorAction("postgres legacy health check error: " & error) + + await sleepAsync(CheckConnectivityInterval)