mirror of
https://github.com/waku-org/nwaku.git
synced 2025-01-28 07:35:56 +00:00
Adding healthcheck and reconnection mechanism to the postgres archive driver (#1997)
It starts an asynchronous infinite task that checks the connectivity with the database. In case of error, the postgres_healthcheck task tries to reconnect for a while, and if it determines that the connection cannot be resumed, then it invokes a callback indicating that situation. For the case of the `wakunode2` app, this callback quits the application itself and adds a log trace indicating the connectivity issue with the database.
This commit is contained in:
parent
5638bd06bb
commit
1fb13b0967
@ -410,10 +410,17 @@ proc setupProtocols(node: WakuNode,
|
|||||||
return err("failed to mount waku RLN relay protocol: " & getCurrentExceptionMsg())
|
return err("failed to mount waku RLN relay protocol: " & getCurrentExceptionMsg())
|
||||||
|
|
||||||
if conf.store:
|
if conf.store:
|
||||||
|
var onErrAction = proc(msg: string) {.gcsafe, closure.} =
|
||||||
|
## Action to be taken when an internal error occurs during the node run.
|
||||||
|
## e.g. the connection with the database is lost and not recovered.
|
||||||
|
error "Unrecoverable error occurred", error = msg
|
||||||
|
quit(QuitFailure)
|
||||||
|
|
||||||
# Archive setup
|
# Archive setup
|
||||||
let archiveDriverRes = ArchiveDriver.new(conf.storeMessageDbUrl,
|
let archiveDriverRes = ArchiveDriver.new(conf.storeMessageDbUrl,
|
||||||
conf.storeMessageDbVacuum,
|
conf.storeMessageDbVacuum,
|
||||||
conf.storeMessageDbMigration)
|
conf.storeMessageDbMigration,
|
||||||
|
onErrAction)
|
||||||
if archiveDriverRes.isErr():
|
if archiveDriverRes.isErr():
|
||||||
return err("failed to setup archive driver: " & archiveDriverRes.error)
|
return err("failed to setup archive driver: " & archiveDriverRes.error)
|
||||||
|
|
||||||
|
@ -136,6 +136,20 @@ proc getConnIndex(pool: PgAsyncPool):
|
|||||||
pool.conns[index].busy = true
|
pool.conns[index].busy = true
|
||||||
return ok(index)
|
return ok(index)
|
||||||
|
|
||||||
|
proc resetConnPool*(pool: PgAsyncPool): Future[DatabaseResult[void]] {.async.} =
  ## Forcibly closes the connection pool and leaves it ready to be used again.
  ##
  ## Intended for the situation where the database side interrupted the
  ## connection, or some other connectivity problem happened.
  ##
  ## Returns an error, prefixed with "error in resetConnPool: ", when closing
  ## the pool fails; in that case the pool state is left untouched.

  # Flag every connection as free before closing.
  # NOTE(review): presumably this keeps `close` from waiting on connections
  # still marked as in use - confirm against `close`.
  for conn in pool.conns.mitems:
    conn.busy = false

  let closeRes = await pool.close()
  if closeRes.isErr():
    return err("error in resetConnPool: " & closeRes.error)

  # Mark the pool as live again so it can be used after the reset.
  pool.state = PgAsyncPoolState.Live
  return ok()
|
||||||
|
|
||||||
proc releaseConn(pool: PgAsyncPool, conn: DbConn) =
|
proc releaseConn(pool: PgAsyncPool, conn: DbConn) =
|
||||||
## Marks the connection as released.
|
## Marks the connection as released.
|
||||||
for i in 0..<pool.conns.len:
|
for i in 0..<pool.conns.len:
|
||||||
|
@ -16,6 +16,7 @@ const DefaultPageSize*: uint = 25
|
|||||||
type
|
type
|
||||||
ArchiveDriverResult*[T] = Result[T, string]
|
ArchiveDriverResult*[T] = Result[T, string]
|
||||||
ArchiveDriver* = ref object of RootObj
|
ArchiveDriver* = ref object of RootObj
|
||||||
|
OnErrHandler* = proc(errMsg: string) {.gcsafe, closure.}
|
||||||
|
|
||||||
type ArchiveRow* = (PubsubTopic, WakuMessage, seq[byte], Timestamp)
|
type ArchiveRow* = (PubsubTopic, WakuMessage, seq[byte], Timestamp)
|
||||||
|
|
||||||
|
@ -25,8 +25,13 @@ export
|
|||||||
proc new*(T: type ArchiveDriver,
|
proc new*(T: type ArchiveDriver,
|
||||||
url: string,
|
url: string,
|
||||||
vacuum: bool,
|
vacuum: bool,
|
||||||
migrate: bool):
|
migrate: bool,
|
||||||
|
onErrAction: OnErrHandler):
|
||||||
Result[T, string] =
|
Result[T, string] =
|
||||||
|
## url - string that defines the database
|
||||||
|
## vacuum - if true, a cleanup operation will be applied to the database
|
||||||
|
## migrate - if true, the database schema will be updated
|
||||||
|
## onErrAction - called if, e.g., the connection with db got lost forever
|
||||||
|
|
||||||
let dbUrlValidationRes = dburl.validateDbUrl(url)
|
let dbUrlValidationRes = dburl.validateDbUrl(url)
|
||||||
if dbUrlValidationRes.isErr():
|
if dbUrlValidationRes.isErr():
|
||||||
@ -74,7 +79,7 @@ proc new*(T: type ArchiveDriver,
|
|||||||
|
|
||||||
of "postgres":
|
of "postgres":
|
||||||
const MaxNumConns = 5 #TODO: we may need to set that from app args (maybe?)
|
const MaxNumConns = 5 #TODO: we may need to set that from app args (maybe?)
|
||||||
let res = PostgresDriver.new(url, MaxNumConns)
|
let res = PostgresDriver.new(url, MaxNumConns, onErrAction)
|
||||||
if res.isErr():
|
if res.isErr():
|
||||||
return err("failed to init postgres archive driver: " & res.error)
|
return err("failed to init postgres archive driver: " & res.error)
|
||||||
|
|
||||||
|
@ -12,7 +12,8 @@ import
|
|||||||
../../../waku_core,
|
../../../waku_core,
|
||||||
../../common,
|
../../common,
|
||||||
../../driver,
|
../../driver,
|
||||||
../../../common/databases/db_postgres as waku_postgres
|
../../../common/databases/db_postgres as waku_postgres,
|
||||||
|
./postgres_healthcheck
|
||||||
|
|
||||||
export postgres_driver
|
export postgres_driver
|
||||||
|
|
||||||
@ -43,14 +44,20 @@ const DefaultMaxConnections = 5
|
|||||||
|
|
||||||
proc new*(T: type PostgresDriver,
|
proc new*(T: type PostgresDriver,
|
||||||
dbUrl: string,
|
dbUrl: string,
|
||||||
maxConnections: int = DefaultMaxConnections):
|
maxConnections: int = DefaultMaxConnections,
|
||||||
|
onErrAction: OnErrHandler = nil):
|
||||||
ArchiveDriverResult[T] =
|
ArchiveDriverResult[T] =
|
||||||
|
|
||||||
let connPoolRes = PgAsyncPool.new(dbUrl, maxConnections)
|
let connPoolRes = PgAsyncPool.new(dbUrl, maxConnections)
|
||||||
if connPoolRes.isErr():
|
if connPoolRes.isErr():
|
||||||
return err("error creating PgAsyncPool: " & connPoolRes.error)
|
return err("error creating PgAsyncPool: " & connPoolRes.error)
|
||||||
|
|
||||||
return ok(PostgresDriver(connPool: connPoolRes.get()))
|
let connPool = connPoolRes.get()
|
||||||
|
|
||||||
|
if not isNil(onErrAction):
|
||||||
|
asyncSpawn checkConnectivity(connPool, onErrAction)
|
||||||
|
|
||||||
|
return ok(PostgresDriver(connPool: connPool))
|
||||||
|
|
||||||
proc createMessageTable*(s: PostgresDriver):
|
proc createMessageTable*(s: PostgresDriver):
|
||||||
Future[ArchiveDriverResult[void]] {.async.} =
|
Future[ArchiveDriverResult[void]] {.async.} =
|
||||||
|
@ -0,0 +1,47 @@
|
|||||||
|
when (NimMajor, NimMinor) < (1, 4):
|
||||||
|
{.push raises: [Defect].}
|
||||||
|
else:
|
||||||
|
{.push raises: [].}
|
||||||
|
|
||||||
|
import
|
||||||
|
chronos,
|
||||||
|
stew/results
|
||||||
|
import
|
||||||
|
../../driver,
|
||||||
|
../../../common/databases/db_postgres
|
||||||
|
|
||||||
|
const
  # Simple query used to validate that postgres is up and attending requests.
  HealthCheckQuery = "SELECT version();"

  # Pause between two consecutive iterations of the health check loop.
  CheckConnectivityInterval = 30.seconds

  # Upper bound on the number of retries performed once a check has failed.
  MaxNumTrials = 20

  # Pause between two consecutive retries.
  TrialInterval = 1.seconds
|
||||||
|
|
||||||
|
proc checkConnectivity*(connPool: PgAsyncPool,
                        onErrAction: OnErrHandler) {.async.} =
  ## Never-ending task that periodically runs `HealthCheckQuery` against the
  ## given connection pool.
  ##
  ## When a check fails, the pool is reset and the query is retried up to
  ## `MaxNumTrials` times, sleeping `TrialInterval` after each failed retry.
  ## If none of the retries succeeds, `onErrAction` is invoked with a message
  ## carrying the error of the check that triggered the recovery attempt.
  ##
  ## connPool    - pool on which the health check query is executed
  ## onErrAction - callback informed about failures that were not recovered

  while true:
    let probe = await connPool.exec(HealthCheckQuery)

    if probe.isErr():
      # The connection failed once. Try reconnecting for a while.
      # Force close all the opened connections; no need to close gracefully.
      let resetRes = await connPool.resetConnPool()
      if resetRes.isErr():
        # A failed reset is reported, but the retries below still take place.
        onErrAction("checkConnectivity resetConnPool error: " & resetRes.error)

      var recovered = false
      for _ in 0 ..< MaxNumTrials:
        if (await connPool.exec(HealthCheckQuery)).isOk():
          # Connection resumed. Go back to the normal health check.
          recovered = true
          break

        await sleepAsync(TrialInterval)

      if not recovered:
        # The connection couldn't be resumed. Inform the upper layers.
        onErrAction("postgres health check error: " & probe.error)

    await sleepAsync(CheckConnectivityInterval)
|
Loading…
x
Reference in New Issue
Block a user