chore: simple PR to enhance postgres and retention policy logs (#2884)

This commit is contained in:
Ivan FB 2024-07-15 20:58:31 +02:00 committed by GitHub
parent 241fb8cb97
commit 71ee42de56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 53 additions and 8 deletions

View File

@ -87,7 +87,6 @@ proc handleMessage*(
return return
let msgHash = computeMessageHash(pubsubTopic, msg) let msgHash = computeMessageHash(pubsubTopic, msg)
let insertStartTime = getTime().toUnixFloat() let insertStartTime = getTime().toUnixFloat()
(await self.driver.put(msgHash, pubsubTopic, msg)).isOkOr: (await self.driver.put(msgHash, pubsubTopic, msg)).isOkOr:
@ -99,7 +98,7 @@ proc handleMessage*(
timestamp = msg.timestamp, timestamp = msg.timestamp,
error = error error = error
notice "message archived", trace "message archived",
hash_hash = msgHash.to0xHex(), hash_hash = msgHash.to0xHex(),
pubsubTopic = pubsubTopic, pubsubTopic = pubsubTopic,
contentTopic = msg.contentTopic, contentTopic = msg.contentTopic,
@ -186,11 +185,10 @@ proc findMessages*(
) )
proc periodicRetentionPolicy(self: WakuArchive) {.async.} = proc periodicRetentionPolicy(self: WakuArchive) {.async.} =
debug "executing message retention policy"
let policy = self.retentionPolicy.get() let policy = self.retentionPolicy.get()
while true: while true:
debug "executing message retention policy"
(await policy.execute(self.driver)).isOkOr: (await policy.execute(self.driver)).isOkOr:
waku_archive_errors.inc(labelValues = [retPolicyFailure]) waku_archive_errors.inc(labelValues = [retPolicyFailure])
error "failed execution of retention policy", error = error error "failed execution of retention policy", error = error

View File

@ -346,6 +346,7 @@ proc getPartitionsList(
## Retrieves the seq of partition table names. ## Retrieves the seq of partition table names.
## e.g: @["messages_1708534333_1708534393", "messages_1708534273_1708534333"] ## e.g: @["messages_1708534333_1708534393", "messages_1708534273_1708534333"]
debug "beginning getPartitionsList"
var partitions: seq[string] var partitions: seq[string]
proc rowCallback(pqResult: ptr PGresult) = proc rowCallback(pqResult: ptr PGresult) =
for iRow in 0 ..< pqResult.pqNtuples(): for iRow in 0 ..< pqResult.pqNtuples():
@ -404,6 +405,7 @@ proc getMessagesArbitraryQuery(
): Future[ArchiveDriverResult[seq[ArchiveRow]]] {.async.} = ): Future[ArchiveDriverResult[seq[ArchiveRow]]] {.async.} =
## This proc allows to handle atypical queries. We don't use prepared statements for those. ## This proc allows to handle atypical queries. We don't use prepared statements for those.
debug "beginning getMessagesArbitraryQuery"
var query = SelectClause var query = SelectClause
var statements: seq[string] var statements: seq[string]
var args: seq[string] var args: seq[string]
@ -484,7 +486,9 @@ proc getMessageHashesArbitraryQuery(
.} = .} =
## This proc allows to handle atypical queries. We don't use prepared statements for those. ## This proc allows to handle atypical queries. We don't use prepared statements for those.
debug "beginning of getMessagesV2ArbitraryQuery"
var query = """SELECT messageHash FROM messages""" var query = """SELECT messageHash FROM messages"""
var statements: seq[string] var statements: seq[string]
var args: seq[string] var args: seq[string]
@ -565,6 +569,7 @@ proc getMessagesPreparedStmt(
var rows: seq[(WakuMessageHash, PubsubTopic, WakuMessage)] var rows: seq[(WakuMessageHash, PubsubTopic, WakuMessage)]
debug "beginning of getMessagesPreparedStmt"
proc rowCallback(pqResult: ptr PGresult) = proc rowCallback(pqResult: ptr PGresult) =
rowCallbackImpl(pqResult, rows) rowCallbackImpl(pqResult, rows)
@ -653,6 +658,7 @@ proc getMessageHashesPreparedStmt(
var rows: seq[(WakuMessageHash, PubsubTopic, WakuMessage)] var rows: seq[(WakuMessageHash, PubsubTopic, WakuMessage)]
debug "beginning of getMessagesV2PreparedStmt"
proc rowCallback(pqResult: ptr PGresult) = proc rowCallback(pqResult: ptr PGresult) =
hashCallbackImpl(pqResult, rows) hashCallbackImpl(pqResult, rows)
@ -742,6 +748,8 @@ method getMessages*(
maxPageSize = DefaultPageSize, maxPageSize = DefaultPageSize,
ascendingOrder = true, ascendingOrder = true,
): Future[ArchiveDriverResult[seq[ArchiveRow]]] {.async.} = ): Future[ArchiveDriverResult[seq[ArchiveRow]]] {.async.} =
debug "beginning of getMessages"
let hexHashes = hashes.mapIt(toHex(it)) let hexHashes = hashes.mapIt(toHex(it))
if contentTopics.len > 0 and hexHashes.len > 0 and pubsubTopic.isSome() and if contentTopics.len > 0 and hexHashes.len > 0 and pubsubTopic.isSome() and
@ -787,6 +795,8 @@ proc getStr(
): Future[ArchiveDriverResult[string]] {.async.} = ): Future[ArchiveDriverResult[string]] {.async.} =
# Performs a query that is expected to return a single string # Performs a query that is expected to return a single string
debug "beginning of getStr"
var ret: string var ret: string
proc rowCallback(pqResult: ptr PGresult) = proc rowCallback(pqResult: ptr PGresult) =
if pqResult.pqnfields() != 1: if pqResult.pqnfields() != 1:
@ -809,6 +819,7 @@ proc getInt(
): Future[ArchiveDriverResult[int64]] {.async.} = ): Future[ArchiveDriverResult[int64]] {.async.} =
# Performs a query that is expected to return a single numeric value (int64) # Performs a query that is expected to return a single numeric value (int64)
debug "beginning of getInt"
var retInt = 0'i64 var retInt = 0'i64
let str = (await s.getStr(query)).valueOr: let str = (await s.getStr(query)).valueOr:
return err("could not get str in getInt: " & $error) return err("could not get str in getInt: " & $error)
@ -826,6 +837,8 @@ proc getInt(
method getDatabaseSize*( method getDatabaseSize*(
s: PostgresDriver s: PostgresDriver
): Future[ArchiveDriverResult[int64]] {.async.} = ): Future[ArchiveDriverResult[int64]] {.async.} =
debug "beginning of getDatabaseSize"
let intRes = (await s.getInt("SELECT pg_database_size(current_database())")).valueOr: let intRes = (await s.getInt("SELECT pg_database_size(current_database())")).valueOr:
return err("error in getDatabaseSize: " & error) return err("error in getDatabaseSize: " & error)
@ -835,6 +848,8 @@ method getDatabaseSize*(
method getMessagesCount*( method getMessagesCount*(
s: PostgresDriver s: PostgresDriver
): Future[ArchiveDriverResult[int64]] {.async.} = ): Future[ArchiveDriverResult[int64]] {.async.} =
debug "beginning of getMessagesCount"
let intRes = await s.getInt("SELECT COUNT(1) FROM messages") let intRes = await s.getInt("SELECT COUNT(1) FROM messages")
if intRes.isErr(): if intRes.isErr():
return err("error in getMessagesCount: " & intRes.error) return err("error in getMessagesCount: " & intRes.error)
@ -847,6 +862,8 @@ method getOldestMessageTimestamp*(
## In some cases it could happen that we have ## In some cases it could happen that we have
## empty partitions which are older than the current stored rows. ## empty partitions which are older than the current stored rows.
## In those cases we want to consider those older partitions as the oldest considered timestamp. ## In those cases we want to consider those older partitions as the oldest considered timestamp.
debug "beginning of getOldestMessageTimestamp"
let oldestPartition = s.partitionMngr.getOldestPartition().valueOr: let oldestPartition = s.partitionMngr.getOldestPartition().valueOr:
return err("could not get oldest partition: " & $error) return err("could not get oldest partition: " & $error)
@ -862,7 +879,9 @@ method getOldestMessageTimestamp*(
method getNewestMessageTimestamp*( method getNewestMessageTimestamp*(
s: PostgresDriver s: PostgresDriver
): Future[ArchiveDriverResult[Timestamp]] {.async.} = ): Future[ArchiveDriverResult[Timestamp]] {.async.} =
debug "beginning of getNewestMessageTimestamp"
let intRes = await s.getInt("SELECT MAX(timestamp) FROM messages") let intRes = await s.getInt("SELECT MAX(timestamp) FROM messages")
if intRes.isErr(): if intRes.isErr():
return err("error in getNewestMessageTimestamp: " & intRes.error) return err("error in getNewestMessageTimestamp: " & intRes.error)
@ -871,6 +890,8 @@ method getNewestMessageTimestamp*(
method deleteOldestMessagesNotWithinLimit*( method deleteOldestMessagesNotWithinLimit*(
s: PostgresDriver, limit: int s: PostgresDriver, limit: int
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
debug "beginning of deleteOldestMessagesNotWithinLimit"
let execRes = await s.writeConnPool.pgQuery( let execRes = await s.writeConnPool.pgQuery(
"""DELETE FROM messages WHERE messageHash NOT IN """DELETE FROM messages WHERE messageHash NOT IN
( (
@ -881,9 +902,12 @@ method deleteOldestMessagesNotWithinLimit*(
if execRes.isErr(): if execRes.isErr():
return err("error in deleteOldestMessagesNotWithinLimit: " & execRes.error) return err("error in deleteOldestMessagesNotWithinLimit: " & execRes.error)
debug "end of deleteOldestMessagesNotWithinLimit"
return ok() return ok()
method close*(s: PostgresDriver): Future[ArchiveDriverResult[void]] {.async.} = method close*(s: PostgresDriver): Future[ArchiveDriverResult[void]] {.async.} =
debug "beginning of postgres close"
## Cancel the partition factory loop ## Cancel the partition factory loop
s.futLoopPartitionFactory.cancelSoon() s.futLoopPartitionFactory.cancelSoon()
@ -929,6 +953,9 @@ proc acquireDatabaseLock*(
## approach is using the "performWriteQueryWithLock" proc. However, we can't use ## approach is using the "performWriteQueryWithLock" proc. However, we can't use
## "performWriteQueryWithLock" in the migrations process because we can't nest two PL/SQL ## "performWriteQueryWithLock" in the migrations process because we can't nest two PL/SQL
## scripts. ## scripts.
debug "beginning of acquireDatabaseLock", lockId
let locked = ( let locked = (
await s.getStr( await s.getStr(
fmt""" fmt"""
@ -947,6 +974,7 @@ proc releaseDatabaseLock*(
s: PostgresDriver, lockId: int = 841886 s: PostgresDriver, lockId: int = 841886
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
## Release an advisory lock (useful to avoid more than one application running migrations at the same time) ## Release an advisory lock (useful to avoid more than one application running migrations at the same time)
debug "beginning of releaseDatabaseLock", lockId
let unlocked = ( let unlocked = (
await s.getStr( await s.getStr(
fmt""" fmt"""
@ -976,9 +1004,7 @@ const COULD_NOT_ACQUIRE_ADVISORY_LOCK* = "could not acquire advisory lock"
proc performWriteQueryWithLock*( proc performWriteQueryWithLock*(
self: PostgresDriver, queryToProtect: string self: PostgresDriver, queryToProtect: string
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
## This wraps the original query in a script so that we make sure a pg_advisory lock protects it. ## This wraps the original query in a script so that we make sure a pg_advisory lock protects it
## The purpose of this proc is to protect write queries that might be performed simultaneously
## to the same database, from different store nodes.
debug "performWriteQueryWithLock", queryToProtect debug "performWriteQueryWithLock", queryToProtect
let query = let query =
fmt""" fmt"""
@ -1042,6 +1068,7 @@ proc addPartition(
## Creates a partition table that will store the messages that fall in the range ## Creates a partition table that will store the messages that fall in the range
## `startTime` <= timestamp < `startTime + duration`. ## `startTime` <= timestamp < `startTime + duration`.
## `startTime` is measured in seconds since epoch ## `startTime` is measured in seconds since epoch
debug "beginning of addPartition"
let beginning = startTime let beginning = startTime
let `end` = partitions_manager.calcEndPartitionTime(startTime) let `end` = partitions_manager.calcEndPartitionTime(startTime)
@ -1139,7 +1166,7 @@ proc loopPartitionFactory(
debug "starting loopPartitionFactory" debug "starting loopPartitionFactory"
while true: while true:
trace "Check if we need to create a new partition" trace "Check if a new partition is needed"
## Let's make the 'partition_manager' aware of the current partitions ## Let's make the 'partition_manager' aware of the current partitions
(await self.refreshPartitionsInfo()).isOkOr: (await self.refreshPartitionsInfo()).isOkOr:
@ -1184,6 +1211,7 @@ proc getTableSize*(
): Future[ArchiveDriverResult[string]] {.async.} = ): Future[ArchiveDriverResult[string]] {.async.} =
## Returns a human-readable representation of the size for the requested table. ## Returns a human-readable representation of the size for the requested table.
## tableName - table of interest. ## tableName - table of interest.
debug "beginning of getTableSize"
let tableSize = ( let tableSize = (
await self.getStr( await self.getStr(
@ -1200,6 +1228,8 @@ proc getTableSize*(
proc removePartition( proc removePartition(
self: PostgresDriver, partitionName: string self: PostgresDriver, partitionName: string
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
debug "beginning of removePartition", partitionName
var partSize = "" var partSize = ""
let partSizeRes = await self.getTableSize(partitionName) let partSizeRes = await self.getTableSize(partitionName)
if partSizeRes.isOk(): if partSizeRes.isOk():
@ -1229,6 +1259,7 @@ proc removePartitionsOlderThan(
## Removes old partitions that don't contain the specified timestamp ## Removes old partitions that don't contain the specified timestamp
let tsInSec = Timestamp(float(tsInNanoSec) / 1_000_000_000) let tsInSec = Timestamp(float(tsInNanoSec) / 1_000_000_000)
debug "beginning of removePartitionsOlderThan", tsInSec
var oldestPartition = self.partitionMngr.getOldestPartition().valueOr: var oldestPartition = self.partitionMngr.getOldestPartition().valueOr:
return err("could not get oldest partition in removePartitionOlderThan: " & $error) return err("could not get oldest partition in removePartitionOlderThan: " & $error)
@ -1249,6 +1280,7 @@ proc removeOldestPartition(
self: PostgresDriver, forceRemoval: bool = false, ## To allow cleanup in tests self: PostgresDriver, forceRemoval: bool = false, ## To allow cleanup in tests
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
## Indirectly called from the retention policy ## Indirectly called from the retention policy
debug "beginning of removeOldestPartition"
let oldestPartition = self.partitionMngr.getOldestPartition().valueOr: let oldestPartition = self.partitionMngr.getOldestPartition().valueOr:
return err("could not remove oldest partition: " & $error) return err("could not remove oldest partition: " & $error)
@ -1271,6 +1303,8 @@ proc containsAnyPartition*(self: PostgresDriver): bool =
method decreaseDatabaseSize*( method decreaseDatabaseSize*(
driver: PostgresDriver, targetSizeInBytes: int64, forceRemoval: bool = false driver: PostgresDriver, targetSizeInBytes: int64, forceRemoval: bool = false
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
debug "beginning of decreaseDatabaseSize"
var dbSize = (await driver.getDatabaseSize()).valueOr: var dbSize = (await driver.getDatabaseSize()).valueOr:
return err("decreaseDatabaseSize failed to get database size: " & $error) return err("decreaseDatabaseSize failed to get database size: " & $error)
@ -1337,6 +1371,8 @@ method existsTable*(
proc getCurrentVersion*( proc getCurrentVersion*(
s: PostgresDriver s: PostgresDriver
): Future[ArchiveDriverResult[int64]] {.async.} = ): Future[ArchiveDriverResult[int64]] {.async.} =
debug "beginning of getCurrentVersion"
let existsVersionTable = (await s.existsTable("version")).valueOr: let existsVersionTable = (await s.existsTable("version")).valueOr:
return err("error in getCurrentVersion-existsTable: " & $error) return err("error in getCurrentVersion-existsTable: " & $error)
@ -1353,6 +1389,8 @@ method deleteMessagesOlderThanTimestamp*(
): Future[ArchiveDriverResult[void]] {.async.} = ): Future[ArchiveDriverResult[void]] {.async.} =
## First of all, let's remove the older partitions so that we can reduce ## First of all, let's remove the older partitions so that we can reduce
## the database size. ## the database size.
debug "beginning of deleteMessagesOlderThanTimestamp"
(await s.removePartitionsOlderThan(tsNanoSec)).isOkOr: (await s.removePartitionsOlderThan(tsNanoSec)).isOkOr:
return err("error while removing older partitions: " & $error) return err("error while removing older partitions: " & $error)

View File

@ -53,6 +53,8 @@ proc new*(T: type CapacityRetentionPolicy, capacity = DefaultCapacity): T =
method execute*( method execute*(
p: CapacityRetentionPolicy, driver: ArchiveDriver p: CapacityRetentionPolicy, driver: ArchiveDriver
): Future[RetentionPolicyResult[void]] {.async.} = ): Future[RetentionPolicyResult[void]] {.async.} =
debug "beginning executing message retention policy - capacity"
let numMessages = (await driver.getMessagesCount()).valueOr: let numMessages = (await driver.getMessagesCount()).valueOr:
return err("failed to get messages count: " & error) return err("failed to get messages count: " & error)
@ -62,4 +64,6 @@ method execute*(
(await driver.deleteOldestMessagesNotWithinLimit(limit = p.capacity + p.deleteWindow)).isOkOr: (await driver.deleteOldestMessagesNotWithinLimit(limit = p.capacity + p.deleteWindow)).isOkOr:
return err("deleting oldest messages failed: " & error) return err("deleting oldest messages failed: " & error)
debug "end executing message retention policy - capacity"
return ok() return ok()

View File

@ -18,7 +18,10 @@ proc new*(T: type SizeRetentionPolicy, size = DefaultRetentionSize): T =
method execute*( method execute*(
p: SizeRetentionPolicy, driver: ArchiveDriver p: SizeRetentionPolicy, driver: ArchiveDriver
): Future[RetentionPolicyResult[void]] {.async.} = ): Future[RetentionPolicyResult[void]] {.async.} =
debug "beginning of executing message retention policy - size"
(await driver.decreaseDatabaseSize(p.sizeLimit)).isOkOr: (await driver.decreaseDatabaseSize(p.sizeLimit)).isOkOr:
return err("decreaseDatabaseSize failed: " & $error) return err("decreaseDatabaseSize failed: " & $error)
debug "end of executing message retention policy - size"
return ok() return ok()

View File

@ -18,6 +18,7 @@ method execute*(
p: TimeRetentionPolicy, driver: ArchiveDriver p: TimeRetentionPolicy, driver: ArchiveDriver
): Future[RetentionPolicyResult[void]] {.async.} = ): Future[RetentionPolicyResult[void]] {.async.} =
## Delete messages that exceed the retention time by 10% and more (batch delete for efficiency) ## Delete messages that exceed the retention time by 10% and more (batch delete for efficiency)
debug "beginning of executing message retention policy - time"
let omtRes = await driver.getOldestMessageTimestamp() let omtRes = await driver.getOldestMessageTimestamp()
if omtRes.isErr(): if omtRes.isErr():
@ -34,4 +35,5 @@ method execute*(
if res.isErr(): if res.isErr():
return err("failed to delete oldest messages: " & res.error) return err("failed to delete oldest messages: " & res.error)
debug "end of executing message retention policy - time"
return ok() return ok()