diff --git a/waku/common/databases/db_postgres/dbconn.nim b/waku/common/databases/db_postgres/dbconn.nim
index 287ed4e8d..e711a5ca2 100644
--- a/waku/common/databases/db_postgres/dbconn.nim
+++ b/waku/common/databases/db_postgres/dbconn.nim
@@ -235,6 +235,13 @@ proc isSecureString(input: string): bool =
 
   return true
 
+proc convertQueryToMetricLabel(query: string): string =
+  ## Returns the label of the first QueriesToMetricMap snippet contained in `query`, or "unknown_query_metric".
+  for (snippetQuery, metric) in QueriesToMetricMap:
+    if query.contains(snippetQuery):
+      return metric
+  return "unknown_query_metric"
+
 proc dbConnQuery*(
     dbConnWrapper: DbConnWrapper,
     query: SqlQuery,
@@ -247,11 +254,7 @@ proc dbConnQuery*(
 
   dbConnWrapper.futBecomeFree = newFuture[void]("dbConnQuery")
 
-  let cleanedQuery = ($query).replace(" ", "").replace("\n", "")
-  ## remove everything between ' or " all possible sequence of numbers. e.g. rm partition partition
-  var querySummary = cleanedQuery.replace(re2("""(['"]).*?\\1"""), "")
-  querySummary = querySummary.replace(re2"\d+", "")
-  querySummary = "query_tag_" & querySummary[0 ..< min(querySummary.len, 128)]
+  let metricLabel = convertQueryToMetricLabel($query)
 
   var queryStartTime = getTime().toUnixFloat()
 
@@ -262,7 +265,7 @@ proc dbConnQuery*(
     return err("error in dbConnQuery calling sendQuery: " & $error)
 
   let sendDuration = getTime().toUnixFloat() - queryStartTime
-  query_time_secs.set(sendDuration, [querySummary, "sendToDBQuery"])
+  query_time_secs.set(sendDuration, [metricLabel, "sendToDBQuery"])
 
   queryStartTime = getTime().toUnixFloat()
 
@@ -270,16 +273,16 @@ proc dbConnQuery*(
     return err("error in dbConnQuery calling waitQueryToFinish: " & $error)
 
   let waitDuration = getTime().toUnixFloat() - queryStartTime
-  query_time_secs.set(waitDuration, [querySummary, "waitFinish"])
+  query_time_secs.set(waitDuration, [metricLabel, "waitFinish"])
 
-  query_count.inc(labelValues = [querySummary])
+  query_count.inc(labelValues = [metricLabel])
 
   if "insert" notin ($query).toLower():
     debug "dbConnQuery",
       requestId,
       query = $query,
       args,
-      querySummary,
+      metricLabel,
       waitDbQueryDurationSecs = waitDuration,
       sendToDBDurationSecs = sendDuration
 
@@ -302,9 +305,8 @@ proc dbConnQueryPrepared*(
     error "error in dbConnQueryPrepared", error = $error
     return err("error in dbConnQueryPrepared calling sendQuery: " & $error)
 
-  let stmtNameSummary = stmtName[0 ..< min(stmtName.len, 128)]
   let sendDuration = getTime().toUnixFloat() - queryStartTime
-  query_time_secs.set(sendDuration, [stmtNameSummary, "sendToDBQuery"])
+  query_time_secs.set(sendDuration, [stmtName, "sendToDBQuery"])
 
   queryStartTime = getTime().toUnixFloat()
 
@@ -312,9 +314,9 @@ proc dbConnQueryPrepared*(
     return err("error in dbConnQueryPrepared calling waitQueryToFinish: " & $error)
 
   let waitDuration = getTime().toUnixFloat() - queryStartTime
-  query_time_secs.set(waitDuration, [stmtNameSummary, "waitFinish"])
+  query_time_secs.set(waitDuration, [stmtName, "waitFinish"])
 
-  query_count.inc(labelValues = [stmtNameSummary])
+  query_count.inc(labelValues = [stmtName])
 
   if "insert" notin stmtName.toLower():
     debug "dbConnQueryPrepared",
diff --git a/waku/common/databases/db_postgres/query_metrics.nim b/waku/common/databases/db_postgres/query_metrics.nim
index 06209cac0..124d7cdf4 100644
--- a/waku/common/databases/db_postgres/query_metrics.nim
+++ b/waku/common/databases/db_postgres/query_metrics.nim
@@ -5,3 +5,27 @@ declarePublicGauge query_time_secs,
 
 declarePublicCounter query_count,
   "number of times a query is being performed", labels = ["query"]
+
+## Ordered (query snippet, metric label) pairs; an array of tuples, so entry order decides which snippet matches first.
+const QueriesToMetricMap* = {
+  "contentTopic IN": "content_topic",
+  "SELECT version()": "select_version",
+  "WITH min_timestamp": "messages_lookup",
+  "SELECT messageHash FROM messages WHERE pubsubTopic = ? AND timestamp >= ? AND timestamp <= ? ORDER BY timestamp DESC, messageHash DESC LIMIT ?":
+    "msg_hash_no_ctopic",
+  "AS partition_name": "get_partitions_list",
+  "SELECT COUNT(1) FROM messages": "count_msgs",
+  "SELECT messageHash FROM messages WHERE (timestamp, messageHash) < (?,?) AND pubsubTopic = ? AND timestamp >= ? AND timestamp <= ? ORDER BY timestamp DESC, messageHash DESC LIMIT ?":
+    "msg_hash_with_cursor",
+  "SELECT pg_database_size(current_database())": "get_database_size",
+  "DELETE FROM messages_lookup WHERE timestamp": "delete_from_msgs_lookup",
+  "DROP TABLE messages_": "drop_partition_table",
+  "ALTER TABLE messages DETACH PARTITION": "detach_partition",
+  "SELECT pg_size_pretty(pg_total_relation_size(C.oid))": "get_partition_size",
+  "pg_try_advisory_lock": "try_advisory_lock",
+  "SELECT messageHash FROM messages ORDER BY timestamp DESC, messageHash DESC LIMIT ?":
+    "get_all_msg_hash",
+  "SELECT pg_advisory_unlock": "advisory_unlock",
+  "ANALYZE messages": "analyze_messages",
+  "SELECT EXISTS": "check_version_table_exists",
+}
diff --git a/waku/waku_store/client.nim b/waku/waku_store/client.nim
index 61229576a..082120823 100644
--- a/waku/waku_store/client.nim
+++ b/waku/waku_store/client.nim
@@ -39,11 +39,11 @@ proc sendStoreRequest(
     return err(StoreError(kind: ErrorCode.BAD_RESPONSE, cause: error.msg))
 
   let res = StoreQueryResponse.decode(buf).valueOr:
-    waku_store_errors.inc(labelValues = [decodeRpcFailure])
-    return err(StoreError(kind: ErrorCode.BAD_RESPONSE, cause: decodeRpcFailure))
+    waku_store_errors.inc(labelValues = [DecodeRpcFailure])
+    return err(StoreError(kind: ErrorCode.BAD_RESPONSE, cause: DecodeRpcFailure))
 
   if res.statusCode != uint32(StatusCode.SUCCESS):
-    waku_store_errors.inc(labelValues = [res.statusDesc])
+    waku_store_errors.inc(labelValues = [NoSuccessStatusCode])
     return err(StoreError.new(res.statusCode, res.statusDesc))
 
   return ok(res)
@@ -55,7 +55,7 @@ proc query*(
     return err(StoreError(kind: ErrorCode.BAD_REQUEST, cause: "invalid cursor"))
 
   let connection = (await self.peerManager.dialPeer(peer, WakuStoreCodec)).valueOr:
-    waku_store_errors.inc(labelValues = [dialFailure])
+    waku_store_errors.inc(labelValues = [DialFailure])
 
     return err(StoreError(kind: ErrorCode.PEER_DIAL_FAILURE, address: $peer))
 
@@ -74,7 +74,7 @@ proc queryToAny*(
     return err(StoreError(kind: BAD_RESPONSE, cause: "no service store peer connected"))
 
   let connection = (await self.peerManager.dialPeer(peer, WakuStoreCodec)).valueOr:
-    waku_store_errors.inc(labelValues = [dialFailure])
+    waku_store_errors.inc(labelValues = [DialFailure])
 
     return err(StoreError(kind: ErrorCode.PEER_DIAL_FAILURE, address: $peer))
 
diff --git a/waku/waku_store/protocol.nim b/waku/waku_store/protocol.nim
index 5f986983e..aa22fe5cd 100644
--- a/waku/waku_store/protocol.nim
+++ b/waku/waku_store/protocol.nim
@@ -45,7 +45,7 @@ proc handleQueryRequest(
 
   let req = StoreQueryRequest.decode(raw_request).valueOr:
     error "failed to decode rpc", peerId = requestor, error = $error
-    waku_store_errors.inc(labelValues = [decodeRpcFailure])
+    waku_store_errors.inc(labelValues = [DecodeRpcFailure])
 
     res.statusCode = uint32(ErrorCode.BAD_REQUEST)
     res.statusDesc = "decoding rpc failed: " & $error
diff --git a/waku/waku_store/protocol_metrics.nim b/waku/waku_store/protocol_metrics.nim
index b077147a6..5d9e69420 100644
--- a/waku/waku_store/protocol_metrics.nim
+++ b/waku/waku_store/protocol_metrics.nim
@@ -12,8 +12,9 @@ declarePublicGauge waku_store_time_seconds,
 
 # Error types (metric label values)
 const
-  dialFailure* = "dial_failure"
-  decodeRpcFailure* = "decode_rpc_failure"
-  peerNotFoundFailure* = "peer_not_found_failure"
-  emptyRpcQueryFailure* = "empty_rpc_query_failure"
-  emptyRpcResponseFailure* = "empty_rpc_response_failure"
+  DialFailure* = "dial_failure"
+  DecodeRpcFailure* = "decode_rpc_failure"
+  PeerNotFoundFailure* = "peer_not_found_failure"
+  EmptyRpcQueryFailure* = "empty_rpc_query_failure"
+  EmptyRpcResponseFailure* = "empty_rpc_response_failure"
+  NoSuccessStatusCode* = "status_code_no_success"
diff --git a/waku/waku_store_legacy/protocol_metrics.nim b/waku/waku_store_legacy/protocol_metrics.nim
index 53cc71427..c293f09ca 100644
--- a/waku/waku_store_legacy/protocol_metrics.nim
+++ b/waku/waku_store_legacy/protocol_metrics.nim
@@ -13,8 +13,8 @@ declarePublicGauge waku_legacy_store_time_seconds,
 
 # Error types (metric label values)
 const
-  dialFailure* = "dial_failure"
-  decodeRpcFailure* = "decode_rpc_failure"
-  peerNotFoundFailure* = "peer_not_found_failure"
-  emptyRpcQueryFailure* = "empty_rpc_query_failure"
-  emptyRpcResponseFailure* = "empty_rpc_response_failure"
+  dialFailure* = "dial_failure_legacy"
+  decodeRpcFailure* = "decode_rpc_failure_legacy"
+  peerNotFoundFailure* = "peer_not_found_failure_legacy"
+  emptyRpcQueryFailure* = "empty_rpc_query_failure_legacy"
+  emptyRpcResponseFailure* = "empty_rpc_response_failure_legacy"