nim-sds/sds/sds_utils.nim

import std/[times, tables, sequtils, sets, hashes]
import chronos, chronicles, results
import ./rolling_bloom_filter
import
  ./types/[
    sds_message_id, history_entry, sds_message, unacknowledged_message,
    incoming_message, reliability_error, callbacks, app_callbacks, reliability_config,
    repair_entry, channel_context, reliability_manager,
  ]
export
  sds_message_id, history_entry, sds_message, unacknowledged_message, incoming_message,
  reliability_error, callbacks, app_callbacks, reliability_config, repair_entry,
  channel_context, reliability_manager

proc defaultConfig*(): ReliabilityConfig =
  ## Returns the configuration produced by `ReliabilityConfig.init()`.
  ReliabilityConfig.init()

proc reliabilityErr*(detail: string): ReliabilityError {.gcsafe, raises: [].} =
  ## Converts a backend-supplied persistence failure description into the
  ## `rePersistenceError` enum value.
  ##
  ## The enum has no payload, so `detail` is emitted here as a warning
  ## and only the enum value is handed back for the caller to propagate
  ## through its `Result`.
  ##
  ## Within this module it is used by the operations that still report
  ## persistence failures to their caller (`dropChannelFromPersistence`
  ## and `getOrCreateChannel`); the best-effort helpers log on their own
  ## and never call it.
  result = ReliabilityError.rePersistenceError
  warn "persistence operation failed", detail = detail

proc snapshotMeta*(channel: ChannelContext): ChannelMeta {.gcsafe, raises: [].} =
  ## Builds a `ChannelMeta` snapshot from the in-memory `channel`.
  ##
  ## Copied: the lamport timestamp, the outgoing buffer, the values of
  ## the incoming buffer, and both repair buffers (each table entry
  ## becomes a `*RepairKV` key/value record). The table-shaped buffers
  ## are flattened into sequences in table iteration order.
  ##
  ## Not copied: the bloom filter and the message history. Bootstrap
  ## (`getOrCreateChannel`) rebuilds the former from the latter, and the
  ## history is flushed through its own pending queue.
  var meta = ChannelMeta.init()
  meta.lamportTimestamp = channel.lamportTimestamp
  meta.outgoingBuffer.add(channel.outgoingBuffer)
  for buffered in channel.incomingBuffer.values:
    meta.incomingBuffer.add(buffered)
  for msgId, repair in channel.outgoingRepairBuffer:
    meta.outgoingRepairBuffer.add(
      OutgoingRepairKV(messageId: msgId, entry: repair)
    )
  for msgId, repair in channel.incomingRepairBuffer:
    meta.incomingRepairBuffer.add(
      IncomingRepairKV(messageId: msgId, entry: repair)
    )
  meta

proc trySaveMeta*(
    rm: ReliabilityManager, channelId: SdsChannelID, channel: ChannelContext
) {.async: (raises: []).} =
  ## Best-effort save of the channel's meta snapshot.
  ##
  ## Takes a `snapshotMeta` of `channel` and hands it to
  ## `rm.persistence.saveChannelMeta`. A failed save is logged as a
  ## warning and otherwise ignored: nothing is returned, so callers have
  ## no result to inspect and the surrounding protocol op is never
  ## aborted by a persistence failure.
  let meta = snapshotMeta(channel)
  let saved = await rm.persistence.saveChannelMeta(channelId, meta)
  if saved.isOk:
    return
  warn "snapshot save failed; in-memory state authoritative, next op will retry",
    channelId = channelId, detail = saved.error

proc queueHistoryAppend*(channel: ChannelContext, msgId: SdsMessageID) =
  ## Records `msgId` as a pending history append.
  ##
  ## Only the id is queued; `tryUpdateHistory` resolves it against
  ## `messageHistory` when the queue is flushed.
  ##
  ## The latest operation wins: a pending evict for the same id is
  ## dropped before the id enters the append set, so an evict followed
  ## by a re-add ends up queued as an append only.
  excl(channel.pendingHistoryEvicts, msgId)
  incl(channel.pendingHistoryAppends, msgId)

proc queueHistoryEvict*(channel: ChannelContext, msgId: SdsMessageID) =
  ## Records `msgId` as a pending history evict.
  ##
  ## Mirror image of `queueHistoryAppend`: a pending append for the same
  ## id is dropped first (an evicted message no longer needs to be
  ## written as an addition), then the id enters the evict set.
  excl(channel.pendingHistoryAppends, msgId)
  incl(channel.pendingHistoryEvicts, msgId)

proc tryUpdateHistory*(
    rm: ReliabilityManager, channelId: SdsChannelID
) {.async: (raises: []).} =
  ## Flushes the pending history queue of `channelId` to the persistence
  ## backend in a single `updateHistory` call.
  ##
  ## The queue (`pendingHistoryAppends` / `pendingHistoryEvicts`) serves
  ## two purposes at once:
  ##   * **Batching.** `addToHistory` only queues its mutations; however
  ##     many times it ran during one protocol op, the op issues one
  ##     flush here and therefore one backend round-trip.
  ##   * **Retry.** The queue is cleared only after a successful flush.
  ##     After a failure the entries stay queued, later mutations merge
  ##     into them, and the next flush retries the combined batch.
  ##
  ## Behaviour:
  ##   * Unknown channel, or nothing queued: returns without calling the
  ##     backend.
  ##   * Queued append ids are resolved against `messageHistory`; an id
  ##     that is missing there is logged and left out of the batch.
  ##   * A failed flush is logged and never reported to the caller. A
  ##     second warning is logged when the number of pending appends
  ##     exceeds `rm.config.maxMessageHistory`.
  ##
  ## Callers are expected to invoke this at the end of every protocol op,
  ## including ops that queued nothing, so that an earlier failed batch
  ## gets retried.
  var channel: ChannelContext
  try:
    if channelId in rm.channels:
      channel = rm.channels[channelId]
    else:
      return
  except KeyError:
    return # membership was checked first; kept because `[]` may raise

  let nothingPending =
    channel.pendingHistoryAppends.len == 0 and
    channel.pendingHistoryEvicts.len == 0
  if nothingPending:
    return # skip the backend round-trip entirely

  var update = HistoryUpdate.init()
  # `messageHistory` is the source of truth for the message bodies. Every
  # queued id is expected to be present; a missing one is reported and
  # skipped rather than treated as fatal.
  for queuedId in channel.pendingHistoryAppends:
    try:
      if queuedId notin channel.messageHistory:
        warn "queued append id missing from messageHistory; invariant violated, skipping",
          channelId = channelId, msgId = queuedId
        continue
      update.append.add(channel.messageHistory[queuedId])
    except KeyError:
      discard # cannot happen: membership was just confirmed
  for queuedId in channel.pendingHistoryEvicts:
    update.evict.add(queuedId)

  let flushed = await rm.persistence.updateHistory(channelId, update)
  if flushed.isErr:
    # Leave both sets untouched so the next flush retries this batch.
    warn "history update failed; queued for retry on next op",
      channelId = channelId,
      pendingAppends = channel.pendingHistoryAppends.len,
      pendingEvicts = channel.pendingHistoryEvicts.len,
      detail = flushed.error
    if channel.pendingHistoryAppends.len > rm.config.maxMessageHistory:
      warn "pending history queue exceeds maxMessageHistory; backend may be stuck",
        channelId = channelId,
        pendingAppends = channel.pendingHistoryAppends.len
    return

  channel.pendingHistoryAppends.clear()
  channel.pendingHistoryEvicts.clear()

proc dropChannelFromPersistence*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[Result[void, ReliabilityError]] {.async: (raises: []).} =
  ## Asks the persistence backend to drop everything stored for
  ## `channelId`, using one `rm.persistence.dropChannel` call.
  ##
  ## Unlike the best-effort save helpers, a backend failure IS reported:
  ## the detail is logged by `reliabilityErr` and the call returns
  ## `err(rePersistenceError)`, because the caller asked for the wipe to
  ## be confirmed. Returns `ok()` when the backend reports success.
  ##
  ## In-memory state is not touched here; `removeChannel` clears it after
  ## this call succeeds.
  let dropped = await rm.persistence.dropChannel(channelId)
  if dropped.isErr:
    return err(reliabilityErr(dropped.error))
  return ok()

proc cleanup*(rm: ReliabilityManager) {.async: (raises: []).} =
  ## Tears down the manager's in-memory state. A nil `rm` is a no-op.
  ##
  ## Nothing is removed from the persistence backend, so the stored
  ## state remains available to a manager created later against the
  ## same backend. Use `removeChannel` when stored state must be wiped.
  ##
  ## Order matters: unfinished periodic tasks are cancelled (and awaited)
  ## first, and only then is the lock taken. A task that is itself
  ## waiting on the lock can thus unwind through cancellation instead of
  ## deadlocking against this proc.
  ##
  ## Failures while acquiring or releasing the lock are logged and
  ## swallowed.
  if rm.isNil():
    return

  for task in rm.periodicTasks:
    if task.finished:
      continue
    await task.cancelAndWait()
  rm.periodicTasks.setLen(0)

  try:
    await rm.lock.acquire()
    try:
      # Empty every buffer of every channel before dropping the table.
      for channel in rm.channels.values:
        channel.outgoingBuffer.setLen(0)
        channel.incomingBuffer.clear()
        channel.messageHistory.clear()
        channel.outgoingRepairBuffer.clear()
        channel.incomingRepairBuffer.clear()
        channel.pendingHistoryAppends.clear()
        channel.pendingHistoryEvicts.clear()
      rm.channels.clear()
    finally:
      rm.lock.release()
  except CatchableError as exc:
    error "Error during cleanup", error = exc.msg

proc cleanBloomFilter*(
    rm: ReliabilityManager, channelId: SdsChannelID
) {.async: (raises: []).} =
  ## Runs `clean()` on the bloom filter of `channelId` while holding
  ## `rm.lock`. An unknown channel is silently ignored; any catchable
  ## failure is logged and swallowed.
  try:
    await rm.lock.acquire()
    try:
      if channelId notin rm.channels:
        return
      rm.channels[channelId].bloomFilter.clean()
    finally:
      rm.lock.release()
  except CatchableError as exc:
    error "Failed to clean bloom filter", error = exc.msg, channelId = channelId

proc addToHistory*(
    rm: ReliabilityManager, msg: SdsMessage, channelId: SdsChannelID
): Future[Result[void, ReliabilityError]] {.async: (raises: []).} =
  ## Inserts a delivered message into the channel's history map, evicts
  ## entries past `rm.config.maxMessageHistory`, and queues the resulting
  ## append and evicts on the channel's pending-history queue.
  ##
  ## No persistence call is issued here: the caller's op-end
  ## `tryUpdateHistory` flushes the queue, so a cascade of N calls costs
  ## one backend round-trip rather than N. Direct callers that need the
  ## write to land immediately should follow this with
  ## `await rm.tryUpdateHistory(channelId)`.
  ##
  ## Eviction removes the first key yielded by iterating
  ## `messageHistory`, which is the oldest entry provided the map
  ## iterates in insertion order (NOTE(review): confirm against the
  ## `ChannelContext` declaration).
  ##
  ## Returns `ok()` on success and also when `channelId` is unknown (the
  ## call is then a no-op); returns `err(reInternalError)` if a catchable
  ## error is raised.
  try:
    if channelId in rm.channels:
      let channel = rm.channels[channelId]
      channel.messageHistory[msg.messageId] = msg
      queueHistoryAppend(channel, msg.messageId)
      # The `len > 0` guard makes the loop terminate for a negative
      # `maxMessageHistory`: without it an empty map still satisfies
      # `0 > negative`, `del` becomes a no-op and the loop spins forever.
      while channel.messageHistory.len > 0 and
          channel.messageHistory.len > rm.config.maxMessageHistory:
        var firstKey: SdsMessageID
        for k in channel.messageHistory.keys:
          firstKey = k
          break
        channel.messageHistory.del(firstKey)
        queueHistoryEvict(channel, firstKey)
    ok()
  except CatchableError:
    error "Failed to add to history",
      channelId = channelId, msgId = msg.messageId, error = getCurrentExceptionMsg()
    err(ReliabilityError.reInternalError)

proc updateLamportTimestamp*(
    rm: ReliabilityManager, msgTs: int64, channelId: SdsChannelID
): Future[Result[void, ReliabilityError]] {.async: (raises: []).} =
  ## Advances the channel's lamport clock to
  ## `max(msgTs, current) + 1`, in memory only.
  ##
  ## Nothing is persisted here; the new value reaches the backend when
  ## the calling op later saves a meta snapshot (`trySaveMeta`).
  ##
  ## Returns `ok()` on success and when `channelId` is unknown (no-op);
  ## returns `err(reInternalError)` if a catchable error is raised.
  try:
    if channelId notin rm.channels:
      return ok()
    let channel = rm.channels[channelId]
    let newest = max(msgTs, channel.lamportTimestamp)
    channel.lamportTimestamp = newest + 1
    return ok()
  except CatchableError as exc:
    error "Failed to update lamport timestamp",
      channelId = channelId, msgTs = msgTs, error = exc.msg
    return err(ReliabilityError.reInternalError)

proc newHistoryEntry*(
    messageId: SdsMessageID, retrievalHint: seq[byte] = @[]
): HistoryEntry =
  ## Builds a `HistoryEntry` via `HistoryEntry.init` from a message id
  ## and an optional retrieval hint (empty by default).
  HistoryEntry.init(messageId, retrievalHint)

proc toCausalHistory*(messageIds: seq[SdsMessageID]): seq[HistoryEntry] =
  ## Wraps each id in a hint-less `HistoryEntry`, preserving input order.
  result = newSeqOfCap[HistoryEntry](messageIds.len)
  for id in messageIds:
    result.add(newHistoryEntry(id))

proc getMessageIds*(causalHistory: seq[HistoryEntry]): seq[SdsMessageID] =
  ## Extracts the `messageId` of every entry, preserving input order.
  result = newSeqOfCap[SdsMessageID](causalHistory.len)
  for entry in causalHistory:
    result.add(entry.messageId)

## SDS-R: Repair computation functions

proc computeTReq*(
    participantId: SdsParticipantID,
    messageId: SdsMessageID,
    tMin: times.Duration,
    tMax: times.Duration,
): times.Duration =
  ## Computes the repair request backoff duration per SDS-R spec:
  ## T_req = hash(participant_id, message_id) % (T_max - T_min) + T_min
  ##
  ## The hash is taken over the concatenation of the participant id and
  ## the message id, and durations are handled at millisecond
  ## resolution. The result lies in `[tMin, tMax)`. When `tMax <= tMin`
  ## (at millisecond resolution) `tMin` is returned unchanged.
  let rangeMs = tMax.inMilliseconds - tMin.inMilliseconds
  if rangeMs <= 0:
    return tMin
  let rawHash = hash(participantId.string & messageId)
  # `abs(low(Hash))` is not representable: it raises OverflowDefect with
  # overflow checks on and stays negative with them off. Clamp that one
  # value so the offset is always non-negative.
  let h =
    if rawHash == low(Hash):
      high(Hash)
    else:
      abs(rawHash)
  let offsetMs = h mod rangeMs
  initDuration(milliseconds = tMin.inMilliseconds + offsetMs)

proc computeTResp*(
    participantId: SdsParticipantID,
    senderId: SdsParticipantID,
    messageId: SdsMessageID,
    tMax: times.Duration,
): times.Duration =
  ## Computes the repair response backoff duration per SDS-R spec:
  ## distance = hash(participant_id) XOR hash(sender_id)
  ## T_resp = distance * hash(message_id) % T_max
  ##
  ## When `participantId` and `senderId` hash to the same value (in
  ## particular for the original sender itself) the distance is 0 and
  ## the result is a zero duration, i.e. respond immediately. A
  ## non-positive `tMax` (at millisecond resolution) also yields zero.
  ## Otherwise the result lies in `[0, tMax)`.
  let tMaxMs = tMax.inMilliseconds
  let rawDistance = hash(participantId) xor hash(senderId)
  if tMaxMs <= 0 or rawDistance == 0:
    return initDuration(milliseconds = 0)
  # `abs(low(Hash))` is not representable (OverflowDefect with checks
  # on, negative result with checks off); clamp that single value so
  # both operands are non-negative before the unsigned conversion.
  let distance =
    if rawDistance == low(Hash):
      high(Hash)
    else:
      abs(rawDistance)
  let rawMsgHash = hash(messageId)
  let msgHash =
    if rawMsgHash == low(Hash):
      high(Hash)
    else:
      abs(rawMsgHash)
  # Reduce both factors modulo tMax first, then multiply as uint64.
  # NOTE: the product only fits in uint64 while tMaxMs <= 2^32
  # (about 49.7 days); beyond that it wraps silently.
  let d = uint64(distance mod tMaxMs)
  let m = uint64(msgHash mod tMaxMs)
  let offsetMs = int64((d * m) mod uint64(tMaxMs))
  initDuration(milliseconds = offsetMs)

proc isInResponseGroup*(
    participantId: SdsParticipantID,
    senderId: SdsParticipantID,
    messageId: SdsMessageID,
    numResponseGroups: int,
): bool =
  ## Determines if this participant is in the response group for a given
  ## message per SDS-R spec:
  ## hash(participant_id, message_id) % num_groups ==
  ##   hash(sender_id, message_id) % num_groups
  ##
  ## Each hash is taken over the concatenation of the respective id and
  ## the message id. With `numResponseGroups <= 1` every participant
  ## belongs to the single group and the result is always `true`.
  if numResponseGroups <= 1:
    return true # All participants in the same group
  # `abs(low(Hash))` is not representable (OverflowDefect with checks
  # on, negative result with checks off); clamp that single value so
  # the group index is always in `0 ..< numResponseGroups`.
  let rawMine = hash(participantId.string & messageId)
  let mine =
    if rawMine == low(Hash):
      high(Hash)
    else:
      abs(rawMine)
  let rawSender = hash(senderId.string & messageId)
  let sender =
    if rawSender == low(Hash):
      high(Hash)
    else:
      abs(rawSender)
  (mine mod numResponseGroups) == (sender mod numResponseGroups)

proc getRecentHistoryEntries*(
    rm: ReliabilityManager, n: int, channelId: SdsChannelID
): Future[Result[seq[HistoryEntry], ReliabilityError]] {.async: (raises: []).} =
  ## Returns up to `n` history entries for `channelId`, taken from the
  ## end of the `messageHistory` iteration order, for use as causal
  ## history on an outgoing message.
  ##
  ## Each entry carries the message id and the `senderId` stored with
  ## the message. When `rm.onRetrievalHint` is set, its result is
  ## attached as the retrieval hint; a non-empty hint is additionally
  ## saved through `rm.persistence.setRetrievalHint` on a best-effort
  ## basis (a failed save is logged and ignored).
  ##
  ## Returns an empty sequence for an unknown channel and for `n <= 0`.
  ## Returns `err(reInternalError)` if a catchable error is raised.
  try:
    if channelId in rm.channels:
      let channel = rm.channels[channelId]
      var orderedIds: seq[SdsMessageID] = @[]
      for msgId in channel.messageHistory.keys:
        orderedIds.add(msgId)
      # Clamp `n`: a negative value would push the slice start past the
      # end of the sequence and raise a Defect, which the
      # `except CatchableError` below does not catch.
      let wanted = max(n, 0)
      let recentMessageIds = orderedIds[max(0, orderedIds.len - wanted) .. ^1]
      var entries: seq[HistoryEntry] = @[]
      for msgId in recentMessageIds:
        var entry = HistoryEntry(messageId: msgId)
        if not rm.onRetrievalHint.isNil():
          {.cast(raises: []).}:
            entry.retrievalHint = rm.onRetrievalHint(msgId)
          if entry.retrievalHint.len > 0:
            # Best-effort hint persistence: hints are an optimisation,
            # so a failed save is logged and the entry is still returned.
            let hintRes = await rm.persistence.setRetrievalHint(
              msgId, entry.retrievalHint
            )
            if hintRes.isErr:
              warn "retrieval hint save failed; continuing",
                msgId = msgId, detail = hintRes.error
        entry.senderId = channel.messageHistory[msgId].senderId
        entries.add(entry)
      ok(entries)
    else:
      ok(newSeq[HistoryEntry]())
  except CatchableError:
    error "Failed to get recent history entries",
      channelId = channelId, n = n, error = getCurrentExceptionMsg()
    err(ReliabilityError.reInternalError)

proc checkDependencies*(
    rm: ReliabilityManager, deps: seq[HistoryEntry], channelId: SdsChannelID
): seq[HistoryEntry] =
  ## Returns the entries of `deps` whose message id is not present in the
  ## channel's `messageHistory`, in their original order.
  ##
  ## Every dependency counts as missing when `channelId` is unknown or
  ## when a catchable error is raised during the lookup (the error is
  ## logged); in both cases `deps` is returned unchanged.
  try:
    if channelId in rm.channels:
      let channel = rm.channels[channelId]
      for dep in deps:
        if dep.messageId notin channel.messageHistory:
          result.add(dep)
    else:
      result = deps
  except CatchableError:
    # Only recoverable errors are handled here, matching the other procs
    # in this module; `Defect`s signal programmer bugs and must not be
    # swallowed.
    error "Failed to check dependencies",
      channelId = channelId, error = getCurrentExceptionMsg()
    result = deps

proc getMessageHistory*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[seq[SdsMessageID]] {.async: (raises: []).} =
  ## Returns the ids stored in the channel's `messageHistory`, in the
  ## map's iteration order, read while holding `rm.lock`.
  ##
  ## Yields an empty sequence for an unknown channel or when a catchable
  ## error is raised (the error is logged).
  try:
    await rm.lock.acquire()
    try:
      if channelId notin rm.channels:
        return @[]
      var collected: seq[SdsMessageID] = @[]
      for storedId in rm.channels[channelId].messageHistory.keys:
        collected.add(storedId)
      return collected
    finally:
      rm.lock.release()
  except CatchableError as exc:
    error "Failed to get message history", channelId = channelId, error = exc.msg
    return @[]

proc getOutgoingBuffer*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[seq[UnacknowledgedMessage]] {.async: (raises: []).} =
  ## Returns the channel's outgoing buffer, read while holding `rm.lock`.
  ##
  ## Yields an empty sequence for an unknown channel or when a catchable
  ## error is raised (the error is logged).
  try:
    await rm.lock.acquire()
    try:
      if channelId notin rm.channels:
        return @[]
      return rm.channels[channelId].outgoingBuffer
    finally:
      rm.lock.release()
  except CatchableError as exc:
    error "Failed to get outgoing buffer", channelId = channelId, error = exc.msg
    return @[]

proc getIncomingBuffer*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[Table[SdsMessageID, IncomingMessage]] {.async: (raises: []), gcsafe.} =
  ## Returns the channel's incoming buffer, read while holding `rm.lock`.
  ##
  ## Yields an empty table for an unknown channel or when a catchable
  ## error is raised (the error is logged).
  try:
    await rm.lock.acquire()
    try:
      if channelId notin rm.channels:
        return initTable[SdsMessageID, IncomingMessage]()
      return rm.channels[channelId].incomingBuffer
    finally:
      rm.lock.release()
  except CatchableError as exc:
    error "Failed to get incoming buffer", channelId = channelId, error = exc.msg
    return initTable[SdsMessageID, IncomingMessage]()

proc getOrCreateChannel*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[Result[ChannelContext, ReliabilityError]] {.async: (raises: []).} =
  ## Returns the in-memory context for `channelId`, bootstrapping it from
  ## the persistence backend on first use. The caller is expected to hold
  ## `rm.lock`.
  ##
  ## Bootstrap creates a context with a fresh bloom filter (sized from
  ## `rm.config`), loads the stored state via
  ## `rm.persistence.loadChannel`, and restores the lamport timestamp,
  ## message history and all four buffers. The bloom filter is not
  ## loaded; it is refilled from the loaded message ids.
  ##
  ## A load failure IS reported: the detail is logged by `reliabilityErr`
  ## and `err(rePersistenceError)` is returned without registering the
  ## channel. Any other catchable error yields `err(reInternalError)`.
  try:
    if channelId in rm.channels:
      return ok(rm.channels[channelId])

    let fresh = ChannelContext.new(
      RollingBloomFilter.init(
        rm.config.bloomFilterCapacity, rm.config.bloomFilterErrorRate
      )
    )
    let stored = (await rm.persistence.loadChannel(channelId)).valueOr:
      return err(reliabilityErr(error))

    fresh.lamportTimestamp = stored.meta.lamportTimestamp
    # The backend must return the history oldest-first; insertion order
    # here is what later eviction relies on across restarts.
    for past in stored.messageHistory:
      fresh.messageHistory[past.messageId] = past
      fresh.bloomFilter.add(past.messageId)
    for unacked in stored.meta.outgoingBuffer:
      fresh.outgoingBuffer.add(unacked)
    for buffered in stored.meta.incomingBuffer:
      fresh.incomingBuffer[buffered.message.messageId] = buffered
    for pair in stored.meta.outgoingRepairBuffer:
      fresh.outgoingRepairBuffer[pair.messageId] = pair.entry
    for pair in stored.meta.incomingRepairBuffer:
      fresh.incomingRepairBuffer[pair.messageId] = pair.entry

    rm.channels[channelId] = fresh
    return ok(rm.channels[channelId])
  except CatchableError as exc:
    error "Failed to get or create channel", channelId = channelId, error = exc.msg
    return err(ReliabilityError.reInternalError)

proc ensureChannel*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[Result[void, ReliabilityError]] {.async: (raises: []).} =
  ## Makes sure `channelId` exists in memory by running
  ## `getOrCreateChannel` while holding `rm.lock`.
  ##
  ## The error from `getOrCreateChannel` is passed through unchanged; a
  ## catchable error around the lock itself is logged and reported as
  ## `err(reInternalError)`.
  try:
    await rm.lock.acquire()
    try:
      let ensured = await rm.getOrCreateChannel(channelId)
      if ensured.isErr:
        return err(ensured.error)
      return ok()
    finally:
      rm.lock.release()
  except CatchableError as exc:
    error "Failed to ensure channel (lock)", channelId = channelId, msg = exc.msg
    return err(ReliabilityError.reInternalError)

proc removeChannel*(
    rm: ReliabilityManager, channelId: SdsChannelID
): Future[Result[void, ReliabilityError]] {.async: (raises: []).} =
  ## Removes `channelId` from both the persistence backend and memory,
  ## while holding `rm.lock`.
  ##
  ## The stored state is dropped first (`dropChannelFromPersistence`).
  ## If that fails its error is returned and the in-memory channel is
  ## left untouched. Only after a successful drop are the channel's
  ## buffers cleared and its entry deleted from `rm.channels`.
  ##
  ## Returns `ok()` when the channel was removed or was not known in
  ## memory (no backend call is made in the latter case); returns
  ## `err(reInternalError)` if a catchable error is raised.
  try:
    await rm.lock.acquire()
    try:
      try:
        if channelId notin rm.channels:
          return ok()
        let channel = rm.channels[channelId]
        let dropped = await rm.dropChannelFromPersistence(channelId)
        if dropped.isErr:
          return err(dropped.error)
        channel.outgoingBuffer.setLen(0)
        channel.incomingBuffer.clear()
        channel.messageHistory.clear()
        channel.outgoingRepairBuffer.clear()
        channel.incomingRepairBuffer.clear()
        channel.pendingHistoryAppends.clear()
        channel.pendingHistoryEvicts.clear()
        rm.channels.del(channelId)
        return ok()
      except CatchableError as exc:
        error "Failed to remove channel", channelId = channelId, msg = exc.msg
        return err(ReliabilityError.reInternalError)
    finally:
      rm.lock.release()
  except CatchableError as lockExc:
    error "Failed to remove channel (lock)",
      channelId = channelId, msg = lockExc.msg
    return err(ReliabilityError.reInternalError)