# nimbus-eth1
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed under either of
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
#    http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
#    http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.

## Rocksdb constructor/destructor for Aristo DB
## ============================================

{.push raises: [].}

import
  std/[sequtils, os],
  rocksdb,
  results,
  ../../aristo_desc,
  ./rdb_desc,
  ../../../opts

# ------------------------------------------------------------------------------
# Public constructor
# ------------------------------------------------------------------------------

proc init*(
    rdb: var RdbInst;
    basePath: string;
    opts: DbOptions;
      ): Result[void,(AristoError,string)] =
  ## Constructor code inspired by `RocksStoreRef.init()` from
  ## `kvstore_rocksdb.nim`.
  const initFailed = "RocksDB/init() failed"

  rdb.basePath = basePath

  let dataDir = rdb.dataDir
  try:
    dataDir.createDir
  except OSError, IOError:
    return err((RdbBeCantCreateDataDir, ""))

  # TODO: the configuration options below have not been tuned; they are based
  #       on gut feeling, guesses, and a look at other clients. It would make
  #       sense to test different settings and combinations once the data
  #       model itself has settled down, as the optimal values depend on the
  #       shape of the data and will also differ per column family.

  let cfOpts = defaultColFamilyOptions()

  if opts.writeBufferSize > 0:
    cfOpts.setWriteBufferSize(opts.writeBufferSize)

  # Without this option, the WAL might never get flushed since a small column
  # family (like the admin CF) with only tiny writes might keep it open. This
  # negatively affects startup times since the WAL is replayed on every
  # startup. Flushing the column families holding the oldest WAL data allows
  # the log file to be released.
  # https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L719
  let writeBufferSize =
    if opts.writeBufferSize > 0:
      opts.writeBufferSize
    else:
      64 * 1024 * 1024 # TODO read from rocksdb?
  cfOpts.setMaxTotalWalSize(2 * writeBufferSize)
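
  # As a point of reference: with the 64 MiB fallback write buffer above,
  # this caps the WAL at roughly 128 MiB, bounding the amount of log data
  # that has to be replayed on startup.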

  # When data is written to rocksdb, it is first put in an in-memory table
  # whose index is a skip list. Since the mem table holds the most recent
  # data, all reads must go through this skip list, which results in slow
  # lookups for already-written data.
  # We enable a bloom filter on the mem table to avoid this lookup in the
  # cases where the data is actually on disk already (i.e. was not updated
  # recently).
  # TODO: there is also a hashskiplist that has both a hash index and a skip
  #       list which could perhaps be used. It uses more memory and requires
  #       a key prefix extractor.
  cfOpts.setMemtableWholeKeyFiltering(true)
  cfOpts.setMemtablePrefixBloomSizeRatio(0.1)

  # LZ4 seems to cut database size to roughly 2/3, at the time of writing.
  # Using it for the bottom-most level only means it applies to ~90% of the
  # data but delays compression until the data has settled a bit, which seems
  # like a reasonable tradeoff.
  # TODO: evaluate zstd compression with a trained dictionary
  # https://github.com/facebook/rocksdb/wiki/Compression
  cfOpts.setBottommostCompression(Compression.lz4Compression)

  let
    cfs = @[initColFamilyDescriptor(AdmCF, cfOpts),
            initColFamilyDescriptor(VtxCF, cfOpts),
            initColFamilyDescriptor(KeyCF, cfOpts)] &
          RdbGuest.mapIt(initColFamilyDescriptor($it, cfOpts))
    dbOpts = defaultDbOptions()

  dbOpts.setMaxOpenFiles(opts.maxOpenFiles)
  dbOpts.setMaxBytesForLevelBase(opts.writeBufferSize)

  if opts.rowCacheSize > 0:
    # Good for GET queries, which is what we do most of the time. If we start
    # using range queries, we should probably give more attention to the
    # block cache.
    # https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L1276
    dbOpts.setRowCache(cacheCreateLRU(opts.rowCacheSize))

  # We mostly look up data we know is there, so we don't need filters at the
  # last level of the database. This option saves ~90% of bloom filter memory
  # usage.
  # TODO: verify this point
  # https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
  # https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/advanced_options.h#L696
  dbOpts.setOptimizeFiltersForHits(true)

  let tableOpts = defaultTableOptions()

  # This bloom filter helps avoid having to read multiple SST files when
  # looking for a value.
  # A ribbon filter configured for 9.9 bits-per-key accuracy takes only ~7
  # bits per key of memory and has a ~1% false positive rate, which feels
  # like a good enough starting point, though this should be investigated
  # further.
  # https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter
  # https://github.com/facebook/rocksdb/blob/d64eac28d32a025770cba641ea04e697f475cdd6/include/rocksdb/filter_policy.h#L208
  tableOpts.setFilterPolicy(createRibbonHybrid(9.9))
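
  # Back-of-envelope: at ~7 bits per key, filters covering 100 million keys
  # would take about 87.5 MB of memory; the `setOptimizeFiltersForHits`
  # option above avoids most of that by skipping filters on the last level.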

  if opts.blockCacheSize > 0:
    tableOpts.setBlockCache(cacheCreateLRU(opts.blockCacheSize))

  # Single-level indices might cause long stalls due to their large size.
  # Two-level indexing allows the first level to be kept in memory at all
  # times while the second level is partitioned, resulting in smoother
  # loading.
  # https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters#how-to-use-it
  tableOpts.setIndexType(IndexType.twoLevelIndexSearch)
  tableOpts.setPinTopLevelIndexAndFilter(true)
  tableOpts.setCacheIndexAndFilterBlocksWithHighPriority(true)
  tableOpts.setPartitionFilters(true) # TODO: do we need this?

  # This option adds a small hash index to each data block, presumably
  # speeding up Get queries (but again not range queries). It takes up extra
  # space but is apparently a good tradeoff for most workloads.
  # https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index
  tableOpts.setDataBlockIndexType(DataBlockIndexType.binarySearchAndHash)
  tableOpts.setDataBlockHashRatio(0.75)

  dbOpts.setBlockBasedTableFactory(tableOpts)

  # Open the base database, reserving the column families used by `Aristo`
  let baseDb = openRocksDb(dataDir, dbOpts, columnFamilies = cfs).valueOr:
    raiseAssert initFailed & " cannot create base descriptor: " & error

  # Initialise column handlers (this implicitly stores `baseDb`)
  rdb.admCol = baseDb.withColFamily(AdmCF).valueOr:
    raiseAssert initFailed & " cannot initialise AdmCF descriptor: " & error
  rdb.vtxCol = baseDb.withColFamily(VtxCF).valueOr:
    raiseAssert initFailed & " cannot initialise VtxCF descriptor: " & error
  rdb.keyCol = baseDb.withColFamily(KeyCF).valueOr:
    raiseAssert initFailed & " cannot initialise KeyCF descriptor: " & error

  ok()

proc initGuestDb*(
    rdb: RdbInst;
    instance: int;
      ): Result[RootRef,(AristoError,string)] =
  ## Initialise `Guest` family
  ##
  ## This was worth a try, but there are better solutions, and this item will
  ## be removed in the future.
  ##
  if high(RdbGuest).ord < instance:
    return err((RdbGuestInstanceUnsupported, ""))
  let
    guestSym = $RdbGuest(instance)
    guestDb = rdb.baseDb.withColFamily(guestSym).valueOr:
      raiseAssert "RocksDb/initGuestDb() failed: " & error

  ok RdbGuestDbRef(
    beKind: BackendRocksDB,
    guestDb: guestDb)

# ------------------------------------------------------------------------------
# Public destructor
# ------------------------------------------------------------------------------

proc destroy*(rdb: var RdbInst; flush: bool) =
  ## Destructor
  rdb.baseDb.close()

  if flush:
    try:
      rdb.dataDir.removeDir

      # Remove the base folder if it is empty
      block done:
        for w in rdb.baseDir.walkDirRec:
          # Ignore backup files
          if 0 < w.len and w[^1] != '~':
            break done
        rdb.baseDir.removeDir

    except CatchableError:
      discard

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------
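
when isMainModule:
  # Minimal usage sketch of the constructor/destructor round trip. It assumes
  # that `DbOptions.init()` from `opts` yields workable defaults and that a
  # scratch directory is acceptable; both are assumptions made for the sake
  # of the example.
  var rdb: RdbInst
  rdb.init("/tmp/aristo-rdb-demo", DbOptions.init()).isOkOr:
    echo "init failed: ", error
    quit 1
  # At this point `rdb.admCol`, `rdb.vtxCol` and `rdb.keyCol` are live column
  # family handles.
  rdb.destroy(flush = true) # close the database and wipe the data directory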