Fix dboptions init (#2391)
For the block cache to be shared between column families, the options instance must be shared between the various column families being created. This also ensures that there is only one source of truth for configuration options instead of having two different sets depending on how the tables were initialized. This PR also removes the re-opening mechanism which can double startup time - every time the database is opened, the log is replayed - a large log file will take a long time to open. Finally, several options got correctly implemented as column family options, including one that puts a hash index in the SST files.
This commit is contained in:
parent
83f6f89869
commit
41cf81f80b
|
@ -38,21 +38,23 @@ export
|
|||
|
||||
proc newAristoRdbDbRef(
|
||||
basePath: string;
|
||||
opts: DbOptions;
|
||||
): Result[AristoDbRef, AristoError]=
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
guestCFs: openArray[ColFamilyDescriptor];
|
||||
): Result[(AristoDbRef, seq[ColFamilyReadWrite]), AristoError]=
|
||||
let
|
||||
be = ? rocksDbBackend(basePath, opts)
|
||||
(be, oCfs) = ? rocksDbBackend(basePath, dbOpts, cfOpts, guestCFs)
|
||||
vTop = block:
|
||||
let rc = be.getTuvFn()
|
||||
if rc.isErr:
|
||||
be.closeFn(eradicate = false)
|
||||
return err(rc.error)
|
||||
rc.value
|
||||
ok AristoDbRef(
|
||||
ok((AristoDbRef(
|
||||
top: LayerRef(
|
||||
delta: LayerDeltaRef(vTop: vTop),
|
||||
final: LayerFinalRef()),
|
||||
backend: be)
|
||||
backend: be), oCfs))
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Public database constuctors, destructor
|
||||
|
@ -62,36 +64,14 @@ proc init*(
|
|||
T: type AristoDbRef;
|
||||
B: type RdbBackendRef;
|
||||
basePath: string;
|
||||
opts: DbOptions
|
||||
): Result[T, AristoError] =
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
guestCFs: openArray[ColFamilyDescriptor];
|
||||
): Result[(T, seq[ColFamilyReadWrite]), AristoError] =
|
||||
## Generic constructor, `basePath` argument is ignored for memory backend
|
||||
## databases (which also unconditionally succeed initialising.)
|
||||
##
|
||||
basePath.newAristoRdbDbRef opts
|
||||
|
||||
proc reinit*(
|
||||
db: AristoDbRef;
|
||||
cfs: openArray[ColFamilyDescriptor];
|
||||
): Result[seq[ColFamilyReadWrite],AristoError] =
|
||||
## Re-initialise the `RocksDb` backend database with additional or changed
|
||||
## column family settings. This can be used to make space for guest use of
|
||||
## the backend used by `Aristo`. The function returns a list of column family
|
||||
## descriptors in the same order as the `cfs` argument.
|
||||
##
|
||||
## The argument `cfs` list replaces and extends the CFs already on disk by
|
||||
## its options except for the ones defined for use with `Aristo`.
|
||||
##
|
||||
## Even though tx layers and filters might not be affected by this function,
|
||||
## it is prudent to have them clean and saved on the backend database before
|
||||
## changing it. On error conditions, data might get lost.
|
||||
##
|
||||
case db.backend.kind:
|
||||
of BackendRocksDB:
|
||||
db.backend.rocksDbUpdateCfs cfs
|
||||
of BackendRdbHosting:
|
||||
err(RdbBeWrTriggerActiveAlready)
|
||||
else:
|
||||
return err(RdbBeTypeUnsupported)
|
||||
basePath.newAristoRdbDbRef dbOpts, cfOpts, guestCFs
|
||||
|
||||
proc activateWrTrigger*(
|
||||
db: AristoDbRef;
|
||||
|
|
|
@ -250,19 +250,22 @@ proc putBegHostingFn(db: RdbBackendRef): PutBegFn =
|
|||
|
||||
proc rocksDbBackend*(
|
||||
path: string;
|
||||
opts: DbOptions
|
||||
): Result[BackendRef,AristoError] =
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
guestCFs: openArray[ColFamilyDescriptor];
|
||||
): Result[(BackendRef, seq[ColFamilyReadWrite]),AristoError] =
|
||||
let db = RdbBackendRef(
|
||||
beKind: BackendRocksDB)
|
||||
|
||||
# Initialise RocksDB
|
||||
block:
|
||||
let rc = db.rdb.init(path, opts)
|
||||
let oCfs = block:
|
||||
let rc = db.rdb.init(path, dbOpts, cfOpts, guestCFs)
|
||||
if rc.isErr:
|
||||
when extraTraceMessages:
|
||||
trace logTxt "constructor failed",
|
||||
error=rc.error[0], info=rc.error[1]
|
||||
return err(rc.error[0])
|
||||
rc.value()
|
||||
|
||||
db.getVtxFn = getVtxFn db
|
||||
db.getKeyFn = getKeyFn db
|
||||
|
@ -277,19 +280,7 @@ proc rocksDbBackend*(
|
|||
db.putEndFn = putEndFn db
|
||||
|
||||
db.closeFn = closeFn db
|
||||
ok db
|
||||
|
||||
|
||||
proc rocksDbUpdateCfs*(
|
||||
be: BackendRef;
|
||||
cfs: openArray[ColFamilyDescriptor];
|
||||
): Result[seq[ColFamilyReadWrite],AristoError] =
|
||||
## Reopen with extended column families given as argument.
|
||||
let
|
||||
db = RdbBackendRef(be)
|
||||
rCfs = db.rdb.reinit(cfs).valueOr:
|
||||
return err(error[0])
|
||||
ok rCfs
|
||||
ok((db, oCfs))
|
||||
|
||||
|
||||
proc rocksDbSetEventTrigger*(
|
||||
|
|
|
@ -18,7 +18,6 @@ import
|
|||
eth/common,
|
||||
rocksdb,
|
||||
stew/[endians2, keyed_queue],
|
||||
../../../opts,
|
||||
../../aristo_desc,
|
||||
../init_common
|
||||
|
||||
|
@ -43,7 +42,6 @@ type
|
|||
rdVtxLru*: KeyedQueue[VertexID,VertexRef] ## Read cache
|
||||
|
||||
basePath*: string ## Database directory
|
||||
opts*: DbOptions ## Just a copy here for re-opening
|
||||
trgWriteEvent*: RdbWriteEventCb ## Database piggiback call back handler
|
||||
|
||||
AristoCFs* = enum
|
||||
|
|
|
@ -25,115 +25,17 @@ import
|
|||
# Private constructor
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
proc getInitOptions(
|
||||
opts: DbOptions;
|
||||
): tuple[cfOpts: ColFamilyOptionsRef, dbOpts: DbOptionsRef] =
|
||||
# TODO the configuration options below have not been tuned but are rather
|
||||
# based on gut feeling, guesses and by looking at other clients - it
|
||||
# would make sense to test different settings and combinations once the
|
||||
# data model itself has settled down as their optimal values will depend
|
||||
# on the shape of the data - it'll also be different per column family..
|
||||
let cfOpts = defaultColFamilyOptions()
|
||||
|
||||
if opts.writeBufferSize > 0:
|
||||
cfOpts.setWriteBufferSize(opts.writeBufferSize)
|
||||
|
||||
# When data is written to rocksdb, it is first put in an in-memory table
|
||||
# whose index is a skip list. Since the mem table holds the most recent data,
|
||||
# all reads must go through this skiplist which results in slow lookups for
|
||||
# already-written data.
|
||||
# We enable a bloom filter on the mem table to avoid this lookup in the cases
|
||||
# where the data is actually on disk already (ie wasn't updated recently).
|
||||
# TODO there's also a hashskiplist that has both a hash index and a skip list
|
||||
# which maybe could be used - uses more memory, requires a key prefix
|
||||
# extractor
|
||||
cfOpts.setMemtableWholeKeyFiltering(true)
|
||||
cfOpts.setMemtablePrefixBloomSizeRatio(0.1)
|
||||
|
||||
# LZ4 seems to cut database size to 2/3 roughly, at the time of writing
|
||||
# Using it for the bottom-most level means it applies to 90% of data but
|
||||
# delays compression until data has settled a bit, which seems like a
|
||||
# reasonable tradeoff.
|
||||
# TODO evaluate zstd compression with a trained dictionary
|
||||
# https://github.com/facebook/rocksdb/wiki/Compression
|
||||
cfOpts.setBottommostCompression(Compression.lz4Compression)
|
||||
|
||||
let dbOpts = defaultDbOptions()
|
||||
dbOpts.setMaxOpenFiles(opts.maxOpenFiles)
|
||||
dbOpts.setMaxBytesForLevelBase(opts.writeBufferSize)
|
||||
|
||||
if opts.rowCacheSize > 0:
|
||||
# Good for GET queries, which is what we do most of the time - if we start
|
||||
# using range queries, we should probably give more attention to the block
|
||||
# cache
|
||||
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L1276
|
||||
dbOpts.setRowCache(cacheCreateLRU(opts.rowCacheSize))
|
||||
|
||||
# We mostly look up data we know is there, so we don't need filters at the
|
||||
# last level of the database - this option saves 90% bloom filter memory usage
|
||||
# TODO verify this point
|
||||
# https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
|
||||
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/advanced_options.h#L696
|
||||
dbOpts.setOptimizeFiltersForHits(true)
|
||||
|
||||
# Without this option, WAL files might never get removed since a small column
|
||||
# family (like the admin CF) with only tiny writes might keep it open - this
|
||||
# negatively affects startup times since the WAL is replayed on every startup.
|
||||
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L719
|
||||
# Flushing the oldest
|
||||
let writeBufferSize =
|
||||
if opts.writeBufferSize > 0:
|
||||
opts.writeBufferSize
|
||||
else:
|
||||
64 * 1024 * 1024 # TODO read from rocksdb?
|
||||
|
||||
dbOpts.setMaxTotalWalSize(2 * writeBufferSize)
|
||||
|
||||
let tableOpts = defaultTableOptions()
|
||||
# This bloom filter helps avoid having to read multiple SST files when looking
|
||||
# for a value.
|
||||
# A 9.9-bits-per-key ribbon filter takes ~7 bits per key and has a 1% false
|
||||
# positive rate which feels like a good enough starting point, though this
|
||||
# should be better investigated.
|
||||
# https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter
|
||||
# https://github.com/facebook/rocksdb/blob/d64eac28d32a025770cba641ea04e697f475cdd6/include/rocksdb/filter_policy.h#L208
|
||||
tableOpts.setFilterPolicy(createRibbonHybrid(9.9))
|
||||
|
||||
if opts.blockCacheSize > 0:
|
||||
tableOpts.setBlockCache(cacheCreateLRU(opts.rowCacheSize))
|
||||
|
||||
# Single-level indices might cause long stalls due to their large size -
|
||||
# two-level indexing allows the first level to be kept in memory at all times
|
||||
# while the second level is partitioned resulting in smoother loading
|
||||
# https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters#how-to-use-it
|
||||
tableOpts.setIndexType(IndexType.twoLevelIndexSearch)
|
||||
tableOpts.setPinTopLevelIndexAndFilter(true)
|
||||
tableOpts.setCacheIndexAndFilterBlocksWithHighPriority(true)
|
||||
tableOpts.setPartitionFilters(true) # TODO do we need this?
|
||||
|
||||
# This option adds a small hash index to each data block, presumably speeding
|
||||
# up Get queries (but again not range queries) - takes up space, apparently
|
||||
# a good tradeoff for most workloads
|
||||
# https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index
|
||||
tableOpts.setDataBlockIndexType(DataBlockIndexType.binarySearchAndHash)
|
||||
tableOpts.setDataBlockHashRatio(0.75)
|
||||
|
||||
dbOpts.setBlockBasedTableFactory(tableOpts)
|
||||
|
||||
(cfOpts,dbOpts)
|
||||
|
||||
|
||||
proc initImpl(
|
||||
rdb: var RdbInst;
|
||||
basePath: string;
|
||||
opts: DbOptions;
|
||||
dbOpts: DbOptionsRef,
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
guestCFs: openArray[ColFamilyDescriptor] = [];
|
||||
): Result[void,(AristoError,string)] =
|
||||
): Result[seq[ColFamilyReadWrite],(AristoError,string)] =
|
||||
## Database backend constructor
|
||||
const initFailed = "RocksDB/init() failed"
|
||||
|
||||
rdb.basePath = basePath
|
||||
rdb.opts = opts
|
||||
|
||||
let
|
||||
dataDir = rdb.dataDir
|
||||
|
@ -142,9 +44,6 @@ proc initImpl(
|
|||
except OSError, IOError:
|
||||
return err((RdbBeCantCreateDataDir, ""))
|
||||
|
||||
# Expand argument `opts` to rocksdb options
|
||||
let (cfOpts, dbOpts) = opts.getInitOptions()
|
||||
|
||||
# Column familiy names to allocate when opening the database. This list
|
||||
# might be extended below.
|
||||
var useCFs = AristoCFs.mapIt($it).toHashSet
|
||||
|
@ -182,7 +81,7 @@ proc initImpl(
|
|||
rdb.keyCol = baseDb.withColFamily($KeyCF).valueOr:
|
||||
raiseAssert initFailed & " cannot initialise KeyCF descriptor: " & error
|
||||
|
||||
ok()
|
||||
ok(guestCFs.mapIt(baseDb.withColFamily(it.name).expect("loaded cf")))
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Public constructor
|
||||
|
@ -191,43 +90,12 @@ proc initImpl(
|
|||
proc init*(
|
||||
rdb: var RdbInst;
|
||||
basePath: string;
|
||||
opts: DbOptions;
|
||||
): Result[void,(AristoError,string)] =
|
||||
## Temporarily define a guest CF list here.
|
||||
rdb.initImpl(basePath, opts)
|
||||
|
||||
proc reinit*(
|
||||
rdb: var RdbInst;
|
||||
cfs: openArray[ColFamilyDescriptor];
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
guestCFs: openArray[ColFamilyDescriptor];
|
||||
): Result[seq[ColFamilyReadWrite],(AristoError,string)] =
|
||||
## Re-open database with changed parameters. Even though tx layers and
|
||||
## filters might not be affected it is prudent to have them clean and
|
||||
## saved on the backend database before changing it.
|
||||
##
|
||||
## The function returns a list of column family descriptors in the same
|
||||
## order as the `cfs` argument.
|
||||
##
|
||||
## The `cfs` list replaces and extends the CFs already on disk by its
|
||||
## options except for the ones defined with `AristoCFs`.
|
||||
##
|
||||
const initFailed = "RocksDB/reinit() failed"
|
||||
|
||||
if not rdb.session.isNil:
|
||||
return err((RdbBeWrSessionUnfinished,""))
|
||||
if not rdb.baseDb.isClosed():
|
||||
rdb.baseDb.close()
|
||||
|
||||
rdb.initImpl(rdb.basePath, rdb.opts, cfs).isOkOr:
|
||||
return err(error)
|
||||
|
||||
# Assemble list of column family descriptors
|
||||
var guestCols = newSeq[ColFamilyReadWrite](cfs.len)
|
||||
for n,col in cfs:
|
||||
guestCols[n] = rdb.baseDb.withColFamily(col.name).valueOr:
|
||||
raiseAssert initFailed & " cannot initialise " &
|
||||
col.name & " descriptor: " & error
|
||||
|
||||
ok guestCols
|
||||
## Temporarily define a guest CF list here.
|
||||
rdb.initImpl(basePath, dbOpts, cfOpts, guestCFs)
|
||||
|
||||
|
||||
proc destroy*(rdb: var RdbInst; eradicate: bool) =
|
||||
|
|
|
@ -11,20 +11,22 @@
|
|||
{.push raises: [].}
|
||||
|
||||
import
|
||||
std/sequtils,
|
||||
eth/common,
|
||||
rocksdb,
|
||||
results,
|
||||
../../aristo,
|
||||
../../aristo/aristo_persistent as use_ari,
|
||||
../../aristo/aristo_init/rocks_db as use_ari,
|
||||
../../aristo/[aristo_desc, aristo_walk/persistent, aristo_tx],
|
||||
../../kvt,
|
||||
../../kvt/kvt_persistent as use_kvt,
|
||||
../../kvt/kvt_init/rocks_db/rdb_init,
|
||||
../base,
|
||||
./aristo_db,
|
||||
./aristo_db/[common_desc, handlers_aristo],
|
||||
../../opts
|
||||
|
||||
include
|
||||
./aristo_db/aristo_replicate
|
||||
include ./aristo_db/aristo_replicate
|
||||
|
||||
const
|
||||
# Expectation messages
|
||||
|
@ -34,16 +36,122 @@ const
|
|||
# Annotation helper(s)
|
||||
{.pragma: rlpRaise, gcsafe, raises: [AristoApiRlpError].}
|
||||
|
||||
proc toRocksDb*(
|
||||
opts: DbOptions
|
||||
): tuple[dbOpts: DbOptionsRef, cfOpts: ColFamilyOptionsRef] =
|
||||
# TODO the configuration options below have not been tuned but are rather
|
||||
# based on gut feeling, guesses and by looking at other clients - it
|
||||
# would make sense to test different settings and combinations once the
|
||||
# data model itself has settled down as their optimal values will depend
|
||||
# on the shape of the data - it'll also be different per column family..
|
||||
|
||||
let tableOpts = defaultTableOptions()
|
||||
# This bloom filter helps avoid having to read multiple SST files when looking
|
||||
# for a value.
|
||||
# A 9.9-bits-per-key ribbon filter takes ~7 bits per key and has a 1% false
|
||||
# positive rate which feels like a good enough starting point, though this
|
||||
# should be better investigated.
|
||||
# https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter
|
||||
# https://github.com/facebook/rocksdb/blob/d64eac28d32a025770cba641ea04e697f475cdd6/include/rocksdb/filter_policy.h#L208
|
||||
tableOpts.filterPolicy = createRibbonHybrid(9.9)
|
||||
|
||||
if opts.blockCacheSize > 0:
|
||||
# Share a single block cache instance between all column families
|
||||
tableOpts.blockCache = cacheCreateLRU(opts.blockCacheSize)
|
||||
|
||||
# Single-level indices might cause long stalls due to their large size -
|
||||
# two-level indexing allows the first level to be kept in memory at all times
|
||||
# while the second level is partitioned resulting in smoother loading
|
||||
# https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters#how-to-use-it
|
||||
tableOpts.indexType = IndexType.twoLevelIndexSearch
|
||||
tableOpts.pinTopLevelIndexAndFilter = true
|
||||
tableOpts.cacheIndexAndFilterBlocksWithHighPriority = true
|
||||
tableOpts.partitionFilters = true # TODO do we need this?
|
||||
|
||||
# This option adds a small hash index to each data block, presumably speeding
|
||||
# up Get queries (but again not range queries) - takes up space, apparently
|
||||
# a good tradeoff for most workloads
|
||||
# https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index
|
||||
tableOpts.dataBlockIndexType = DataBlockIndexType.binarySearchAndHash
|
||||
tableOpts.dataBlockHashRatio = 0.75
|
||||
|
||||
let cfOpts = defaultColFamilyOptions()
|
||||
|
||||
cfOpts.blockBasedTableFactory = tableOpts
|
||||
|
||||
if opts.writeBufferSize > 0:
|
||||
cfOpts.writeBufferSize = opts.writeBufferSize
|
||||
|
||||
# When data is written to rocksdb, it is first put in an in-memory table
|
||||
# whose index is a skip list. Since the mem table holds the most recent data,
|
||||
# all reads must go through this skiplist which results in slow lookups for
|
||||
# already-written data.
|
||||
# We enable a bloom filter on the mem table to avoid this lookup in the cases
|
||||
# where the data is actually on disk already (ie wasn't updated recently).
|
||||
# TODO there's also a hashskiplist that has both a hash index and a skip list
|
||||
# which maybe could be used - uses more memory, requires a key prefix
|
||||
# extractor
|
||||
cfOpts.memtableWholeKeyFiltering = true
|
||||
cfOpts.memtablePrefixBloomSizeRatio = 0.1
|
||||
|
||||
# LZ4 seems to cut database size to 2/3 roughly, at the time of writing
|
||||
# Using it for the bottom-most level means it applies to 90% of data but
|
||||
# delays compression until data has settled a bit, which seems like a
|
||||
# reasonable tradeoff.
|
||||
# TODO evaluate zstd compression with a trained dictionary
|
||||
# https://github.com/facebook/rocksdb/wiki/Compression
|
||||
cfOpts.bottommostCompression = Compression.lz4Compression
|
||||
|
||||
# We mostly look up data we know is there, so we don't need filters at the
|
||||
# last level of the database - this option saves 90% bloom filter memory usage
|
||||
# TODO verify this point
|
||||
# https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
|
||||
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/advanced_options.h#L696
|
||||
cfOpts.optimizeFiltersForHits = true
|
||||
|
||||
cfOpts.maxBytesForLevelBase = opts.writeBufferSize
|
||||
|
||||
let dbOpts = defaultDbOptions()
|
||||
dbOpts.maxOpenFiles = opts.maxOpenFiles
|
||||
|
||||
if opts.rowCacheSize > 0:
|
||||
# Good for GET queries, which is what we do most of the time - if we start
|
||||
# using range queries, we should probably give more attention to the block
|
||||
# cache
|
||||
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L1276
|
||||
dbOpts.rowCache = cacheCreateLRU(opts.rowCacheSize)
|
||||
|
||||
# Without this option, WAL files might never get removed since a small column
|
||||
# family (like the admin CF) with only tiny writes might keep it open - this
|
||||
# negatively affects startup times since the WAL is replayed on every startup.
|
||||
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L719
|
||||
# Flushing the oldest
|
||||
let writeBufferSize =
|
||||
if opts.writeBufferSize > 0:
|
||||
opts.writeBufferSize
|
||||
else:
|
||||
cfOpts.writeBufferSize
|
||||
|
||||
dbOpts.maxTotalWalSize = 2 * writeBufferSize
|
||||
|
||||
dbOpts.keepLogFileNum = 16 # No point keeping 1000 log files around...
|
||||
|
||||
(dbOpts, cfOpts)
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Public constructor
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
proc newAristoRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
|
||||
## This funcion piggybacks the `KVT` on the `Aristo` backend.
|
||||
|
||||
let
|
||||
adb = AristoDbRef.init(use_ari.RdbBackendRef, path, opts).valueOr:
|
||||
# Sharing opts means we also share caches between column families!
|
||||
(dbOpts, cfOpts) = opts.toRocksDb()
|
||||
guestCFs = RdbInst.guestCFs(cfOpts)
|
||||
(adb, oCfs) = AristoDbRef.init(use_ari.RdbBackendRef, path, dbOpts, cfOpts, guestCFs).valueOr:
|
||||
raiseAssert aristoFail & ": " & $error
|
||||
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, adb, opts).valueOr:
|
||||
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, adb, oCfs).valueOr:
|
||||
raiseAssert kvtFail & ": " & $error
|
||||
AristoDbRocks.create(kdb, adb)
|
||||
|
||||
|
@ -51,9 +159,10 @@ proc newAristoDualRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
|
|||
## This is only for debugging. The KVT is run on a completely separate
|
||||
## database backend.
|
||||
let
|
||||
adb = AristoDbRef.init(use_ari.RdbBackendRef, path, opts).valueOr:
|
||||
(dbOpts, cfOpts) = opts.toRocksDb()
|
||||
(adb, _) = AristoDbRef.init(use_ari.RdbBackendRef, path, dbOpts, cfOpts, []).valueOr:
|
||||
raiseAssert aristoFail & ": " & $error
|
||||
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, path, opts).valueOr:
|
||||
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, path, dbOpts, cfOpts).valueOr:
|
||||
raiseAssert kvtFail & ": " & $error
|
||||
AristoDbRocks.create(kdb, adb)
|
||||
|
||||
|
@ -61,10 +170,10 @@ proc newAristoDualRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
|
|||
# Public aristo iterators
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
iterator aristoReplicateRdb*(dsc: CoreDxMptRef): (Blob,Blob) {.rlpRaise.} =
|
||||
iterator aristoReplicateRdb*(dsc: CoreDxMptRef): (Blob, Blob) {.rlpRaise.} =
|
||||
## Instantiation for `VoidBackendRef`
|
||||
for k,v in aristoReplicate[use_ari.RdbBackendRef](dsc):
|
||||
yield (k,v)
|
||||
for k, v in aristoReplicate[use_ari.RdbBackendRef](dsc):
|
||||
yield (k, v)
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# End
|
||||
|
|
|
@ -18,8 +18,6 @@ import
|
|||
|
||||
export kvstore
|
||||
|
||||
const maxOpenFiles = 512
|
||||
|
||||
type
|
||||
RocksStoreRef* = ref object of RootObj
|
||||
db: RocksDbReadWriteRef
|
||||
|
@ -86,7 +84,6 @@ proc init*(
|
|||
return err("RocksStoreRef: cannot create database directory")
|
||||
|
||||
let dbOpts = defaultDbOptions()
|
||||
dbOpts.setMaxOpenFiles(maxOpenFiles)
|
||||
|
||||
let db = ? openRocksDb(dataDir, dbOpts,
|
||||
columnFamilies = namespaces.mapIt(initColFamilyDescriptor(it)))
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
{.push raises: [].}
|
||||
|
||||
import
|
||||
rocksdb,
|
||||
results,
|
||||
../../aristo,
|
||||
../../opts,
|
||||
|
@ -44,19 +45,20 @@ proc init*(
|
|||
T: type KvtDbRef;
|
||||
B: type RdbBackendRef;
|
||||
basePath: string;
|
||||
opts: DbOptions;
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
): Result[KvtDbRef,KvtError] =
|
||||
## Generic constructor for `RocksDb` backend
|
||||
##
|
||||
ok KvtDbRef(
|
||||
top: LayerRef.init(),
|
||||
backend: ? rocksDbKvtBackend(basePath, opts).mapErr toErr0)
|
||||
backend: ? rocksDbKvtBackend(basePath, dbOpts, cfOpts).mapErr toErr0)
|
||||
|
||||
proc init*(
|
||||
T: type KvtDbRef;
|
||||
B: type RdbBackendRef;
|
||||
adb: AristoDbRef;
|
||||
opts: DbOptions;
|
||||
oCfs: openArray[ColFamilyReadWrite];
|
||||
): Result[KvtDbRef,KvtError] =
|
||||
## Constructor for `RocksDb` backend which piggybacks on the `Aristo`
|
||||
## backend. The following changes will occur after successful instantiation:
|
||||
|
@ -83,7 +85,7 @@ proc init*(
|
|||
##
|
||||
ok KvtDbRef(
|
||||
top: LayerRef.init(),
|
||||
backend: ? rocksDbKvtTriggeredBackend(adb, opts).mapErr toErr0)
|
||||
backend: ? rocksDbKvtTriggeredBackend(adb, oCfs).mapErr toErr0)
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# End
|
||||
|
|
|
@ -255,13 +255,14 @@ proc writeEvCb(db: RdbBackendRef): RdbWriteEventCb =
|
|||
|
||||
proc rocksDbKvtBackend*(
|
||||
path: string;
|
||||
opts: DbOptions;
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
): Result[BackendRef,(KvtError,string)] =
|
||||
let db = RdbBackendRef(
|
||||
beKind: BackendRocksDB)
|
||||
|
||||
# Initialise RocksDB
|
||||
db.rdb.init(path, opts).isOkOr:
|
||||
db.rdb.init(path, dbOpts, cfOpts).isOkOr:
|
||||
when extraTraceMessages:
|
||||
trace logTxt "constructor failed", error=error[0], info=error[1]
|
||||
return err(error)
|
||||
|
@ -280,13 +281,13 @@ proc rocksDbKvtBackend*(
|
|||
|
||||
proc rocksDbKvtTriggeredBackend*(
|
||||
adb: AristoDbRef;
|
||||
opts: DbOptions;
|
||||
oCfs: openArray[ColFamilyReadWrite];
|
||||
): Result[BackendRef,(KvtError,string)] =
|
||||
let db = RdbBackendRef(
|
||||
beKind: BackendRdbTriggered)
|
||||
|
||||
# Initialise RocksDB piggy-backed on `Aristo` backend.
|
||||
db.rdb.init(adb, opts).isOkOr:
|
||||
db.rdb.init(oCfs).isOkOr:
|
||||
when extraTraceMessages:
|
||||
trace logTxt "constructor failed", error=error[0], info=error[1]
|
||||
return err(error)
|
||||
|
|
|
@ -17,62 +17,12 @@ import
|
|||
std/[sequtils, os],
|
||||
rocksdb,
|
||||
results,
|
||||
../../../aristo/aristo_init/persistent,
|
||||
../../../opts,
|
||||
../../kvt_desc,
|
||||
../../kvt_desc/desc_error as kdb,
|
||||
./rdb_desc
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
proc getCFInitOptions(opts: DbOptions): ColFamilyOptionsRef =
|
||||
# TODO the configuration options below have not been tuned but are rather
|
||||
# based on gut feeling, guesses and by looking at other clients - it
|
||||
# would make sense to test different settings and combinations once the
|
||||
# data model itself has settled down as their optimal values will depend
|
||||
# on the shape of the data - it'll also be different per column family..
|
||||
let cfOpts = defaultColFamilyOptions()
|
||||
|
||||
if opts.writeBufferSize > 0:
|
||||
cfOpts.setWriteBufferSize(opts.writeBufferSize)
|
||||
|
||||
# When data is written to rocksdb, it is first put in an in-memory table
|
||||
# whose index is a skip list. Since the mem table holds the most recent data,
|
||||
# all reads must go through this skiplist which results in slow lookups for
|
||||
# already-written data.
|
||||
# We enable a bloom filter on the mem table to avoid this lookup in the cases
|
||||
# where the data is actually on disk already (ie wasn't updated recently).
|
||||
# TODO there's also a hashskiplist that has both a hash index and a skip list
|
||||
# which maybe could be used - uses more memory, requires a key prefix
|
||||
# extractor
|
||||
cfOpts.setMemtableWholeKeyFiltering(true)
|
||||
cfOpts.setMemtablePrefixBloomSizeRatio(0.1)
|
||||
|
||||
# LZ4 seems to cut database size to 2/3 roughly, at the time of writing
|
||||
# Using it for the bottom-most level means it applies to 90% of data but
|
||||
# delays compression until data has settled a bit, which seems like a
|
||||
# reasonable tradeoff.
|
||||
# TODO evaluate zstd compression with a trained dictionary
|
||||
# https://github.com/facebook/rocksdb/wiki/Compression
|
||||
cfOpts.setBottommostCompression(Compression.lz4Compression)
|
||||
|
||||
cfOpts
|
||||
|
||||
|
||||
proc getDbInitOptions(opts: DbOptions): DbOptionsRef =
|
||||
result = defaultDbOptions()
|
||||
result.setMaxOpenFiles(opts.maxOpenFiles)
|
||||
result.setMaxBytesForLevelBase(opts.writeBufferSize)
|
||||
|
||||
if opts.rowCacheSize > 0:
|
||||
result.setRowCache(cacheCreateLRU(opts.rowCacheSize))
|
||||
|
||||
if opts.blockCacheSize > 0:
|
||||
let tableOpts = defaultTableOptions()
|
||||
tableOpts.setBlockCache(cacheCreateLRU(opts.rowCacheSize))
|
||||
result.setBlockBasedTableFactory(tableOpts)
|
||||
export rdb_desc, results
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Public constructor
|
||||
|
@ -81,7 +31,8 @@ proc getDbInitOptions(opts: DbOptions): DbOptionsRef =
|
|||
proc init*(
|
||||
rdb: var RdbInst;
|
||||
basePath: string;
|
||||
opts: DbOptions;
|
||||
dbOpts: DbOptionsRef;
|
||||
cfOpts: ColFamilyOptionsRef;
|
||||
): Result[void,(KvtError,string)] =
|
||||
## Database backend constructor for stand-alone version
|
||||
##
|
||||
|
@ -96,9 +47,6 @@ proc init*(
|
|||
except OSError, IOError:
|
||||
return err((kdb.RdbBeCantCreateDataDir, ""))
|
||||
|
||||
# Expand argument `opts` to rocksdb options
|
||||
let (cfOpts, dbOpts) = (opts.getCFInitOptions, opts.getDbInitOptions)
|
||||
|
||||
# Column familiy names to allocate when opening the database.
|
||||
let cfs = KvtCFs.mapIt(($it).initColFamilyDescriptor cfOpts)
|
||||
|
||||
|
@ -113,20 +61,15 @@ proc init*(
|
|||
$col & " descriptor: " & error
|
||||
ok()
|
||||
|
||||
proc guestCFs*(T: type RdbInst, cfOpts: ColFamilyOptionsRef): seq =
|
||||
KvtCFs.toSeq.mapIt(initColFamilyDescriptor($it, cfOpts))
|
||||
|
||||
proc init*(
|
||||
rdb: var RdbInst;
|
||||
adb: AristoDbRef;
|
||||
opts: DbOptions;
|
||||
oCfs: openArray[ColFamilyReadWrite];
|
||||
): Result[void,(KvtError,string)] =
|
||||
## Initalise column handlers piggy-backing on the `Aristo` backend.
|
||||
##
|
||||
let
|
||||
cfOpts = opts.getCFInitOptions()
|
||||
iCfs = KvtCFs.toSeq.mapIt(initColFamilyDescriptor($it, cfOpts))
|
||||
oCfs = adb.reinit(iCfs).valueOr:
|
||||
return err((RdbBeHostError,$error))
|
||||
|
||||
# Collect column family descriptors (this stores implicitely `baseDb`)
|
||||
for n in KvtCFs:
|
||||
assert oCfs[n.ord].name != "" # debugging only
|
||||
|
|
|
@ -17,6 +17,7 @@ import
|
|||
results,
|
||||
unittest2,
|
||||
../../nimbus/db/opts,
|
||||
../../nimbus/db/core_db/backend/aristo_rocksdb,
|
||||
../../nimbus/db/aristo/[
|
||||
aristo_check,
|
||||
aristo_debug,
|
||||
|
@ -104,10 +105,11 @@ iterator quadripartite(td: openArray[ProofTrieData]): LeafQuartet =
|
|||
proc dbTriplet(w: LeafQuartet; rdbPath: string): Result[DbTriplet,AristoError] =
|
||||
let db = block:
|
||||
if 0 < rdbPath.len:
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init())
|
||||
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
|
||||
xCheckRc rc.error == 0:
|
||||
result = err(rc.error)
|
||||
rc.value
|
||||
rc.value()[0]
|
||||
else:
|
||||
AristoDbRef.init MemBackendRef
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ import
|
|||
unittest2,
|
||||
stew/endians2,
|
||||
../../nimbus/db/opts,
|
||||
../../nimbus/db/core_db/backend/aristo_rocksdb,
|
||||
../../nimbus/db/aristo/[
|
||||
aristo_check,
|
||||
aristo_debug,
|
||||
|
@ -330,9 +331,10 @@ proc testTxMergeAndDeleteOneByOne*(
|
|||
# Start with brand new persistent database.
|
||||
db = block:
|
||||
if 0 < rdbPath.len:
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init())
|
||||
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
|
||||
xCheckRc rc.error == 0
|
||||
rc.value
|
||||
rc.value()[0]
|
||||
else:
|
||||
AristoDbRef.init(MemBackendRef)
|
||||
|
||||
|
@ -441,9 +443,10 @@ proc testTxMergeAndDeleteSubTree*(
|
|||
# Start with brand new persistent database.
|
||||
db = block:
|
||||
if 0 < rdbPath.len:
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init())
|
||||
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
|
||||
xCheckRc rc.error == 0
|
||||
rc.value
|
||||
rc.value()[0]
|
||||
else:
|
||||
AristoDbRef.init(MemBackendRef)
|
||||
|
||||
|
@ -545,9 +548,10 @@ proc testTxMergeProofAndKvpList*(
|
|||
db = block:
|
||||
# New DB with disabled filter slots management
|
||||
if 0 < rdbPath.len:
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init())
|
||||
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
|
||||
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
|
||||
xCheckRc rc.error == 0
|
||||
rc.value
|
||||
rc.value()[0]
|
||||
else:
|
||||
AristoDbRef.init(MemBackendRef)
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 293dc0745ea8386237546acb352a265a4bc874b5
|
||||
Subproject commit f5dcb34ae83648bf5868618bc7fe916073b4455f
|
Loading…
Reference in New Issue