Fix dboptions init (#2391)

For the block cache to be shared between column families, the options
instance must be shared between the various column families being
created. This also ensures that there is only one source of truth for
configuration options instead of having two different sets depending on
how the tables were initialized.

This PR also removes the re-opening mechanism which can double startup
time - every time the database is opened, the log is replayed - a large
log file will take a long time to open.

Finally, several options got correctly implemented as column family
options, including one that puts a hash index in the SST files.
This commit is contained in:
Jacek Sieka 2024-06-19 10:55:57 +02:00 committed by GitHub
parent 83f6f89869
commit 41cf81f80b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 180 additions and 285 deletions

View File

@ -38,21 +38,23 @@ export
proc newAristoRdbDbRef( proc newAristoRdbDbRef(
basePath: string; basePath: string;
opts: DbOptions; dbOpts: DbOptionsRef;
): Result[AristoDbRef, AristoError]= cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
): Result[(AristoDbRef, seq[ColFamilyReadWrite]), AristoError]=
let let
be = ? rocksDbBackend(basePath, opts) (be, oCfs) = ? rocksDbBackend(basePath, dbOpts, cfOpts, guestCFs)
vTop = block: vTop = block:
let rc = be.getTuvFn() let rc = be.getTuvFn()
if rc.isErr: if rc.isErr:
be.closeFn(eradicate = false) be.closeFn(eradicate = false)
return err(rc.error) return err(rc.error)
rc.value rc.value
ok AristoDbRef( ok((AristoDbRef(
top: LayerRef( top: LayerRef(
delta: LayerDeltaRef(vTop: vTop), delta: LayerDeltaRef(vTop: vTop),
final: LayerFinalRef()), final: LayerFinalRef()),
backend: be) backend: be), oCfs))
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Public database constuctors, destructor # Public database constuctors, destructor
@ -62,36 +64,14 @@ proc init*(
T: type AristoDbRef; T: type AristoDbRef;
B: type RdbBackendRef; B: type RdbBackendRef;
basePath: string; basePath: string;
opts: DbOptions dbOpts: DbOptionsRef;
): Result[T, AristoError] = cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
): Result[(T, seq[ColFamilyReadWrite]), AristoError] =
## Generic constructor, `basePath` argument is ignored for memory backend ## Generic constructor, `basePath` argument is ignored for memory backend
## databases (which also unconditionally succeed initialising.) ## databases (which also unconditionally succeed initialising.)
## ##
basePath.newAristoRdbDbRef opts basePath.newAristoRdbDbRef dbOpts, cfOpts, guestCFs
proc reinit*(
db: AristoDbRef;
cfs: openArray[ColFamilyDescriptor];
): Result[seq[ColFamilyReadWrite],AristoError] =
## Re-initialise the `RocksDb` backend database with additional or changed
## column family settings. This can be used to make space for guest use of
## the backend used by `Aristo`. The function returns a list of column family
## descriptors in the same order as the `cfs` argument.
##
## The argument `cfs` list replaces and extends the CFs already on disk by
## its options except for the ones defined for use with `Aristo`.
##
## Even though tx layers and filters might not be affected by this function,
## it is prudent to have them clean and saved on the backend database before
## changing it. On error conditions, data might get lost.
##
case db.backend.kind:
of BackendRocksDB:
db.backend.rocksDbUpdateCfs cfs
of BackendRdbHosting:
err(RdbBeWrTriggerActiveAlready)
else:
return err(RdbBeTypeUnsupported)
proc activateWrTrigger*( proc activateWrTrigger*(
db: AristoDbRef; db: AristoDbRef;

View File

@ -250,19 +250,22 @@ proc putBegHostingFn(db: RdbBackendRef): PutBegFn =
proc rocksDbBackend*( proc rocksDbBackend*(
path: string; path: string;
opts: DbOptions dbOpts: DbOptionsRef;
): Result[BackendRef,AristoError] = cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
): Result[(BackendRef, seq[ColFamilyReadWrite]),AristoError] =
let db = RdbBackendRef( let db = RdbBackendRef(
beKind: BackendRocksDB) beKind: BackendRocksDB)
# Initialise RocksDB # Initialise RocksDB
block: let oCfs = block:
let rc = db.rdb.init(path, opts) let rc = db.rdb.init(path, dbOpts, cfOpts, guestCFs)
if rc.isErr: if rc.isErr:
when extraTraceMessages: when extraTraceMessages:
trace logTxt "constructor failed", trace logTxt "constructor failed",
error=rc.error[0], info=rc.error[1] error=rc.error[0], info=rc.error[1]
return err(rc.error[0]) return err(rc.error[0])
rc.value()
db.getVtxFn = getVtxFn db db.getVtxFn = getVtxFn db
db.getKeyFn = getKeyFn db db.getKeyFn = getKeyFn db
@ -277,19 +280,7 @@ proc rocksDbBackend*(
db.putEndFn = putEndFn db db.putEndFn = putEndFn db
db.closeFn = closeFn db db.closeFn = closeFn db
ok db ok((db, oCfs))
proc rocksDbUpdateCfs*(
be: BackendRef;
cfs: openArray[ColFamilyDescriptor];
): Result[seq[ColFamilyReadWrite],AristoError] =
## Reopen with extended column families given as argument.
let
db = RdbBackendRef(be)
rCfs = db.rdb.reinit(cfs).valueOr:
return err(error[0])
ok rCfs
proc rocksDbSetEventTrigger*( proc rocksDbSetEventTrigger*(

View File

@ -18,7 +18,6 @@ import
eth/common, eth/common,
rocksdb, rocksdb,
stew/[endians2, keyed_queue], stew/[endians2, keyed_queue],
../../../opts,
../../aristo_desc, ../../aristo_desc,
../init_common ../init_common
@ -43,7 +42,6 @@ type
rdVtxLru*: KeyedQueue[VertexID,VertexRef] ## Read cache rdVtxLru*: KeyedQueue[VertexID,VertexRef] ## Read cache
basePath*: string ## Database directory basePath*: string ## Database directory
opts*: DbOptions ## Just a copy here for re-opening
trgWriteEvent*: RdbWriteEventCb ## Database piggiback call back handler trgWriteEvent*: RdbWriteEventCb ## Database piggiback call back handler
AristoCFs* = enum AristoCFs* = enum

View File

@ -25,115 +25,17 @@ import
# Private constructor # Private constructor
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
proc getInitOptions(
opts: DbOptions;
): tuple[cfOpts: ColFamilyOptionsRef, dbOpts: DbOptionsRef] =
# TODO the configuration options below have not been tuned but are rather
# based on gut feeling, guesses and by looking at other clients - it
# would make sense to test different settings and combinations once the
# data model itself has settled down as their optimal values will depend
# on the shape of the data - it'll also be different per column family..
let cfOpts = defaultColFamilyOptions()
if opts.writeBufferSize > 0:
cfOpts.setWriteBufferSize(opts.writeBufferSize)
# When data is written to rocksdb, it is first put in an in-memory table
# whose index is a skip list. Since the mem table holds the most recent data,
# all reads must go through this skiplist which results in slow lookups for
# already-written data.
# We enable a bloom filter on the mem table to avoid this lookup in the cases
# where the data is actually on disk already (ie wasn't updated recently).
# TODO there's also a hashskiplist that has both a hash index and a skip list
# which maybe could be used - uses more memory, requires a key prefix
# extractor
cfOpts.setMemtableWholeKeyFiltering(true)
cfOpts.setMemtablePrefixBloomSizeRatio(0.1)
# LZ4 seems to cut database size to 2/3 roughly, at the time of writing
# Using it for the bottom-most level means it applies to 90% of data but
# delays compression until data has settled a bit, which seems like a
# reasonable tradeoff.
# TODO evaluate zstd compression with a trained dictionary
# https://github.com/facebook/rocksdb/wiki/Compression
cfOpts.setBottommostCompression(Compression.lz4Compression)
let dbOpts = defaultDbOptions()
dbOpts.setMaxOpenFiles(opts.maxOpenFiles)
dbOpts.setMaxBytesForLevelBase(opts.writeBufferSize)
if opts.rowCacheSize > 0:
# Good for GET queries, which is what we do most of the time - if we start
# using range queries, we should probably give more attention to the block
# cache
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L1276
dbOpts.setRowCache(cacheCreateLRU(opts.rowCacheSize))
# We mostly look up data we know is there, so we don't need filters at the
# last level of the database - this option saves 90% bloom filter memory usage
# TODO verify this point
# https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/advanced_options.h#L696
dbOpts.setOptimizeFiltersForHits(true)
# Without this option, WAL files might never get removed since a small column
# family (like the admin CF) with only tiny writes might keep it open - this
# negatively affects startup times since the WAL is replayed on every startup.
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L719
# Flushing the oldest
let writeBufferSize =
if opts.writeBufferSize > 0:
opts.writeBufferSize
else:
64 * 1024 * 1024 # TODO read from rocksdb?
dbOpts.setMaxTotalWalSize(2 * writeBufferSize)
let tableOpts = defaultTableOptions()
# This bloom filter helps avoid having to read multiple SST files when looking
# for a value.
# A 9.9-bits-per-key ribbon filter takes ~7 bits per key and has a 1% false
# positive rate which feels like a good enough starting point, though this
# should be better investigated.
# https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter
# https://github.com/facebook/rocksdb/blob/d64eac28d32a025770cba641ea04e697f475cdd6/include/rocksdb/filter_policy.h#L208
tableOpts.setFilterPolicy(createRibbonHybrid(9.9))
if opts.blockCacheSize > 0:
tableOpts.setBlockCache(cacheCreateLRU(opts.rowCacheSize))
# Single-level indices might cause long stalls due to their large size -
# two-level indexing allows the first level to be kept in memory at all times
# while the second level is partitioned resulting in smoother loading
# https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters#how-to-use-it
tableOpts.setIndexType(IndexType.twoLevelIndexSearch)
tableOpts.setPinTopLevelIndexAndFilter(true)
tableOpts.setCacheIndexAndFilterBlocksWithHighPriority(true)
tableOpts.setPartitionFilters(true) # TODO do we need this?
# This option adds a small hash index to each data block, presumably speeding
# up Get queries (but again not range queries) - takes up space, apparently
# a good tradeoff for most workloads
# https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index
tableOpts.setDataBlockIndexType(DataBlockIndexType.binarySearchAndHash)
tableOpts.setDataBlockHashRatio(0.75)
dbOpts.setBlockBasedTableFactory(tableOpts)
(cfOpts,dbOpts)
proc initImpl( proc initImpl(
rdb: var RdbInst; rdb: var RdbInst;
basePath: string; basePath: string;
opts: DbOptions; dbOpts: DbOptionsRef,
cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor] = []; guestCFs: openArray[ColFamilyDescriptor] = [];
): Result[void,(AristoError,string)] = ): Result[seq[ColFamilyReadWrite],(AristoError,string)] =
## Database backend constructor ## Database backend constructor
const initFailed = "RocksDB/init() failed" const initFailed = "RocksDB/init() failed"
rdb.basePath = basePath rdb.basePath = basePath
rdb.opts = opts
let let
dataDir = rdb.dataDir dataDir = rdb.dataDir
@ -142,9 +44,6 @@ proc initImpl(
except OSError, IOError: except OSError, IOError:
return err((RdbBeCantCreateDataDir, "")) return err((RdbBeCantCreateDataDir, ""))
# Expand argument `opts` to rocksdb options
let (cfOpts, dbOpts) = opts.getInitOptions()
# Column familiy names to allocate when opening the database. This list # Column familiy names to allocate when opening the database. This list
# might be extended below. # might be extended below.
var useCFs = AristoCFs.mapIt($it).toHashSet var useCFs = AristoCFs.mapIt($it).toHashSet
@ -182,7 +81,7 @@ proc initImpl(
rdb.keyCol = baseDb.withColFamily($KeyCF).valueOr: rdb.keyCol = baseDb.withColFamily($KeyCF).valueOr:
raiseAssert initFailed & " cannot initialise KeyCF descriptor: " & error raiseAssert initFailed & " cannot initialise KeyCF descriptor: " & error
ok() ok(guestCFs.mapIt(baseDb.withColFamily(it.name).expect("loaded cf")))
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Public constructor # Public constructor
@ -191,43 +90,12 @@ proc initImpl(
proc init*( proc init*(
rdb: var RdbInst; rdb: var RdbInst;
basePath: string; basePath: string;
opts: DbOptions; dbOpts: DbOptionsRef;
): Result[void,(AristoError,string)] = cfOpts: ColFamilyOptionsRef;
## Temporarily define a guest CF list here. guestCFs: openArray[ColFamilyDescriptor];
rdb.initImpl(basePath, opts)
proc reinit*(
rdb: var RdbInst;
cfs: openArray[ColFamilyDescriptor];
): Result[seq[ColFamilyReadWrite],(AristoError,string)] = ): Result[seq[ColFamilyReadWrite],(AristoError,string)] =
## Re-open database with changed parameters. Even though tx layers and ## Temporarily define a guest CF list here.
## filters might not be affected it is prudent to have them clean and rdb.initImpl(basePath, dbOpts, cfOpts, guestCFs)
## saved on the backend database before changing it.
##
## The function returns a list of column family descriptors in the same
## order as the `cfs` argument.
##
## The `cfs` list replaces and extends the CFs already on disk by its
## options except for the ones defined with `AristoCFs`.
##
const initFailed = "RocksDB/reinit() failed"
if not rdb.session.isNil:
return err((RdbBeWrSessionUnfinished,""))
if not rdb.baseDb.isClosed():
rdb.baseDb.close()
rdb.initImpl(rdb.basePath, rdb.opts, cfs).isOkOr:
return err(error)
# Assemble list of column family descriptors
var guestCols = newSeq[ColFamilyReadWrite](cfs.len)
for n,col in cfs:
guestCols[n] = rdb.baseDb.withColFamily(col.name).valueOr:
raiseAssert initFailed & " cannot initialise " &
col.name & " descriptor: " & error
ok guestCols
proc destroy*(rdb: var RdbInst; eradicate: bool) = proc destroy*(rdb: var RdbInst; eradicate: bool) =

View File

@ -11,20 +11,22 @@
{.push raises: [].} {.push raises: [].}
import import
std/sequtils,
eth/common, eth/common,
rocksdb,
results, results,
../../aristo, ../../aristo,
../../aristo/aristo_persistent as use_ari, ../../aristo/aristo_init/rocks_db as use_ari,
../../aristo/[aristo_desc, aristo_walk/persistent, aristo_tx], ../../aristo/[aristo_desc, aristo_walk/persistent, aristo_tx],
../../kvt, ../../kvt,
../../kvt/kvt_persistent as use_kvt, ../../kvt/kvt_persistent as use_kvt,
../../kvt/kvt_init/rocks_db/rdb_init,
../base, ../base,
./aristo_db, ./aristo_db,
./aristo_db/[common_desc, handlers_aristo], ./aristo_db/[common_desc, handlers_aristo],
../../opts ../../opts
include include ./aristo_db/aristo_replicate
./aristo_db/aristo_replicate
const const
# Expectation messages # Expectation messages
@ -34,16 +36,122 @@ const
# Annotation helper(s) # Annotation helper(s)
{.pragma: rlpRaise, gcsafe, raises: [AristoApiRlpError].} {.pragma: rlpRaise, gcsafe, raises: [AristoApiRlpError].}
proc toRocksDb*(
opts: DbOptions
): tuple[dbOpts: DbOptionsRef, cfOpts: ColFamilyOptionsRef] =
# TODO the configuration options below have not been tuned but are rather
# based on gut feeling, guesses and by looking at other clients - it
# would make sense to test different settings and combinations once the
# data model itself has settled down as their optimal values will depend
# on the shape of the data - it'll also be different per column family..
let tableOpts = defaultTableOptions()
# This bloom filter helps avoid having to read multiple SST files when looking
# for a value.
# A 9.9-bits-per-key ribbon filter takes ~7 bits per key and has a 1% false
# positive rate which feels like a good enough starting point, though this
# should be better investigated.
# https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter
# https://github.com/facebook/rocksdb/blob/d64eac28d32a025770cba641ea04e697f475cdd6/include/rocksdb/filter_policy.h#L208
tableOpts.filterPolicy = createRibbonHybrid(9.9)
if opts.blockCacheSize > 0:
# Share a single block cache instance between all column families
tableOpts.blockCache = cacheCreateLRU(opts.blockCacheSize)
# Single-level indices might cause long stalls due to their large size -
# two-level indexing allows the first level to be kept in memory at all times
# while the second level is partitioned resulting in smoother loading
# https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters#how-to-use-it
tableOpts.indexType = IndexType.twoLevelIndexSearch
tableOpts.pinTopLevelIndexAndFilter = true
tableOpts.cacheIndexAndFilterBlocksWithHighPriority = true
tableOpts.partitionFilters = true # TODO do we need this?
# This option adds a small hash index to each data block, presumably speeding
# up Get queries (but again not range queries) - takes up space, apparently
# a good tradeoff for most workloads
# https://github.com/facebook/rocksdb/wiki/Data-Block-Hash-Index
tableOpts.dataBlockIndexType = DataBlockIndexType.binarySearchAndHash
tableOpts.dataBlockHashRatio = 0.75
let cfOpts = defaultColFamilyOptions()
cfOpts.blockBasedTableFactory = tableOpts
if opts.writeBufferSize > 0:
cfOpts.writeBufferSize = opts.writeBufferSize
# When data is written to rocksdb, it is first put in an in-memory table
# whose index is a skip list. Since the mem table holds the most recent data,
# all reads must go through this skiplist which results in slow lookups for
# already-written data.
# We enable a bloom filter on the mem table to avoid this lookup in the cases
# where the data is actually on disk already (ie wasn't updated recently).
# TODO there's also a hashskiplist that has both a hash index and a skip list
# which maybe could be used - uses more memory, requires a key prefix
# extractor
cfOpts.memtableWholeKeyFiltering = true
cfOpts.memtablePrefixBloomSizeRatio = 0.1
# LZ4 seems to cut database size to 2/3 roughly, at the time of writing
# Using it for the bottom-most level means it applies to 90% of data but
# delays compression until data has settled a bit, which seems like a
# reasonable tradeoff.
# TODO evaluate zstd compression with a trained dictionary
# https://github.com/facebook/rocksdb/wiki/Compression
cfOpts.bottommostCompression = Compression.lz4Compression
# We mostly look up data we know is there, so we don't need filters at the
# last level of the database - this option saves 90% bloom filter memory usage
# TODO verify this point
# https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/advanced_options.h#L696
cfOpts.optimizeFiltersForHits = true
cfOpts.maxBytesForLevelBase = opts.writeBufferSize
let dbOpts = defaultDbOptions()
dbOpts.maxOpenFiles = opts.maxOpenFiles
if opts.rowCacheSize > 0:
# Good for GET queries, which is what we do most of the time - if we start
# using range queries, we should probably give more attention to the block
# cache
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L1276
dbOpts.rowCache = cacheCreateLRU(opts.rowCacheSize)
# Without this option, WAL files might never get removed since a small column
# family (like the admin CF) with only tiny writes might keep it open - this
# negatively affects startup times since the WAL is replayed on every startup.
# https://github.com/facebook/rocksdb/blob/af50823069818fc127438e39fef91d2486d6e76c/include/rocksdb/options.h#L719
# Flushing the oldest
let writeBufferSize =
if opts.writeBufferSize > 0:
opts.writeBufferSize
else:
cfOpts.writeBufferSize
dbOpts.maxTotalWalSize = 2 * writeBufferSize
dbOpts.keepLogFileNum = 16 # No point keeping 1000 log files around...
(dbOpts, cfOpts)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Public constructor # Public constructor
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
proc newAristoRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef = proc newAristoRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
## This funcion piggybacks the `KVT` on the `Aristo` backend. ## This funcion piggybacks the `KVT` on the `Aristo` backend.
let let
adb = AristoDbRef.init(use_ari.RdbBackendRef, path, opts).valueOr: # Sharing opts means we also share caches between column families!
(dbOpts, cfOpts) = opts.toRocksDb()
guestCFs = RdbInst.guestCFs(cfOpts)
(adb, oCfs) = AristoDbRef.init(use_ari.RdbBackendRef, path, dbOpts, cfOpts, guestCFs).valueOr:
raiseAssert aristoFail & ": " & $error raiseAssert aristoFail & ": " & $error
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, adb, opts).valueOr: kdb = KvtDbRef.init(use_kvt.RdbBackendRef, adb, oCfs).valueOr:
raiseAssert kvtFail & ": " & $error raiseAssert kvtFail & ": " & $error
AristoDbRocks.create(kdb, adb) AristoDbRocks.create(kdb, adb)
@ -51,9 +159,10 @@ proc newAristoDualRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
## This is only for debugging. The KVT is run on a completely separate ## This is only for debugging. The KVT is run on a completely separate
## database backend. ## database backend.
let let
adb = AristoDbRef.init(use_ari.RdbBackendRef, path, opts).valueOr: (dbOpts, cfOpts) = opts.toRocksDb()
(adb, _) = AristoDbRef.init(use_ari.RdbBackendRef, path, dbOpts, cfOpts, []).valueOr:
raiseAssert aristoFail & ": " & $error raiseAssert aristoFail & ": " & $error
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, path, opts).valueOr: kdb = KvtDbRef.init(use_kvt.RdbBackendRef, path, dbOpts, cfOpts).valueOr:
raiseAssert kvtFail & ": " & $error raiseAssert kvtFail & ": " & $error
AristoDbRocks.create(kdb, adb) AristoDbRocks.create(kdb, adb)
@ -61,10 +170,10 @@ proc newAristoDualRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
# Public aristo iterators # Public aristo iterators
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
iterator aristoReplicateRdb*(dsc: CoreDxMptRef): (Blob,Blob) {.rlpRaise.} = iterator aristoReplicateRdb*(dsc: CoreDxMptRef): (Blob, Blob) {.rlpRaise.} =
## Instantiation for `VoidBackendRef` ## Instantiation for `VoidBackendRef`
for k,v in aristoReplicate[use_ari.RdbBackendRef](dsc): for k, v in aristoReplicate[use_ari.RdbBackendRef](dsc):
yield (k,v) yield (k, v)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# End # End

View File

@ -18,8 +18,6 @@ import
export kvstore export kvstore
const maxOpenFiles = 512
type type
RocksStoreRef* = ref object of RootObj RocksStoreRef* = ref object of RootObj
db: RocksDbReadWriteRef db: RocksDbReadWriteRef
@ -86,7 +84,6 @@ proc init*(
return err("RocksStoreRef: cannot create database directory") return err("RocksStoreRef: cannot create database directory")
let dbOpts = defaultDbOptions() let dbOpts = defaultDbOptions()
dbOpts.setMaxOpenFiles(maxOpenFiles)
let db = ? openRocksDb(dataDir, dbOpts, let db = ? openRocksDb(dataDir, dbOpts,
columnFamilies = namespaces.mapIt(initColFamilyDescriptor(it))) columnFamilies = namespaces.mapIt(initColFamilyDescriptor(it)))

View File

@ -19,6 +19,7 @@
{.push raises: [].} {.push raises: [].}
import import
rocksdb,
results, results,
../../aristo, ../../aristo,
../../opts, ../../opts,
@ -44,19 +45,20 @@ proc init*(
T: type KvtDbRef; T: type KvtDbRef;
B: type RdbBackendRef; B: type RdbBackendRef;
basePath: string; basePath: string;
opts: DbOptions; dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
): Result[KvtDbRef,KvtError] = ): Result[KvtDbRef,KvtError] =
## Generic constructor for `RocksDb` backend ## Generic constructor for `RocksDb` backend
## ##
ok KvtDbRef( ok KvtDbRef(
top: LayerRef.init(), top: LayerRef.init(),
backend: ? rocksDbKvtBackend(basePath, opts).mapErr toErr0) backend: ? rocksDbKvtBackend(basePath, dbOpts, cfOpts).mapErr toErr0)
proc init*( proc init*(
T: type KvtDbRef; T: type KvtDbRef;
B: type RdbBackendRef; B: type RdbBackendRef;
adb: AristoDbRef; adb: AristoDbRef;
opts: DbOptions; oCfs: openArray[ColFamilyReadWrite];
): Result[KvtDbRef,KvtError] = ): Result[KvtDbRef,KvtError] =
## Constructor for `RocksDb` backend which piggybacks on the `Aristo` ## Constructor for `RocksDb` backend which piggybacks on the `Aristo`
## backend. The following changes will occur after successful instantiation: ## backend. The following changes will occur after successful instantiation:
@ -83,7 +85,7 @@ proc init*(
## ##
ok KvtDbRef( ok KvtDbRef(
top: LayerRef.init(), top: LayerRef.init(),
backend: ? rocksDbKvtTriggeredBackend(adb, opts).mapErr toErr0) backend: ? rocksDbKvtTriggeredBackend(adb, oCfs).mapErr toErr0)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# End # End

View File

@ -255,13 +255,14 @@ proc writeEvCb(db: RdbBackendRef): RdbWriteEventCb =
proc rocksDbKvtBackend*( proc rocksDbKvtBackend*(
path: string; path: string;
opts: DbOptions; dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
): Result[BackendRef,(KvtError,string)] = ): Result[BackendRef,(KvtError,string)] =
let db = RdbBackendRef( let db = RdbBackendRef(
beKind: BackendRocksDB) beKind: BackendRocksDB)
# Initialise RocksDB # Initialise RocksDB
db.rdb.init(path, opts).isOkOr: db.rdb.init(path, dbOpts, cfOpts).isOkOr:
when extraTraceMessages: when extraTraceMessages:
trace logTxt "constructor failed", error=error[0], info=error[1] trace logTxt "constructor failed", error=error[0], info=error[1]
return err(error) return err(error)
@ -280,13 +281,13 @@ proc rocksDbKvtBackend*(
proc rocksDbKvtTriggeredBackend*( proc rocksDbKvtTriggeredBackend*(
adb: AristoDbRef; adb: AristoDbRef;
opts: DbOptions; oCfs: openArray[ColFamilyReadWrite];
): Result[BackendRef,(KvtError,string)] = ): Result[BackendRef,(KvtError,string)] =
let db = RdbBackendRef( let db = RdbBackendRef(
beKind: BackendRdbTriggered) beKind: BackendRdbTriggered)
# Initialise RocksDB piggy-backed on `Aristo` backend. # Initialise RocksDB piggy-backed on `Aristo` backend.
db.rdb.init(adb, opts).isOkOr: db.rdb.init(oCfs).isOkOr:
when extraTraceMessages: when extraTraceMessages:
trace logTxt "constructor failed", error=error[0], info=error[1] trace logTxt "constructor failed", error=error[0], info=error[1]
return err(error) return err(error)

View File

@ -17,62 +17,12 @@ import
std/[sequtils, os], std/[sequtils, os],
rocksdb, rocksdb,
results, results,
../../../aristo/aristo_init/persistent,
../../../opts, ../../../opts,
../../kvt_desc, ../../kvt_desc,
../../kvt_desc/desc_error as kdb, ../../kvt_desc/desc_error as kdb,
./rdb_desc ./rdb_desc
# ------------------------------------------------------------------------------ export rdb_desc, results
# Private helpers
# ------------------------------------------------------------------------------
proc getCFInitOptions(opts: DbOptions): ColFamilyOptionsRef =
# TODO the configuration options below have not been tuned but are rather
# based on gut feeling, guesses and by looking at other clients - it
# would make sense to test different settings and combinations once the
# data model itself has settled down as their optimal values will depend
# on the shape of the data - it'll also be different per column family..
let cfOpts = defaultColFamilyOptions()
if opts.writeBufferSize > 0:
cfOpts.setWriteBufferSize(opts.writeBufferSize)
# When data is written to rocksdb, it is first put in an in-memory table
# whose index is a skip list. Since the mem table holds the most recent data,
# all reads must go through this skiplist which results in slow lookups for
# already-written data.
# We enable a bloom filter on the mem table to avoid this lookup in the cases
# where the data is actually on disk already (ie wasn't updated recently).
# TODO there's also a hashskiplist that has both a hash index and a skip list
# which maybe could be used - uses more memory, requires a key prefix
# extractor
cfOpts.setMemtableWholeKeyFiltering(true)
cfOpts.setMemtablePrefixBloomSizeRatio(0.1)
# LZ4 seems to cut database size to 2/3 roughly, at the time of writing
# Using it for the bottom-most level means it applies to 90% of data but
# delays compression until data has settled a bit, which seems like a
# reasonable tradeoff.
# TODO evaluate zstd compression with a trained dictionary
# https://github.com/facebook/rocksdb/wiki/Compression
cfOpts.setBottommostCompression(Compression.lz4Compression)
cfOpts
proc getDbInitOptions(opts: DbOptions): DbOptionsRef =
result = defaultDbOptions()
result.setMaxOpenFiles(opts.maxOpenFiles)
result.setMaxBytesForLevelBase(opts.writeBufferSize)
if opts.rowCacheSize > 0:
result.setRowCache(cacheCreateLRU(opts.rowCacheSize))
if opts.blockCacheSize > 0:
let tableOpts = defaultTableOptions()
tableOpts.setBlockCache(cacheCreateLRU(opts.rowCacheSize))
result.setBlockBasedTableFactory(tableOpts)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Public constructor # Public constructor
@ -81,7 +31,8 @@ proc getDbInitOptions(opts: DbOptions): DbOptionsRef =
proc init*( proc init*(
rdb: var RdbInst; rdb: var RdbInst;
basePath: string; basePath: string;
opts: DbOptions; dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
): Result[void,(KvtError,string)] = ): Result[void,(KvtError,string)] =
## Database backend constructor for stand-alone version ## Database backend constructor for stand-alone version
## ##
@ -96,9 +47,6 @@ proc init*(
except OSError, IOError: except OSError, IOError:
return err((kdb.RdbBeCantCreateDataDir, "")) return err((kdb.RdbBeCantCreateDataDir, ""))
# Expand argument `opts` to rocksdb options
let (cfOpts, dbOpts) = (opts.getCFInitOptions, opts.getDbInitOptions)
# Column familiy names to allocate when opening the database. # Column familiy names to allocate when opening the database.
let cfs = KvtCFs.mapIt(($it).initColFamilyDescriptor cfOpts) let cfs = KvtCFs.mapIt(($it).initColFamilyDescriptor cfOpts)
@ -113,20 +61,15 @@ proc init*(
$col & " descriptor: " & error $col & " descriptor: " & error
ok() ok()
proc guestCFs*(T: type RdbInst, cfOpts: ColFamilyOptionsRef): seq =
KvtCFs.toSeq.mapIt(initColFamilyDescriptor($it, cfOpts))
proc init*( proc init*(
rdb: var RdbInst; rdb: var RdbInst;
adb: AristoDbRef; oCfs: openArray[ColFamilyReadWrite];
opts: DbOptions;
): Result[void,(KvtError,string)] = ): Result[void,(KvtError,string)] =
## Initalise column handlers piggy-backing on the `Aristo` backend. ## Initalise column handlers piggy-backing on the `Aristo` backend.
## ##
let
cfOpts = opts.getCFInitOptions()
iCfs = KvtCFs.toSeq.mapIt(initColFamilyDescriptor($it, cfOpts))
oCfs = adb.reinit(iCfs).valueOr:
return err((RdbBeHostError,$error))
# Collect column family descriptors (this stores implicitely `baseDb`) # Collect column family descriptors (this stores implicitely `baseDb`)
for n in KvtCFs: for n in KvtCFs:
assert oCfs[n.ord].name != "" # debugging only assert oCfs[n.ord].name != "" # debugging only

View File

@ -17,6 +17,7 @@ import
results, results,
unittest2, unittest2,
../../nimbus/db/opts, ../../nimbus/db/opts,
../../nimbus/db/core_db/backend/aristo_rocksdb,
../../nimbus/db/aristo/[ ../../nimbus/db/aristo/[
aristo_check, aristo_check,
aristo_debug, aristo_debug,
@ -104,10 +105,11 @@ iterator quadripartite(td: openArray[ProofTrieData]): LeafQuartet =
proc dbTriplet(w: LeafQuartet; rdbPath: string): Result[DbTriplet,AristoError] = proc dbTriplet(w: LeafQuartet; rdbPath: string): Result[DbTriplet,AristoError] =
let db = block: let db = block:
if 0 < rdbPath.len: if 0 < rdbPath.len:
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init()) let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
xCheckRc rc.error == 0: xCheckRc rc.error == 0:
result = err(rc.error) result = err(rc.error)
rc.value rc.value()[0]
else: else:
AristoDbRef.init MemBackendRef AristoDbRef.init MemBackendRef

View File

@ -17,6 +17,7 @@ import
unittest2, unittest2,
stew/endians2, stew/endians2,
../../nimbus/db/opts, ../../nimbus/db/opts,
../../nimbus/db/core_db/backend/aristo_rocksdb,
../../nimbus/db/aristo/[ ../../nimbus/db/aristo/[
aristo_check, aristo_check,
aristo_debug, aristo_debug,
@ -330,9 +331,10 @@ proc testTxMergeAndDeleteOneByOne*(
# Start with brand new persistent database. # Start with brand new persistent database.
db = block: db = block:
if 0 < rdbPath.len: if 0 < rdbPath.len:
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init()) let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
xCheckRc rc.error == 0 xCheckRc rc.error == 0
rc.value rc.value()[0]
else: else:
AristoDbRef.init(MemBackendRef) AristoDbRef.init(MemBackendRef)
@ -441,9 +443,10 @@ proc testTxMergeAndDeleteSubTree*(
# Start with brand new persistent database. # Start with brand new persistent database.
db = block: db = block:
if 0 < rdbPath.len: if 0 < rdbPath.len:
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init()) let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
xCheckRc rc.error == 0 xCheckRc rc.error == 0
rc.value rc.value()[0]
else: else:
AristoDbRef.init(MemBackendRef) AristoDbRef.init(MemBackendRef)
@ -545,9 +548,10 @@ proc testTxMergeProofAndKvpList*(
db = block: db = block:
# New DB with disabled filter slots management # New DB with disabled filter slots management
if 0 < rdbPath.len: if 0 < rdbPath.len:
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init()) let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
xCheckRc rc.error == 0 xCheckRc rc.error == 0
rc.value rc.value()[0]
else: else:
AristoDbRef.init(MemBackendRef) AristoDbRef.init(MemBackendRef)

2
vendor/nim-rocksdb vendored

@ -1 +1 @@
Subproject commit 293dc0745ea8386237546acb352a265a4bc874b5 Subproject commit f5dcb34ae83648bf5868618bc7fe916073b4455f