Add persistent backoff for peers (#497)

This commit is contained in:
Hanno Cornelius 2021-04-21 11:36:56 +02:00 committed by GitHub
parent c997860397
commit f0eadfec13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 75 additions and 47 deletions

View File

@ -21,21 +21,23 @@ suite "Peer Storage":
peerProto = "/waku/2/default-waku/codec" peerProto = "/waku/2/default-waku/codec"
stored = StoredInfo(peerId: peer.peerId, addrs: toHashSet([peerLoc]), protos: toHashSet([peerProto]), publicKey: peerKey.getKey().tryGet()) stored = StoredInfo(peerId: peer.peerId, addrs: toHashSet([peerLoc]), protos: toHashSet([peerProto]), publicKey: peerKey.getKey().tryGet())
conn = Connectedness.CanConnect conn = Connectedness.CanConnect
disconn = 999999
defer: storage.close() defer: storage.close()
# Test insert and retrieve # Test insert and retrieve
discard storage.put(peer.peerId, stored, conn) discard storage.put(peer.peerId, stored, conn, disconn)
var responseCount = 0 var responseCount = 0
proc data(peerId: PeerID, storedInfo: StoredInfo, proc data(peerId: PeerID, storedInfo: StoredInfo,
connectedness: Connectedness) = connectedness: Connectedness, disconnectTime: int64) =
responseCount += 1 responseCount += 1
check: check:
peerId == peer.peerId peerId == peer.peerId
storedInfo == stored storedInfo == stored
connectedness == conn connectedness == conn
disconnectTime == disconn
let res = storage.getAll(data) let res = storage.getAll(data)
@ -44,16 +46,17 @@ suite "Peer Storage":
responseCount == 1 responseCount == 1
# Test replace and retrieve (update an existing entry) # Test replace and retrieve (update an existing entry)
discard storage.put(peer.peerId, stored, Connectedness.CannotConnect) discard storage.put(peer.peerId, stored, Connectedness.CannotConnect, disconn + 10)
responseCount = 0 responseCount = 0
proc replacedData(peerId: PeerID, storedInfo: StoredInfo, proc replacedData(peerId: PeerID, storedInfo: StoredInfo,
connectedness: Connectedness) = connectedness: Connectedness, disconnectTime: int64) =
responseCount += 1 responseCount += 1
check: check:
peerId == peer.peerId peerId == peer.peerId
storedInfo == stored storedInfo == stored
connectedness == CannotConnect connectedness == CannotConnect
disconnectTime == disconn + 10
let repRes = storage.getAll(replacedData) let repRes = storage.getAll(replacedData)

View File

@ -42,6 +42,11 @@ type
staticnodes* {. staticnodes* {.
desc: "Peer multiaddr to directly connect with. Argument may be repeated." desc: "Peer multiaddr to directly connect with. Argument may be repeated."
name: "staticnode" }: seq[string] name: "staticnode" }: seq[string]
peerpersist* {.
desc: "Enable peer persistence: true|false",
defaultValue: false
name: "peerpersist" }: bool
storenode* {. storenode* {.
desc: "Peer multiaddr to query for storage.", desc: "Peer multiaddr to query for storage.",

View File

@ -1,7 +1,7 @@
{.push raises: [Defect, Exception].} {.push raises: [Defect, Exception].}
import import
std/[options, sets, sequtils], std/[options, sets, sequtils, times],
chronos, chronicles, metrics, chronos, chronicles, metrics,
./waku_peer_store, ./waku_peer_store,
../storage/peer/peer_storage ../storage/peer/peer_storage
@ -20,8 +20,8 @@ type
peerStore*: WakuPeerStore peerStore*: WakuPeerStore
storage: PeerStorage storage: PeerStorage
const let
defaultDialTimeout = 1.minutes # @TODO should this be made configurable? defaultDialTimeout = chronos.minutes(1) # @TODO should this be made configurable?
#################### ####################
# Helper functions # # Helper functions #
@ -32,9 +32,11 @@ proc toPeerInfo(storedInfo: StoredInfo): PeerInfo =
addrs = toSeq(storedInfo.addrs), addrs = toSeq(storedInfo.addrs),
protocols = toSeq(storedInfo.protos)) protocols = toSeq(storedInfo.protos))
proc insertOrReplace(ps: PeerStorage,
                     peerId: PeerID,
                     storedInfo: StoredInfo,
                     connectedness: Connectedness,
                     disconnectTime: int64 = 0) =
  ## Insert the peer's entry into persistent storage, or replace an existing
  ## entry with the updated info, by forwarding all arguments to `ps.put`.
  ##
  ## `disconnectTime` defaults to 0 when the caller does not supply one.
  ## A failed write is not propagated to the caller: it is logged as a
  ## warning and counted in `waku_peers_errors` under "storage_failure".
  let putResult = ps.put(peerId, storedInfo, connectedness, disconnectTime)

  if not putResult.isErr:
    # Entry stored; nothing to report.
    return

  warn "failed to store peers", err = putResult.error
  waku_peers_errors.inc(labelValues = ["storage_failure"])
@ -75,7 +77,7 @@ proc dialPeer(pm: PeerManager, peerId: PeerID,
proc loadFromStorage(pm: PeerManager) = proc loadFromStorage(pm: PeerManager) =
# Load peers from storage, if available # Load peers from storage, if available
proc onData(peerId: PeerID, storedInfo: StoredInfo, connectedness: Connectedness) = proc onData(peerId: PeerID, storedInfo: StoredInfo, connectedness: Connectedness, disconnectTime: int64) =
if peerId == pm.switch.peerInfo.peerId: if peerId == pm.switch.peerInfo.peerId:
# Do not manage self # Do not manage self
return return
@ -84,6 +86,7 @@ proc loadFromStorage(pm: PeerManager) =
pm.peerStore.protoBook.set(peerId, storedInfo.protos) pm.peerStore.protoBook.set(peerId, storedInfo.protos)
pm.peerStore.keyBook.set(peerId, storedInfo.publicKey) pm.peerStore.keyBook.set(peerId, storedInfo.publicKey)
pm.peerStore.connectionBook.set(peerId, NotConnected) # Reset connectedness state pm.peerStore.connectionBook.set(peerId, NotConnected) # Reset connectedness state
pm.peerStore.disconnectBook.set(peerId, disconnectTime)
let res = pm.storage.getAll(onData) let res = pm.storage.getAll(onData)
if res.isErr: if res.isErr:
@ -104,7 +107,7 @@ proc onConnEvent(pm: PeerManager, peerId: PeerID, event: ConnEvent) {.async.} =
of ConnEventKind.Disconnected: of ConnEventKind.Disconnected:
pm.peerStore.connectionBook.set(peerId, CanConnect) pm.peerStore.connectionBook.set(peerId, CanConnect)
if not pm.storage.isNil: if not pm.storage.isNil:
pm.storage.insertOrReplace(peerId, pm.peerStore.get(peerId), CanConnect) pm.storage.insertOrReplace(peerId, pm.peerStore.get(peerId), CanConnect, getTime().toUnix)
return return
proc new*(T: type PeerManager, switch: Switch, storage: PeerStorage = nil): PeerManager = proc new*(T: type PeerManager, switch: Switch, storage: PeerStorage = nil): PeerManager =
@ -195,13 +198,31 @@ proc selectPeer*(pm: PeerManager, proto: string): Option[PeerInfo] =
else: else:
return none(PeerInfo) return none(PeerInfo)
proc reconnectPeers*(pm: PeerManager,
                     proto: string,
                     backoff: chronos.Duration = chronos.seconds(0)) {.async.} =
  ## Reconnect to the peers registered for `proto`, one peer at a time.
  ## This will update connectedness and is especially useful to resume
  ## connections from persistent storage after a restart.
  ##
  ## Peers whose connection book entry is `CannotConnect` are skipped.
  ## For every other peer, `backoff` is counted from the disconnect time held
  ## in the disconnect book; if that period has not fully elapsed yet, the
  ## proc sleeps for the remainder before dialing. Dial results are discarded.

  debug "Reconnecting peers", proto=proto

  for storedInfo in pm.peers(proto):
    let peerId = storedInfo.peerId

    # Do not retry peers that are recorded as unreachable.
    if pm.peerStore.connectionBook.get(peerId) == CannotConnect:
      debug "Not reconnecting to unreachable peer", peerId=peerId
      continue

    # Respect the optional backoff period where applicable. Both moments are
    # built from Unix timestamps in seconds so that they are comparable.
    let
      lastDisconnect = Moment.init(pm.peerStore.disconnectBook.get(peerId), Second)  # Recorded disconnect time
      present = Moment.init(getTime().toUnix, Second)  # Current time, same representation as the recorded value
      waitLeft = lastDisconnect + backoff - present  # Backoff still outstanding since the last disconnect

    trace "Respecting backoff", backoff=backoff, disconnectTime=lastDisconnect, currentTime=present, backoffTime=waitLeft

    if waitLeft > ZeroDuration:
      # We disconnected recently and still need to wait out the rest of the
      # backoff period before connecting.
      debug "Backing off before reconnect...", peerId=peerId, backoffTime=waitLeft
      await sleepAsync(waitLeft)

    trace "Reconnecting to peer", peerId=peerId
    discard await pm.dialPeer(peerId, toSeq(storedInfo.addrs), proto)
@ -217,5 +238,9 @@ proc dialPeer*(pm: PeerManager, peerInfo: PeerInfo, proto: string, dialTimeout =
if not pm.hasPeer(peerInfo, proto): if not pm.hasPeer(peerInfo, proto):
trace "Adding newly dialed peer to manager", peerId = peerInfo.peerId, addr = peerInfo.addrs[0], proto = proto trace "Adding newly dialed peer to manager", peerId = peerInfo.peerId, addr = peerInfo.addrs[0], proto = proto
pm.addPeer(peerInfo, proto) pm.addPeer(peerInfo, proto)
if peerInfo.peerId == pm.switch.peerInfo.peerId:
# Do not attempt to dial self
return none(Connection)
return await pm.dialPeer(peerInfo.peerId, peerInfo.addrs, proto, dialTimeout) return await pm.dialPeer(peerInfo.peerId, peerInfo.addrs, proto, dialTimeout)

View File

@ -19,8 +19,11 @@ type
ConnectionBook* = object of PeerBook[Connectedness] ConnectionBook* = object of PeerBook[Connectedness]
DisconnectBook* = object of PeerBook[int64] # Keeps track of when peers were disconnected in Unix timestamps
WakuPeerStore* = ref object of PeerStore WakuPeerStore* = ref object of PeerStore
connectionBook*: ConnectionBook connectionBook*: ConnectionBook
disconnectBook*: DisconnectBook
proc new*(T: type WakuPeerStore): WakuPeerStore = proc new*(T: type WakuPeerStore): WakuPeerStore =
var p: WakuPeerStore var p: WakuPeerStore

View File

@ -12,12 +12,13 @@ type
PeerStorageResult*[T] = Result[T, string] PeerStorageResult*[T] = Result[T, string]
DataProc* = proc(peerId: PeerID, storedInfo: StoredInfo, DataProc* = proc(peerId: PeerID, storedInfo: StoredInfo,
connectedness: Connectedness) {.closure.} connectedness: Connectedness, disconnectTime: int64) {.closure.}
# PeerStorage interface # PeerStorage interface
method put*(db: PeerStorage,
            peerId: PeerID,
            storedInfo: StoredInfo,
            connectedness: Connectedness,
            disconnectTime: int64): PeerStorageResult[void] {.base.} =
  ## Base declaration of the "store a peer" operation of the PeerStorage
  ## interface: takes the peer id, its stored info, its connectedness state
  ## and a disconnect time.
  ##
  ## The base body does nothing, so the implicit default-initialised `result`
  ## is what callers receive.
  ## NOTE(review): confirm whether a default `PeerStorageResult[void]` reads
  ## as ok or as err for callers that hit this base implementation.
  discard
method getAll*(db: PeerStorage,
               onData: DataProc): PeerStorageResult[bool] {.base.} =
  ## Base declaration of the "retrieve all peers" operation of the
  ## PeerStorage interface; `onData` is the callback type used to hand
  ## entries back to the caller.
  ##
  ## The base body does nothing and never invokes `onData`; the implicit
  ## default-initialised `result` is returned.
  discard

View File

@ -56,16 +56,18 @@ proc encode*(storedInfo: StoredInfo): ProtoBuffer =
########################## ##########################
proc new*(T: type WakuPeerStorage, db: SqliteDatabase): PeerStorageResult[T] = proc new*(T: type WakuPeerStorage, db: SqliteDatabase): PeerStorageResult[T] =
## Create the "Peers" table ## Create the "Peer" table
## It contains: ## It contains:
## - peer id as primary key, stored as a blob ## - peer id as primary key, stored as a blob
## - stored info (serialised protobuf), stored as a blob ## - stored info (serialised protobuf), stored as a blob
## - last known enumerated connectedness state, stored as an integer ## - last known enumerated connectedness state, stored as an integer
## - disconnect time in epoch seconds, if applicable
let prepare = db.prepareStmt(""" let prepare = db.prepareStmt("""
CREATE TABLE IF NOT EXISTS Peers ( CREATE TABLE IF NOT EXISTS Peer (
peerId BLOB PRIMARY KEY, peerId BLOB PRIMARY KEY,
storedInfo BLOB, storedInfo BLOB,
connectedness INTEGER connectedness INTEGER,
disconnectTime INTEGER
) WITHOUT ROWID; ) WITHOUT ROWID;
""", NoParams, void) """, NoParams, void)
@ -82,19 +84,20 @@ proc new*(T: type WakuPeerStorage, db: SqliteDatabase): PeerStorageResult[T] =
method put*(db: WakuPeerStorage, method put*(db: WakuPeerStorage,
peerId: PeerID, peerId: PeerID,
storedInfo: StoredInfo, storedInfo: StoredInfo,
connectedness: Connectedness): PeerStorageResult[void] = connectedness: Connectedness,
disconnectTime: int64): PeerStorageResult[void] =
## Adds a peer to storage or replaces existing entry if it already exists ## Adds a peer to storage or replaces existing entry if it already exists
let prepare = db.database.prepareStmt( let prepare = db.database.prepareStmt(
"REPLACE INTO Peers (peerId, storedInfo, connectedness) VALUES (?, ?, ?);", "REPLACE INTO Peer (peerId, storedInfo, connectedness, disconnectTime) VALUES (?, ?, ?, ?);",
(seq[byte], seq[byte], int32), (seq[byte], seq[byte], int32, int64),
void void
) )
if prepare.isErr: if prepare.isErr:
return err("failed to prepare") return err("failed to prepare")
let res = prepare.value.exec((peerId.data, storedInfo.encode().buffer, int32(ord(connectedness)))) let res = prepare.value.exec((peerId.data, storedInfo.encode().buffer, int32(ord(connectedness)), disconnectTime))
if res.isErr: if res.isErr:
return err("failed") return err("failed")
@ -117,10 +120,12 @@ method getAll*(db: WakuPeerStorage, onData: peer_storage.DataProc): PeerStorageR
storedInfo = StoredInfo.init(@(toOpenArray(sTo, 0, sToL - 1))).tryGet() storedInfo = StoredInfo.init(@(toOpenArray(sTo, 0, sToL - 1))).tryGet()
# Connectedness # Connectedness
connectedness = Connectedness(sqlite3_column_int(s, 2)) connectedness = Connectedness(sqlite3_column_int(s, 2))
# DisconnectTime
disconnectTime = sqlite3_column_int64(s, 3)
onData(peerId, storedInfo, connectedness) onData(peerId, storedInfo, connectedness, disconnectTime)
let res = db.database.query("SELECT peerId, storedInfo, connectedness FROM Peers", peer) let res = db.database.query("SELECT peerId, storedInfo, connectedness, disconnectTime FROM Peer", peer)
if res.isErr: if res.isErr:
return err("failed") return err("failed")

View File

@ -10,6 +10,7 @@ import
# NOTE For TopicHandler, solve with exports? # NOTE For TopicHandler, solve with exports?
libp2p/protocols/pubsub/rpc/messages, libp2p/protocols/pubsub/rpc/messages,
libp2p/protocols/pubsub/pubsub, libp2p/protocols/pubsub/pubsub,
libp2p/protocols/pubsub/gossipsub,
libp2p/standard_setup, libp2p/standard_setup,
../protocol/[waku_relay, waku_message, message_notifier], ../protocol/[waku_relay, waku_message, message_notifier],
../protocol/waku_store/waku_store, ../protocol/waku_store/waku_store,
@ -419,31 +420,16 @@ proc mountRelay*(node: WakuNode, topics: seq[string] = newSeq[string](), rlnRela
info "mounting relay" info "mounting relay"
node.subscribe(defaultTopic, none(TopicHandler))
for topic in topics:
node.subscribe(topic, none(TopicHandler))
if node.peerManager.hasPeers(WakuRelayCodec): if node.peerManager.hasPeers(WakuRelayCodec):
trace "Found previous WakuRelay peers. Reconnecting." trace "Found previous WakuRelay peers. Reconnecting."
# Reconnect to previous relay peers # Reconnect to previous relay peers. This will respect a backoff period, if necessary
waitFor node.peerManager.reconnectPeers(WakuRelayCodec) waitFor node.peerManager.reconnectPeers(WakuRelayCodec,
wakuRelay.parameters.pruneBackoff + chronos.seconds(BackoffSlackTime))
## GossipSub specifies a backoff period after disconnecting and unsubscribing before attempting
## to re-graft peer on previous topics. We have to respect this period before starting WakuRelay.
trace "Backing off before grafting after reconnecting to WakuRelay peers", backoff=wakuRelay.parameters.pruneBackoff
proc subscribeFuture() {.async.} =
# Subscribe after the backoff period
await sleepAsync(wakuRelay.parameters.pruneBackoff)
node.subscribe(defaultTopic, none(TopicHandler))
for topic in topics:
node.subscribe(topic, none(TopicHandler))
discard subscribeFuture() # Dispatch future, but do not await.
else:
# Subscribe immediately
node.subscribe(defaultTopic, none(TopicHandler))
for topic in topics:
node.subscribe(topic, none(TopicHandler))
if rlnRelayEnabled: if rlnRelayEnabled:
# TODO pass rln relay inputs to this proc, right now it uses default values that are set in the mountRlnRelay proc # TODO pass rln relay inputs to this proc, right now it uses default values that are set in the mountRlnRelay proc
@ -595,7 +581,7 @@ when isMainModule:
var pStorage: WakuPeerStorage var pStorage: WakuPeerStorage
if not sqliteDatabase.isNil: if conf.peerpersist and not sqliteDatabase.isNil:
let res = WakuPeerStorage.new(sqliteDatabase) let res = WakuPeerStorage.new(sqliteDatabase)
if res.isErr: if res.isErr:
warn "failed to init new WakuPeerStorage", err = res.error warn "failed to init new WakuPeerStorage", err = res.error