2022-11-04 09:40:13 +01:00
|
|
|
|
# Module-wide exception tracking: compilers older than 1.4 get `Defect`
# listed explicitly in the pushed `raises` set, newer ones an empty list.
when (NimMajor, NimMinor) >= (1, 4):
  {.push raises: [].}
else:
  {.push raises: [Defect].}
|
|
|
|
|
|
2021-02-04 12:32:58 +02:00
|
|
|
|
|
|
|
|
|
import
|
2023-04-14 15:12:22 +02:00
|
|
|
|
std/[options, sets, sequtils, times, strutils, math],
|
2022-11-24 14:11:23 +01:00
|
|
|
|
chronos,
|
|
|
|
|
chronicles,
|
2022-11-04 09:40:13 +01:00
|
|
|
|
metrics,
|
2023-04-12 13:05:34 +02:00
|
|
|
|
libp2p/multistream,
|
|
|
|
|
libp2p/muxers/muxer
|
2022-11-04 09:40:13 +01:00
|
|
|
|
import
|
2023-04-19 14:27:16 +02:00
|
|
|
|
../../../common/nimchronos,
|
2023-04-18 15:22:10 +02:00
|
|
|
|
../../waku_relay,
|
2022-11-04 09:40:13 +01:00
|
|
|
|
../../utils/peers,
|
|
|
|
|
./peer_store/peer_storage,
|
|
|
|
|
./waku_peer_store
|
2021-02-04 12:32:58 +02:00
|
|
|
|
|
2021-10-06 14:29:08 +02:00
|
|
|
|
export waku_peer_store, peer_storage, peers
|
2021-02-05 12:49:11 +02:00
|
|
|
|
|
|
|
|
|
# Dial attempts made through `connectRelay`, labelled by their outcome
declareCounter waku_peers_dials,
  "Number of peer dials",
  labels = ["outcome"]

# TODO: Populate from PeerStore.Source when ready
declarePublicCounter waku_node_conns_initiated,
  "Number of connections initiated",
  labels = ["source"]

# Errors raised inside the peer manager (e.g. persistent storage failures)
declarePublicGauge waku_peers_errors,
  "Number of peer manager errors",
  labels = ["type"]

declarePublicGauge waku_connected_peers,
  "Number of physical connections per direction and protocol",
  labels = ["direction", "protocol"]

declarePublicGauge waku_streams_peers,
  "Number of streams per direction and protocol",
  labels = ["direction", "protocol"]

declarePublicGauge waku_peer_store_size,
  "Number of peers managed by the peer store"

# NOTE(review): the second label is named `peerId`, but `addServicePeer`
# fills it with the peer's first multiaddress.
declarePublicGauge waku_service_peers,
  "Service peer protocol and multiaddress ",
  labels = ["protocol", "peerId"]

logScope:
  topics = "waku node peer_manager"
|
2021-02-05 12:49:11 +02:00
|
|
|
|
|
2022-12-14 16:04:11 +01:00
|
|
|
|
const
  # Default time allowed for a single connect/dial attempt.
  # TODO: Make configurable
  DefaultDialTimeout = chronos.seconds(10)

  # Number of failed connection attempts after which a peer is no longer
  # retried and becomes a candidate for pruning
  MaxFailedAttempts = 5

  # The waiting time before dialing a failed peer again is:
  #   initialBackoffInSec * (backoffFactor ^ (failedAttempts - 1))
  # With the defaults below: 120s, 480s, 1920s, 7680s
  InitialBackoffInSec = 120
  BackoffFactor = 4

  # Upper bound on the number of parallel dials per connectivity round
  MaxParalelDials = 10

  # Pause between two consecutive relayConnectivityLoop iterations
  ConnectivityLoopInterval = chronos.seconds(15)

  # Period of the peer store pruning loop
  PrunePeerStoreInterval = chronos.minutes(5)

  # Period at which connection/stream metrics are refreshed
  UpdateMetricsInterval = chronos.seconds(15)
|
|
|
|
|
|
2023-01-23 21:24:46 +01:00
|
|
|
|
type
  PeerManager* = ref object of RootObj
    ## Keeps track of known peers, their connection state and the retry
    ## policy used when (re)connecting to them.
    switch*: Switch                 # libp2p switch used to connect/dial and to receive peer events
    peerStore*: PeerStore           # set to `switch.peerStore` by `new`
    initialBackoffInSec*: int       # backoff (seconds) applied after the first failed attempt
    backoffFactor*: int             # multiplier applied to the backoff on every further failure
    maxFailedAttempts*: int         # failures after which a peer is not retried anymore
    storage: PeerStorage            # optional persistent storage; nil when persistence is disabled
    serviceSlots*: Table[string, RemotePeerInfo]  # protocol codec -> peer slotted for that service
    started: bool                   # background loops keep running while this is true
|
2023-01-23 21:24:46 +01:00
|
|
|
|
|
2023-02-27 18:24:31 +01:00
|
|
|
|
proc protocolMatcher*(codec: string): Matcher =
  ## Builds a matcher that accepts every protocol id starting with `codec`.
  ##
  ## E.g. for the codec `/vac/waku/filter/2.0.0` the matcher accepts
  ## `/vac/waku/filter/2.0.0`, `/vac/waku/filter/2.0.0-beta3` and
  ## `/vac/waku/filter/2.0.0-actualnonsense`.
  proc hasCodecPrefix(proto: string): bool {.gcsafe.} =
    # Any postfix after the codec is tolerated
    proto.startsWith(codec)

  hasCodecPrefix
|
|
|
|
|
|
2023-04-14 15:12:22 +02:00
|
|
|
|
proc calculateBackoff(initialBackoffInSec: int,
                      backoffFactor: int,
                      failedAttempts: int): timer.Duration =
  ## Returns the time to wait before dialing a peer again, computed as
  ## `initialBackoffInSec * (backoffFactor ^ (failedAttempts - 1))` seconds.
  ##
  ## A peer without failed attempts needs no backoff. Non-positive values of
  ## `failedAttempts` are all treated that way, so a negative count can never
  ## reach `^` with a negative exponent.
  if failedAttempts <= 0:
    return chronos.seconds(0)
  return chronos.seconds(initialBackoffInSec*(backoffFactor^(failedAttempts-1)))
|
|
|
|
|
|
2021-03-26 10:49:51 +02:00
|
|
|
|
####################
|
|
|
|
|
# Helper functions #
|
|
|
|
|
####################
|
|
|
|
|
|
2021-04-21 11:36:56 +02:00
|
|
|
|
proc insertOrReplace(ps: PeerStorage,
                     peerId: PeerID,
                     remotePeerInfo: RemotePeerInfo,
                     connectedness: Connectedness,
                     disconnectTime: int64 = 0) =
  ## Writes the peer entry to persistent storage, replacing any existing
  ## entry. A failure is only logged and counted in `waku_peers_errors`;
  ## it is never propagated to the caller.
  let putResult = ps.put(peerId, remotePeerInfo, connectedness, disconnectTime)
  if putResult.isOk:
    return

  warn "failed to store peers", err = putResult.error
  waku_peers_errors.inc(labelValues = ["storage_failure"])
|
|
|
|
|
|
2023-03-28 13:29:48 +02:00
|
|
|
|
proc addPeer*(pm: PeerManager, remotePeerInfo: RemotePeerInfo) =
  ## Registers `remotePeerInfo` in the peer store (addresses and public key)
  ## and, when persistent storage is configured, stores it as `NotConnected`.
  ## Our own peer id and peers already known with identical data are ignored.
  let peerId = remotePeerInfo.peerId

  # We never manage ourselves
  if peerId == pm.switch.peerInfo.peerId:
    return

  # Public key derived from the peer id (extraction result is ignored)
  var publicKey: PublicKey
  discard peerId.extractPublicKey(publicKey)

  # Nothing to do when the stored addresses and key already match
  if pm.peerStore[AddressBook][peerId] == remotePeerInfo.addrs and
     pm.peerStore[KeyBook][peerId] == publicKey:
    return

  trace "Adding peer to manager", peerId = peerId, addresses = remotePeerInfo.addrs

  pm.peerStore[AddressBook][peerId] = remotePeerInfo.addrs
  pm.peerStore[KeyBook][peerId] = publicKey

  # Persist the peer; connectedness information is updated later on
  if pm.storage.isNil:
    return
  pm.storage.insertOrReplace(peerId, pm.peerStore.get(peerId), NotConnected)
|
|
|
|
|
|
|
|
|
|
# Connects to a given node. Note that this function uses `connect` and
# does not provide a protocol. Streams for relay (gossipsub) are created
# automatically without the need to dial.
proc connectRelay*(pm: PeerManager,
                   peer: RemotePeerInfo,
                   dialTimeout = DefaultDialTimeout,
                   source = "api"): Future[bool] {.async.} =
  ## Tries to connect to `peer` within `dialTimeout`.
  ##
  ## Returns `true` when the connection was established, `false` otherwise
  ## (including when `peer` is this node itself). On success the failed
  ## attempts counter of the peer is reset; on failure it is incremented, the
  ## failure time is recorded and the peer is marked as `CannotConnect`.
  ## `source` is only used as the label of `waku_node_conns_initiated`.
  let peerId = peer.peerId

  # Do not attempt to dial self
  if peerId == pm.switch.peerInfo.peerId:
    return false

  if not pm.peerStore.hasPeer(peerId, WakuRelayCodec):
    pm.addPeer(peer)

  let failedAttempts = pm.peerStore[NumberFailedConnBook][peerId]
  debug "Connecting to relay peer", wireAddr=peer.addrs, peerId=peerId, failedAttempts=failedAttempts

  # Race the connection attempt against the timeout
  let deadline = sleepAsync(dialTimeout)
  let workfut = pm.switch.connect(peerId, peer.addrs)
  var reasonFailed = ""

  try:
    await workfut or deadline
    if workfut.finished():
      if not deadline.finished():
        deadline.cancel()
      waku_peers_dials.inc(labelValues = ["successful"])
      waku_node_conns_initiated.inc(labelValues = [source])
      pm.peerStore[NumberFailedConnBook][peerId] = 0
      return true
    else:
      reasonFailed = "timed out"
      await cancelAndWait(workfut)
  except CatchableError:
    reasonFailed = "remote peer failed"
    # The connection attempt failed before the timeout: do not leave the
    # timer pending until it expires
    if not deadline.finished():
      deadline.cancel()

  # Dial failed
  pm.peerStore[NumberFailedConnBook][peerId] = pm.peerStore[NumberFailedConnBook][peerId] + 1
  pm.peerStore[LastFailedConnBook][peerId] = Moment.init(getTime().toUnix, Second)
  pm.peerStore[ConnectionBook][peerId] = CannotConnect

  debug "Connecting relay peer failed",
          peerId = peerId,
          reason = reasonFailed,
          failedAttempts = pm.peerStore[NumberFailedConnBook][peerId]
  waku_peers_dials.inc(labelValues = [reasonFailed])

  return false
|
|
|
|
|
|
|
|
|
|
# Dialing is meant only for protocols that need a stream to write to and
# read from. It shall not be used for Relay protocols, since that would
# create unnecessary unused streams.
proc dialPeer(pm: PeerManager,
              peerId: PeerID,
              addrs: seq[MultiAddress],
              proto: string,
              dialTimeout = DefaultDialTimeout,
              source = "api"): Future[Option[Connection]] {.async.} =
  ## Dials `peerId` at `addrs` for `proto`, waiting at most `dialTimeout`.
  ## Returns the opened connection, or `none` when dialing ourselves, when
  ## `proto` is the relay codec, on timeout or on any catchable error.

  # Refuse to dial ourselves
  if peerId == pm.switch.peerInfo.peerId:
    error "could not dial self"
    return none(Connection)

  # Relay peers must be connected with `connectRelay` instead
  if proto == WakuRelayCodec:
    error "dial shall not be used to connect to relays"
    return none(Connection)

  debug "Dialing peer", wireAddr=addrs, peerId=peerId, proto=proto

  let dialFut = pm.switch.dial(peerId, addrs, proto)
  var reasonFailed = ""
  try:
    let completedInTime = await dialFut.withTimeout(dialTimeout)
    if completedInTime:
      return some(dialFut.read())

    # Timeout expired first: abort the pending dial
    reasonFailed = "timeout"
    await cancelAndWait(dialFut)
  except CatchableError:
    reasonFailed = "failed"

  debug "Dialing peer failed", peerId=peerId, reason=reasonFailed, proto=proto

  return none(Connection)
|
|
|
|
|
|
2021-07-14 19:58:46 +02:00
|
|
|
|
proc loadFromStorage(pm: PeerManager) =
  ## Fills the peer store with every peer found in persistent storage.
  ## All loaded peers start as `NotConnected`; our own entry is skipped.
  debug "loading peers from storage"

  # Callback invoked by the storage for every persisted peer
  proc onData(peerId: PeerID, remotePeerInfo: RemotePeerInfo, connectedness: Connectedness, disconnectTime: int64) =
    trace "loading peer", peerId=peerId, connectedness=connectedness

    # Do not manage self
    if peerId == pm.switch.peerInfo.peerId:
      return

    # Books provided by nim-libp2p
    pm.peerStore[AddressBook][peerId] = remotePeerInfo.addrs
    pm.peerStore[ProtoBook][peerId] = remotePeerInfo.protocols
    pm.peerStore[KeyBook][peerId] = remotePeerInfo.publicKey
    pm.peerStore[AgentBook][peerId] = remotePeerInfo.agent
    pm.peerStore[ProtoVersionBook][peerId] = remotePeerInfo.protoVersion

    # Custom books. The persisted connectedness is discarded on purpose:
    # the state is reset to not connected.
    pm.peerStore[ConnectionBook][peerId] = NotConnected
    pm.peerStore[DisconnectBook][peerId] = disconnectTime
    pm.peerStore[SourceBook][peerId] = remotePeerInfo.origin

  let queryResult = pm.storage.getAll(onData)
  if queryResult.isOk:
    debug "successfully queried peer storage"
  else:
    warn "failed to load peers from storage", err = queryResult.error
    waku_peers_errors.inc(labelValues = ["storage_load_failure"])
|
2022-11-24 14:11:23 +01:00
|
|
|
|
|
2023-04-14 15:12:22 +02:00
|
|
|
|
proc canBeConnected*(pm: PeerManager,
                     peerId: PeerId): bool =
  ## Tells whether a connection attempt to `peerId` is allowed right now,
  ## given its past failures. An exponential backoff is applied: the more
  ## attempts have failed, the longer we wait after the last failure.
  let failedAttempts = pm.peerStore[NumberFailedConnBook][peerId]

  # A peer that never failed can always be tried
  if failedAttempts == 0:
    return true

  # Too many failures: do not reconnect
  if failedAttempts >= pm.maxFailedAttempts:
    return false

  # Otherwise the backoff counted from the last failure must have elapsed
  let
    now = Moment.init(getTime().toUnix, Second)
    lastFailed = pm.peerStore[LastFailedConnBook][peerId]
    backoff = calculateBackoff(pm.initialBackoffInSec, pm.backoffFactor, failedAttempts)

  return now >= (lastFailed + backoff)
|
|
|
|
|
|
2021-03-26 10:49:51 +02:00
|
|
|
|
##################
|
|
|
|
|
# Initialisation #
|
2022-11-24 14:11:23 +01:00
|
|
|
|
##################
|
2021-03-26 10:49:51 +02:00
|
|
|
|
|
2023-02-14 15:38:32 +01:00
|
|
|
|
# Currently disabled. Note that the connected/disconnected state of a peer
# can't be tracked with this handler when more than one connection is allowed
# and when using autonat. E.g. if a peer has 2 connections and one of them is
# closed, we can't assume the peer is disconnected, because the other one may
# still be active. Note that even with maxconn = 1, autonat forces more than
# one connection.
proc onConnEvent(pm: PeerManager, peerId: PeerID, event: ConnEvent) {.async.} =
  ## Connection event handler. Both event kinds are deliberately ignored for
  ## now; peer state is tracked by `onPeerEvent` instead.
  case event.kind
  of ConnEventKind.Connected:
    discard
  of ConnEventKind.Disconnected:
    discard
|
|
|
|
|
|
|
|
|
|
proc onPeerEvent(pm: PeerManager, peerId: PeerId, event: PeerEvent) {.async.} =
  ## Keeps the connectedness and direction books (and the persistent storage,
  ## when configured) in sync with peers joining and leaving.
  if event.kind == PeerEventKind.Left:
    pm.peerStore[DirectionBook][peerId] = UnknownDirection
    pm.peerStore[ConnectionBook][peerId] = CanConnect

    # Persist the disconnection together with the current unix time
    if not pm.storage.isNil:
      pm.storage.insertOrReplace(peerId, pm.peerStore.get(peerId), CanConnect, getTime().toUnix)

  elif event.kind == PeerEventKind.Joined:
    # We are the initiator of outbound connections
    let direction = if event.initiator: Outbound else: Inbound
    pm.peerStore[ConnectionBook][peerId] = Connected
    pm.peerStore[DirectionBook][peerId] = direction

    if not pm.storage.isNil:
      pm.storage.insertOrReplace(peerId, pm.peerStore.get(peerId), Connected)
|
|
|
|
|
|
2023-01-23 21:24:46 +01:00
|
|
|
|
proc new*(T: type PeerManager,
          switch: Switch,
          storage: PeerStorage = nil,
          initialBackoffInSec = InitialBackoffInSec,
          backoffFactor = BackoffFactor,
          maxFailedAttempts = MaxFailedAttempts): PeerManager =
  ## Creates a peer manager on top of `switch` and its peer store.
  ##
  ## Raises a `Defect` when the switch allows more connections than the peer
  ## store can hold, or when the configured backoff can exceed one week.
  ## When `storage` is given, previously persisted peers are loaded from it.

  # The peer store must be able to hold every connected peer
  let
    capacity = switch.peerStore.capacity
    maxConnections = switch.connManager.inSema.size
  if maxConnections > capacity:
    error "Max number of connections can't be greater than PeerManager capacity",
         capacity = capacity,
         maxConnections = maxConnections
    raise newException(Defect, "Max number of connections can't be greater than PeerManager capacity")

  # Compute the largest backoff up front to reject overflowing or
  # unreasonably high configurations
  let backoff = calculateBackoff(initialBackoffInSec, backoffFactor, maxFailedAttempts)
  if backoff.weeks() > 1:
    error "Max backoff time can't be over 1 week",
        maxBackoff=backoff
    raise newException(Defect, "Max backoff time can't be over 1 week")

  let pm = PeerManager(
    switch: switch,
    peerStore: switch.peerStore,
    storage: storage,
    initialBackoffInSec: initialBackoffInSec,
    backoffFactor: backoffFactor,
    maxFailedAttempts: maxFailedAttempts)

  # Only referenced by the disabled connection handlers below
  proc connHook(peerId: PeerID, event: ConnEvent): Future[void] {.gcsafe.} =
    onConnEvent(pm, peerId, event)

  proc peerHook(peerId: PeerId, event: PeerEvent): Future[void] {.gcsafe.} =
    onPeerEvent(pm, peerId, event)

  # Refreshes the peer store size metric
  proc peerStoreChanged(peerId: PeerId) {.gcsafe.} =
    waku_peer_store_size.set(toSeq(pm.peerStore[AddressBook].book.keys).len.int64)

  # currently disabled
  #pm.switch.addConnEventHandler(connHook, ConnEventKind.Connected)
  #pm.switch.addConnEventHandler(connHook, ConnEventKind.Disconnected)

  pm.switch.addPeerEventHandler(peerHook, PeerEventKind.Joined)
  pm.switch.addPeerEventHandler(peerHook, PeerEventKind.Left)

  # Invoked every time the address book of the peer store is updated
  pm.peerStore[AddressBook].addHandler(peerStoreChanged)

  pm.serviceSlots = initTable[string, RemotePeerInfo]()

  if storage.isNil():
    debug "no peer storage found"
  else:
    debug "found persistent peer storage"
    pm.loadFromStorage() # Load previously managed peers.

  pm
|
2021-02-04 12:32:58 +02:00
|
|
|
|
|
2021-02-11 10:58:25 +02:00
|
|
|
|
#####################
|
|
|
|
|
# Manager interface #
|
|
|
|
|
#####################
|
|
|
|
|
|
2023-01-26 10:20:20 +01:00
|
|
|
|
proc addServicePeer*(pm: PeerManager, remotePeerInfo: RemotePeerInfo, proto: string) =
  ## Slots `remotePeerInfo` as the service peer for `proto` and adds it to
  ## the peer manager. Relay peers and peers without any address are rejected
  ## with a warning.

  # Do not add relay peers
  if proto == WakuRelayCodec:
    warn "Can't add relay peer to service peers slots"
    return

  # The first address is logged and used as a metric label below; a peer
  # without addresses would otherwise fail with an IndexDefect
  if remotePeerInfo.addrs.len == 0:
    warn "Can't add service peer without addresses", peerId = remotePeerInfo.peerId, service = proto
    return

  info "Adding peer to service slots", peerId = remotePeerInfo.peerId, addr = remotePeerInfo.addrs[0], service = proto
  waku_service_peers.set(1, labelValues = [$proto, $remotePeerInfo.addrs[0]])

  # Set peer for service slot
  pm.serviceSlots[proto] = remotePeerInfo

  pm.addPeer(remotePeerInfo)
|
2023-01-26 10:20:20 +01:00
|
|
|
|
|
2021-07-27 08:48:56 +02:00
|
|
|
|
proc reconnectPeers*(pm: PeerManager,
                     proto: string,
                     backoff: chronos.Duration = chronos.seconds(0)) {.async.} =
  ## Reconnects to the peers registered for `proto`, which updates their
  ## connectedness. Particularly useful to resume connections loaded from
  ## persistent storage after a restart. `backoff` is the minimum time that
  ## must have passed since a peer disconnected before it is dialed again.

  debug "Reconnecting peers", proto=proto

  # Proto is not persisted, we need to iterate over all peers.
  let matcher = protocolMatcher(proto)
  for peerInfo in pm.peerStore.peers(matcher):
    # Skip peers already known to be unreachable
    if peerInfo.connectedness == CannotConnect:
      debug "Not reconnecting to unreachable or non-existing peer", peerId=peerInfo.peerId
      continue

    # Respect the optional backoff period where applicable.
    # TODO: Add method to peerStore (eg isBackoffExpired())
    let lastDisconnect = Moment.init(peerInfo.disconnectTime, Second)
    # Current time, comparable to the persisted value
    let rightNow = Moment.init(getTime().toUnix, Second)
    # Time still to wait, given the time elapsed since the last disconnect
    let remaining = lastDisconnect + backoff - rightNow

    trace "Respecting backoff", backoff=backoff, disconnectTime=lastDisconnect, currentTime=rightNow, backoffTime=remaining

    # TODO: This blocks the whole function. Try to connect to another peer in the meantime.
    if remaining > ZeroDuration:
      # Disconnected recently: wait for the rest of the backoff period
      debug "Backing off before reconnect...", peerId=peerInfo.peerId, backoffTime=remaining
      await sleepAsync(remaining)

    discard await pm.connectRelay(peerInfo)
|
2021-03-26 10:49:51 +02:00
|
|
|
|
|
2021-02-08 11:17:20 +02:00
|
|
|
|
####################
|
|
|
|
|
# Dialer interface #
|
|
|
|
|
####################
|
|
|
|
|
|
2022-12-14 16:04:11 +01:00
|
|
|
|
proc dialPeer*(pm: PeerManager,
               remotePeerInfo: RemotePeerInfo,
               proto: string,
               dialTimeout = DefaultDialTimeout,
               source = "api",
               ): Future[Option[Connection]] {.async.} =
  ## Dials the given peer for `proto` and adds it to the list of known peers.
  ## Returns the opened connection, or `none` when the dial did not succeed.
  # TODO: check peer validity and score before continuing. Limit number of peers to be managed.

  # First add dialed peer info to peer store, if it does not exist yet..
  # TODO: nim libp2p peerstore already adds them
  if not pm.peerStore.hasPeer(remotePeerInfo.peerId, proto):
    # Only the first address is logged; tolerate peers without any address
    trace "Adding newly dialed peer to manager",
      peerId = $remotePeerInfo.peerId,
      address = (if remotePeerInfo.addrs.len > 0: $remotePeerInfo.addrs[0] else: ""),
      proto = proto
    pm.addPeer(remotePeerInfo)

  return await pm.dialPeer(remotePeerInfo.peerId, remotePeerInfo.addrs, proto, dialTimeout, source)
|
2021-10-06 14:29:08 +02:00
|
|
|
|
|
2022-12-14 16:04:11 +01:00
|
|
|
|
proc dialPeer*(pm: PeerManager,
               peerId: PeerID,
               proto: string,
               dialTimeout = DefaultDialTimeout,
               source = "api"
               ): Future[Option[Connection]] {.async.} =
  ## Dials an already known peer for `proto`, using the addresses recorded
  ## for it in the switch's peer store.
  # TODO: check peer validity and score before continuing. Limit number of peers to be managed.
  let knownAddrs = pm.switch.peerStore[AddressBook][peerId]
  let conn = await pm.dialPeer(peerId, knownAddrs, proto, dialTimeout, source)
  return conn
|
2021-10-06 14:29:08 +02:00
|
|
|
|
|
2022-12-14 16:04:11 +01:00
|
|
|
|
proc connectToNodes*(pm: PeerManager,
                     nodes: seq[string]|seq[RemotePeerInfo],
                     dialTimeout = DefaultDialTimeout,
                     source = "api") {.async.} =
  ## Connects, in parallel, to every node in `nodes` as a relay peer and
  ## waits for all the attempts to finish. Entries that cannot be parsed are
  ## logged and skipped. `dialTimeout` and `source` are applied to each
  ## individual connection attempt.
  if nodes.len == 0:
    return

  info "Dialing multiple peers", numOfPeers = nodes.len

  var futConns: seq[Future[bool]]
  for node in nodes:
    let parsedNode = parsePeerInfo(node)
    if parsedNode.isOk():
      # Forward the caller's timeout and source instead of the defaults
      futConns.add(pm.connectRelay(parsedNode.value, dialTimeout, source))
    else:
      error "Couldn't parse node info", error = parsedNode.error

  await allFutures(futConns)
  let successfulConns = futConns.mapIt(it.read()).countIt(it)

  info "Finished dialing multiple peers", successfulConns=successfulConns, attempted=nodes.len

  # The issue seems to be around peers not being fully connected when
  # trying to subscribe. So what we do is sleep to guarantee nodes are
  # fully connected.
  #
  # This issue was known to Dmitriy on nim-libp2p and may be resolvable
  # later.
  await sleepAsync(chronos.seconds(5))
|
2023-01-18 15:17:56 +01:00
|
|
|
|
|
2023-04-12 13:05:34 +02:00
|
|
|
|
proc getNumConnections*(pm: PeerManager, dir: Direction, protocol: string): int =
  ## Counts the physical connections with transport direction `dir` that
  ## carry at least one stream negotiated for `protocol`.
  for _, muxers in pm.switch.connManager.getConnections():
    for peerConn in muxers:
      let streams = peerConn.getStreams()
      if peerConn.connection.transportDir != dir:
        continue
      if streams.anyIt(it.protocol == protocol):
        inc result
|
|
|
|
|
|
|
|
|
|
proc getNumStreams*(pm: PeerManager, dir: Direction, protocol: string): int =
  ## Counts, over all connections of the switch, the streams negotiated for
  ## `protocol` whose direction is `dir`.
  for _, muxers in pm.switch.connManager.getConnections():
    for peerConn in muxers:
      result += peerConn.getStreams().countIt(it.protocol == protocol and it.dir == dir)
|
|
|
|
|
|
2023-02-27 18:24:31 +01:00
|
|
|
|
proc connectToRelayPeers*(pm: PeerManager) {.async.} =
  ## Connects to more relay peers unless enough of them are connected
  ## already. Candidates are the not connected peers of the peer store whose
  ## backoff has expired; at most `MaxParalelDials` are dialed per call.
  let
    maxConnections = pm.switch.connManager.inSema.size
    inRelayPeers = pm.getNumConnections(Direction.In, WakuRelayCodec)
    outRelayPeers = pm.getNumConnections(Direction.Out, WakuRelayCodec)
    totalRelayPeers = inRelayPeers + outRelayPeers

  # Leave some room for service peers
  if totalRelayPeers >= (maxConnections - 5):
    return

  # TODO: Track only relay connections (nwaku/issues/1566)
  let
    notConnectedPeers = pm.peerStore.getNotConnectedPeers().mapIt(RemotePeerInfo.init(it.peerId, it.addrs))
    outsideBackoffPeers = notConnectedPeers.filterIt(pm.canBeConnected(it.peerId))
    # Bounded by the free connection slots, the candidates and the dial limit
    numPeersToConnect = min([maxConnections - totalRelayPeers,
                             outsideBackoffPeers.len,
                             MaxParalelDials])

  info "Relay peer connections",
    inRelayConns = inRelayPeers,
    outRelayConns = outRelayPeers,
    totalRelayConns = totalRelayPeers,
    targetConnectedPeers = maxConnections,
    notConnectedPeers = notConnectedPeers.len,
    outsideBackoffPeers = outsideBackoffPeers.len

  await pm.connectToNodes(outsideBackoffPeers[0..<numPeersToConnect])
|
2023-01-26 10:20:20 +01:00
|
|
|
|
|
2023-01-31 13:24:49 +01:00
|
|
|
|
proc prunePeerStore*(pm: PeerManager) =
  ## Removes peers from the peer store when it holds more peers than its
  ## capacity. Peers that reached `maxFailedAttempts` are removed first; if
  ## that is not enough, not connected peers are removed as well.
  let numPeers = pm.peerStore[AddressBook].book.len
  let capacity = pm.peerStore.capacity
  # Nothing to prune while the store is within (or exactly at) its capacity
  if numPeers <= capacity:
    return

  debug "Peer store capacity exceeded", numPeers = numPeers, capacity = capacity
  let peersToPrune = numPeers - capacity

  # prune peers with too many failed attempts
  var pruned = 0
  # copy to avoid modifying the book while iterating
  let peerKeys = toSeq(pm.peerStore[NumberFailedConnBook].book.keys)
  for peerId in peerKeys:
    if peersToPrune - pruned == 0:
      break
    if pm.peerStore[NumberFailedConnBook][peerId] >= pm.maxFailedAttempts:
      pm.peerStore.del(peerId)
      pruned += 1

  # if we still need to prune, prune peers that are not connected
  let notConnected = pm.peerStore.getNotConnectedPeers().mapIt(it.peerId)
  for peerId in notConnected:
    if peersToPrune - pruned == 0:
      break
    pm.peerStore.del(peerId)
    pruned += 1

  let afterNumPeers = pm.peerStore[AddressBook].book.len
  debug "Finished pruning peer store", beforeNumPeers = numPeers,
                                       afterNumPeers = afterNumPeers,
                                       capacity = capacity,
                                       pruned = pruned
|
|
|
|
|
|
2023-01-26 10:20:20 +01:00
|
|
|
|
proc selectPeer*(pm: PeerManager, proto: string): Option[RemotePeerInfo] =
  ## Selects a peer supporting `proto`.
  ##
  ## For every protocol but relay, the peer slotted for that service (if any)
  ## takes precedence. Otherwise the first peer the peer store returns for
  ## the protocol is used; `none` is returned when there is no such peer.
  debug "Selecting peer from peerstore", protocol=proto

  let peers = pm.peerStore.getPeersByProtocol(proto)

  # Service slots are never consulted for WakuRelay
  if proto != WakuRelayCodec:
    pm.serviceSlots.withValue(proto, serviceSlot):
      debug "Got peer from service slots", peerId=serviceSlot[].peerId, multi=serviceSlot[].addrs[0], protocol=proto
      return some(serviceSlot[])

  # TODO: proper heuristic here that compares peer scores and selects "best" one. For now the first peer for the given protocol is returned
  if peers.len == 0:
    debug "No peer found for protocol", protocol=proto
    return none(RemotePeerInfo)

  debug "Got peer from peerstore", peerId=peers[0].peerId, multi=peers[0].addrs[0], protocol=proto
  return some(peers[0])
|
|
|
|
|
|
2023-02-27 18:24:31 +01:00
|
|
|
|
proc prunePeerStoreLoop(pm: PeerManager) {.async.} =
  ## Prunes the peer store every `PrunePeerStoreInterval` for as long as the
  ## peer manager is started.
  debug "Starting prune peerstore loop"
  while pm.started:
    # Prune first, then wait for the next round
    pm.prunePeerStore()
    await sleepAsync(PrunePeerStoreInterval)
|
|
|
|
|
|
|
|
|
|
proc relayConnectivityLoop*(pm: PeerManager) {.async.} =
  ## Tries to connect to more relay peers every `ConnectivityLoopInterval`
  ## for as long as the peer manager is started.
  debug "Starting relay connectivity loop"
  while pm.started:
    # One connectivity round, then wait for the next one
    await pm.connectToRelayPeers()
    await sleepAsync(ConnectivityLoopInterval)
|
|
|
|
|
|
2023-04-12 13:05:34 +02:00
|
|
|
|
proc updateMetrics(pm: PeerManager) {.async.} =
  ## Refreshes, every `UpdateMetricsInterval`, the gauges with the number of
  ## connections and streams per direction and waku protocol.
  heartbeat "Scheduling updateMetrics run", UpdateMetricsInterval:
    for dir in [Direction.In, Direction.Out]:
      for proto in pm.peerStore.getWakuProtos():
        let
          protoDirConns = pm.getNumConnections(dir, proto)
          protoDirStreams = pm.getNumStreams(dir, proto)
          labels = [$dir, proto]
        waku_connected_peers.set(protoDirConns.float64, labelValues = labels)
        waku_streams_peers.set(protoDirStreams.float64, labelValues = labels)
|
|
|
|
|
|
2023-01-26 10:20:20 +01:00
|
|
|
|
proc start*(pm: PeerManager) =
  ## Marks the manager as started and spawns its background tasks: metrics
  ## updates, the relay connectivity loop and the peer store pruning loop.
  pm.started = true

  # The tasks run detached; the two loops check `pm.started` on every round
  asyncSpawn pm.updateMetrics()
  asyncSpawn pm.relayConnectivityLoop()
  asyncSpawn pm.prunePeerStoreLoop()
|
2023-01-26 10:20:20 +01:00
|
|
|
|
|
|
|
|
|
proc stop*(pm: PeerManager) =
  ## Clears the started flag, which the connectivity and pruning loops check
  ## at the beginning of each round.
  pm.started = false
|