fix gossipsub memory leak on disconnected peer (#371)

When messages can't be sent to a peer, we try to establish a send
connection - this causes messages to stack up, as more and more unsent
messages are blocked on the dial lock.

* remove dial lock
* run reconnection loop in background task
This commit is contained in:
Jacek Sieka 2020-09-22 09:05:53 +02:00 committed by GitHub
parent 49a12e619d
commit 471e5906f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 158 additions and 157 deletions

View File

@ -49,7 +49,7 @@ const
GossipSubHeartbeatInterval* = 1.seconds GossipSubHeartbeatInterval* = 1.seconds
# fanout ttl # fanout ttl
const const
GossipSubFanoutTTL* = 1.minutes GossipSubFanoutTTL* = 1.minutes
const const
@ -189,7 +189,7 @@ proc init*(_: type[GossipSubParams]): GossipSubParams =
) )
proc validateParameters*(parameters: GossipSubParams): Result[void, cstring] = proc validateParameters*(parameters: GossipSubParams): Result[void, cstring] =
if (parameters.dOut >= GossipSubDlo) or if (parameters.dOut >= GossipSubDlo) or
(parameters.dOut > (GossipSubD div 2)): (parameters.dOut > (GossipSubD div 2)):
err("gossipsub: dOut parameter error, Number of outbound connections to keep in the mesh. Must be less than D_lo and at most D/2") err("gossipsub: dOut parameter error, Number of outbound connections to keep in the mesh. Must be less than D_lo and at most D/2")
elif parameters.gossipThreshold >= 0: elif parameters.gossipThreshold >= 0:
@ -359,6 +359,21 @@ proc replenishFanout(g: GossipSub, topic: string) =
trace "fanout replenished with peers", peers = g.fanout.peers(topic) trace "fanout replenished with peers", peers = g.fanout.peers(topic)
method onPubSubPeerEvent*(p: GossipSub, peer: PubsubPeer, event: PubSubPeerEvent) {.gcsafe.} =
  ## Handle a connectivity event for `peer`, then forward the same event to
  ## the FloodSub implementation of this method.
  case event.kind
  of PubSubPeerEventKind.Disconnected:
    # The send connection was lost: drop the peer from every mesh and fanout
    # set. If the connection gets reestablished the peer will be readded to
    # the mesh, and if it doesn't, we hope the peer is going away.
    for _, meshPeers in p.mesh.mpairs():
      meshPeers.excl(peer)
    for _, fanoutPeers in p.fanout.mpairs():
      fanoutPeers.excl(peer)
  of PubSubPeerEventKind.Connected:
    # Nothing to do here when the connection comes up
    discard

  procCall FloodSub(p).onPubSubPeerEvent(peer, event)
proc rebalanceMesh(g: GossipSub, topic: string) {.async.} = proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
logScope: logScope:
topic topic
@ -379,7 +394,7 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
grafts = toSeq( grafts = toSeq(
g.gossipsub.getOrDefault(topic, initHashSet[PubSubPeer]()) - g.gossipsub.getOrDefault(topic, initHashSet[PubSubPeer]()) -
g.mesh.getOrDefault(topic, initHashSet[PubSubPeer]()) g.mesh.getOrDefault(topic, initHashSet[PubSubPeer]())
) ).filterIt(it.connected)
grafts.keepIf do (x: PubSubPeer) -> bool: grafts.keepIf do (x: PubSubPeer) -> bool:
# avoid negative score peers # avoid negative score peers
@ -404,7 +419,7 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
g.grafted(peer, topic) g.grafted(peer, topic)
g.fanout.removePeer(topic, peer) g.fanout.removePeer(topic, peer)
grafting &= peer grafting &= peer
elif npeers < g.parameters.dOut: elif npeers < g.parameters.dOut:
trace "replenishing mesh outbound quota", peers = g.mesh.peers(topic) trace "replenishing mesh outbound quota", peers = g.mesh.peers(topic)
# replenish the mesh if we're below Dlo # replenish the mesh if we're below Dlo
@ -439,7 +454,7 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
g.grafted(peer, topic) g.grafted(peer, topic)
g.fanout.removePeer(topic, peer) g.fanout.removePeer(topic, peer)
grafting &= peer grafting &= peer
if g.mesh.peers(topic) > GossipSubDhi: if g.mesh.peers(topic) > GossipSubDhi:
# prune peers if we've gone over Dhi # prune peers if we've gone over Dhi
@ -465,7 +480,7 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
outbound &= peer outbound &= peer
else: else:
inbound &= peer inbound &= peer
let pruneLen = inbound.len - GossipSubD let pruneLen = inbound.len - GossipSubD
if pruneLen > 0: if pruneLen > 0:
# Ok we got some peers to prune, # Ok we got some peers to prune,
@ -506,17 +521,17 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
x.peerId notin g.parameters.directPeers and x.peerId notin g.parameters.directPeers and
# and avoid peers we are backing off # and avoid peers we are backing off
x.peerId notin g.backingOff x.peerId notin g.backingOff
# by spec, grab only 2 # by spec, grab only 2
if avail.len > 2: if avail.len > 2:
avail.setLen(2) avail.setLen(2)
for peer in avail: for peer in avail:
if g.mesh.addPeer(topic, peer): if g.mesh.addPeer(topic, peer):
g.grafted(peer, topic) g.grafted(peer, topic)
grafting &= peer grafting &= peer
trace "opportunistic grafting", peer = $peer trace "opportunistic grafting", peer = $peer
when defined(libp2p_expensive_metrics): when defined(libp2p_expensive_metrics):
libp2p_gossipsub_peers_per_topic_gossipsub libp2p_gossipsub_peers_per_topic_gossipsub
.set(g.gossipsub.peers(topic).int64, labelValues = [topic]) .set(g.gossipsub.peers(topic).int64, labelValues = [topic])
@ -537,7 +552,7 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
let prune = RPCMsg(control: some(ControlMessage( let prune = RPCMsg(control: some(ControlMessage(
prune: @[ControlPrune( prune: @[ControlPrune(
topicID: topic, topicID: topic,
peers: g.peerExchangeList(topic), peers: g.peerExchangeList(topic),
backoff: g.parameters.pruneBackoff.seconds.uint64)]))) backoff: g.parameters.pruneBackoff.seconds.uint64)])))
g.broadcast(prunes, prune) g.broadcast(prunes, prune)
@ -623,7 +638,7 @@ proc colocationFactor(g: GossipSub, peer: PubSubPeer): float64 =
proc updateScores(g: GossipSub) = # avoid async proc updateScores(g: GossipSub) = # avoid async
trace "updating scores", peers = g.peers.len trace "updating scores", peers = g.peers.len
let now = Moment.now() let now = Moment.now()
var evicting: seq[PubSubPeer] var evicting: seq[PubSubPeer]
@ -645,13 +660,13 @@ proc updateScores(g: GossipSub) = # avoid async
# Scoring # Scoring
var topicScore = 0'f64 var topicScore = 0'f64
if info.inMesh: if info.inMesh:
inc is_grafted inc is_grafted
info.meshTime = now - info.graftTime info.meshTime = now - info.graftTime
if info.meshTime > topicParams.meshMessageDeliveriesActivation: if info.meshTime > topicParams.meshMessageDeliveriesActivation:
info.meshMessageDeliveriesActive = true info.meshMessageDeliveriesActive = true
var p1 = info.meshTime / topicParams.timeInMeshQuantum var p1 = info.meshTime / topicParams.timeInMeshQuantum
if p1 > topicParams.timeInMeshCap: if p1 > topicParams.timeInMeshCap:
p1 = topicParams.timeInMeshCap p1 = topicParams.timeInMeshCap
@ -700,7 +715,7 @@ proc updateScores(g: GossipSub) = # avoid async
# Wrap up # Wrap up
# commit our changes, mgetOrPut does NOT work as wanted with value types (lent?) # commit our changes, mgetOrPut does NOT work as wanted with value types (lent?)
stats.topicInfos[topic] = info stats.topicInfos[topic] = info
peer.score += peer.appScore * g.parameters.appSpecificWeight peer.score += peer.appScore * g.parameters.appSpecificWeight
peer.score += peer.behaviourPenalty * peer.behaviourPenalty * g.parameters.behaviourPenaltyWeight peer.score += peer.behaviourPenalty * peer.behaviourPenalty * g.parameters.behaviourPenaltyWeight
@ -713,10 +728,10 @@ proc updateScores(g: GossipSub) = # avoid async
peer.behaviourPenalty = 0 peer.behaviourPenalty = 0
debug "updated peer's score", peer, score = peer.score, n_topics, is_grafted debug "updated peer's score", peer, score = peer.score, n_topics, is_grafted
for peer in evicting: for peer in evicting:
g.peerStats.del(peer) g.peerStats.del(peer)
trace "updated scores", peers = g.peers.len trace "updated scores", peers = g.peers.len
proc heartbeat(g: GossipSub) {.async.} = proc heartbeat(g: GossipSub) {.async.} =
@ -755,7 +770,7 @@ proc heartbeat(g: GossipSub) {.async.} =
prunes &= peer prunes &= peer
let prune = RPCMsg(control: some(ControlMessage( let prune = RPCMsg(control: some(ControlMessage(
prune: @[ControlPrune( prune: @[ControlPrune(
topicID: t, topicID: t,
peers: g.peerExchangeList(t), peers: g.peerExchangeList(t),
backoff: g.parameters.pruneBackoff.seconds.uint64)]))) backoff: g.parameters.pruneBackoff.seconds.uint64)])))
g.broadcast(prunes, prune) g.broadcast(prunes, prune)
@ -825,7 +840,7 @@ method unsubscribePeer*(g: GossipSub, peer: PeerID) =
when defined(libp2p_expensive_metrics): when defined(libp2p_expensive_metrics):
libp2p_gossipsub_peers_per_topic_fanout libp2p_gossipsub_peers_per_topic_fanout
.set(g.fanout.peers(t).int64, labelValues = [t]) .set(g.fanout.peers(t).int64, labelValues = [t])
# don't retain bad score peers # don't retain bad score peers
if pubSubPeer.score < 0.0: if pubSubPeer.score < 0.0:
g.peerStats.del(pubSubPeer) g.peerStats.del(pubSubPeer)
@ -847,7 +862,7 @@ method subscribeTopic*(g: GossipSub,
logScope: logScope:
peer peer
topic topic
g.onNewPeer(peer) g.onNewPeer(peer)
if subscribe: if subscribe:
@ -890,7 +905,7 @@ proc handleGraft(g: GossipSub,
# It is an error to GRAFT on an explicit peer # It is an error to GRAFT on an explicit peer
if peer.peerId in g.parameters.directPeers: if peer.peerId in g.parameters.directPeers:
trace "attempt to graft an explicit peer", peer=peer.id, trace "attempt to graft an explicit peer", peer=peer.id,
topicID=graft.topicID topicID=graft.topicID
# and such an attempt should be logged and rejected with a PRUNE # and such an attempt should be logged and rejected with a PRUNE
result.add(ControlPrune( result.add(ControlPrune(
@ -900,7 +915,7 @@ proc handleGraft(g: GossipSub,
continue continue
if peer.peerId in g.backingOff: if peer.peerId in g.backingOff:
trace "attempt to graft an backingOff peer", peer=peer.id, trace "attempt to graft an backingOff peer", peer=peer.id,
topicID=graft.topicID, topicID=graft.topicID,
expire=g.backingOff[peer.peerId] expire=g.backingOff[peer.peerId]
# and such an attempt should be logged and rejected with a PRUNE # and such an attempt should be logged and rejected with a PRUNE
@ -909,10 +924,10 @@ proc handleGraft(g: GossipSub,
peers: @[], # omitting heavy computation here as the remote did something illegal peers: @[], # omitting heavy computation here as the remote did something illegal
backoff: g.parameters.pruneBackoff.seconds.uint64)) backoff: g.parameters.pruneBackoff.seconds.uint64))
continue continue
if peer notin g.peerStats: if peer notin g.peerStats:
g.peerStats[peer] = PeerStats() g.peerStats[peer] = PeerStats()
# If they send us a graft before they send us a subscribe, what should # If they send us a graft before they send us a subscribe, what should
# we do? For now, we add them to mesh but don't add them to gossipsub. # we do? For now, we add them to mesh but don't add them to gossipsub.
if topic in g.topics: if topic in g.topics:
@ -927,7 +942,7 @@ proc handleGraft(g: GossipSub,
trace "peer already in mesh" trace "peer already in mesh"
else: else:
result.add(ControlPrune( result.add(ControlPrune(
topicID: topic, topicID: topic,
peers: g.peerExchangeList(topic), peers: g.peerExchangeList(topic),
backoff: g.parameters.pruneBackoff.seconds.uint64)) backoff: g.parameters.pruneBackoff.seconds.uint64))
else: else:
@ -950,10 +965,10 @@ proc handlePrune(g: GossipSub, peer: PubSubPeer, prunes: seq[ControlPrune]) =
let current = g.backingOff.getOrDefault(peer.peerId) let current = g.backingOff.getOrDefault(peer.peerId)
if backoff > current: if backoff > current:
g.backingOff[peer.peerId] = backoff g.backingOff[peer.peerId] = backoff
g.pruned(peer, prune.topicID) g.pruned(peer, prune.topicID)
g.mesh.removePeer(prune.topicID, peer) g.mesh.removePeer(prune.topicID, peer)
when defined(libp2p_expensive_metrics): when defined(libp2p_expensive_metrics):
libp2p_gossipsub_peers_per_topic_mesh libp2p_gossipsub_peers_per_topic_mesh
.set(g.mesh.peers(prune.topicID).int64, labelValues = [prune.topicID]) .set(g.mesh.peers(prune.topicID).int64, labelValues = [prune.topicID])
@ -1026,7 +1041,7 @@ method rpcHandler*(g: GossipSub,
# commit back to the table # commit back to the table
g.peerStats[peer].topicInfos[t] = stats g.peerStats[peer].topicInfos[t] = stats
continue continue
g.mcache.put(msgId, msg) g.mcache.put(msgId, msg)
@ -1091,11 +1106,11 @@ method subscribe*(g: GossipSub,
topic: string, topic: string,
handler: TopicHandler) {.async.} = handler: TopicHandler) {.async.} =
await procCall PubSub(g).subscribe(topic, handler) await procCall PubSub(g).subscribe(topic, handler)
# if we have a fanout on this topic break it # if we have a fanout on this topic break it
if topic in g.fanout: if topic in g.fanout:
g.fanout.del(topic) g.fanout.del(topic)
await g.rebalanceMesh(topic) await g.rebalanceMesh(topic)
method unsubscribe*(g: GossipSub, method unsubscribe*(g: GossipSub,
@ -1113,7 +1128,7 @@ method unsubscribe*(g: GossipSub,
g.pruned(peer, topic) g.pruned(peer, topic)
let prune = RPCMsg(control: some(ControlMessage( let prune = RPCMsg(control: some(ControlMessage(
prune: @[ControlPrune( prune: @[ControlPrune(
topicID: topic, topicID: topic,
peers: g.peerExchangeList(topic), peers: g.peerExchangeList(topic),
backoff: g.parameters.pruneBackoff.seconds.uint64)]))) backoff: g.parameters.pruneBackoff.seconds.uint64)])))
g.broadcast(toSeq(peers), prune) g.broadcast(toSeq(peers), prune)
@ -1128,7 +1143,7 @@ method unsubscribeAll*(g: GossipSub, topic: string) {.async.} =
g.pruned(peer, topic) g.pruned(peer, topic)
let prune = RPCMsg(control: some(ControlMessage( let prune = RPCMsg(control: some(ControlMessage(
prune: @[ControlPrune( prune: @[ControlPrune(
topicID: topic, topicID: topic,
peers: g.peerExchangeList(topic), peers: g.peerExchangeList(topic),
backoff: g.parameters.pruneBackoff.seconds.uint64)]))) backoff: g.parameters.pruneBackoff.seconds.uint64)])))
g.broadcast(toSeq(peers), prune) g.broadcast(toSeq(peers), prune)
@ -1145,11 +1160,11 @@ method publish*(g: GossipSub,
if topic.len <= 0: # data could be 0/empty if topic.len <= 0: # data could be 0/empty
debug "Empty topic, skipping publish" debug "Empty topic, skipping publish"
return 0 return 0
var peers: HashSet[PubSubPeer] var peers: HashSet[PubSubPeer]
if g.parameters.floodPublish: if g.parameters.floodPublish:
# With flood publishing enabled, the mesh is used when propagating messages from other peers, # With flood publishing enabled, the mesh is used when propagating messages from other peers,
# but a peer's own messages will always be published to all known peers in the topic. # but a peer's own messages will always be published to all known peers in the topic.
for peer in g.gossipsub.getOrDefault(topic): for peer in g.gossipsub.getOrDefault(topic):
if peer.score >= g.parameters.publishThreshold: if peer.score >= g.parameters.publishThreshold:
@ -1248,7 +1263,7 @@ method initPubSub*(g: GossipSub) =
if not g.parameters.explicit: if not g.parameters.explicit:
g.parameters = GossipSubParams.init() g.parameters = GossipSubParams.init()
g.parameters.validateParameters().tryGet() g.parameters.validateParameters().tryGet()
randomize() randomize()

View File

@ -141,6 +141,22 @@ proc replenishFanout(g: GossipSub, topic: string) =
trace "fanout replenished with peers", peers = g.fanout.peers(topic) trace "fanout replenished with peers", peers = g.fanout.peers(topic)
method onPubSubPeerEvent*(p: GossipSub, peer: PubsubPeer, event: PubSubPeerEvent) {.gcsafe.} =
  ## Handle a connectivity event for `peer`, then forward the same event to
  ## the FloodSub implementation of this method.
  case event.kind
  of PubSubPeerEventKind.Disconnected:
    # The send connection was lost: drop the peer from every mesh and fanout
    # set. If the connection gets reestablished the peer will be readded to
    # the mesh, and if it doesn't, we hope the peer is going away.
    for _, meshPeers in p.mesh.mpairs():
      meshPeers.excl(peer)
    for _, fanoutPeers in p.fanout.mpairs():
      fanoutPeers.excl(peer)
  of PubSubPeerEventKind.Connected:
    # Nothing to do here when the connection comes up
    discard

  procCall FloodSub(p).onPubSubPeerEvent(peer, event)
proc rebalanceMesh(g: GossipSub, topic: string) {.async.} = proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
logScope: logScope:
topic topic
@ -160,7 +176,7 @@ proc rebalanceMesh(g: GossipSub, topic: string) {.async.} =
grafts = toSeq( grafts = toSeq(
g.gossipsub.getOrDefault(topic, initHashSet[PubSubPeer]()) - g.gossipsub.getOrDefault(topic, initHashSet[PubSubPeer]()) -
g.mesh.getOrDefault(topic, initHashSet[PubSubPeer]()) g.mesh.getOrDefault(topic, initHashSet[PubSubPeer]())
) ).filterIt(it.connected)
shuffle(grafts) shuffle(grafts)

View File

@ -54,24 +54,25 @@ type
handler*: seq[TopicHandler] handler*: seq[TopicHandler]
PubSub* = ref object of LPProtocol PubSub* = ref object of LPProtocol
switch*: Switch # the switch used to dial/connect to peers switch*: Switch # the switch used to dial/connect to peers
peerInfo*: PeerInfo # this peer's info peerInfo*: PeerInfo # this peer's info
topics*: Table[string, Topic] # local topics topics*: Table[string, Topic] # local topics
peers*: Table[PeerID, PubSubPeer] # peerid to peer map peers*: Table[PeerID, PubSubPeer] ##\
triggerSelf*: bool # trigger own local handler on publish ## Peers that we are interested to gossip with (but not necessarily
verifySignature*: bool # enable signature verification ## yet connected to)
sign*: bool # enable message signing triggerSelf*: bool # trigger own local handler on publish
verifySignature*: bool # enable signature verification
sign*: bool # enable message signing
validators*: Table[string, HashSet[ValidatorHandler]] validators*: Table[string, HashSet[ValidatorHandler]]
observers: ref seq[PubSubObserver] # ref as in smart_ptr observers: ref seq[PubSubObserver] # ref as in smart_ptr
msgIdProvider*: MsgIdProvider # Turn message into message id (not nil) msgIdProvider*: MsgIdProvider # Turn message into message id (not nil)
msgSeqno*: uint64 msgSeqno*: uint64
lifetimeFut*: Future[void] # pubsub liftime future
method unsubscribePeer*(p: PubSub, peerId: PeerID) {.base.} = method unsubscribePeer*(p: PubSub, peerId: PeerID) {.base.} =
## handle peer disconnects ## handle peer disconnects
## ##
trace "unsubscribing pubsub peer", peer = $peerId trace "unsubscribing pubsub peer", peerId
p.peers.del(peerId) p.peers.del(peerId)
libp2p_pubsub_peers.set(p.peers.len.int64) libp2p_pubsub_peers.set(p.peers.len.int64)
@ -80,7 +81,7 @@ proc send*(p: PubSub, peer: PubSubPeer, msg: RPCMsg) =
## Attempt to send `msg` to remote peer ## Attempt to send `msg` to remote peer
## ##
trace "sending pubsub message to peer", peer = $peer, msg = shortLog(msg) trace "sending pubsub message to peer", peer, msg = shortLog(msg)
peer.send(msg) peer.send(msg)
proc broadcast*( proc broadcast*(
@ -119,6 +120,14 @@ method rpcHandler*(p: PubSub,
method onNewPeer(p: PubSub, peer: PubSubPeer) {.base.} = discard method onNewPeer(p: PubSub, peer: PubSubPeer) {.base.} = discard
method onPubSubPeerEvent*(p: PubSub, peer: PubsubPeer, event: PubsubPeerEvent) {.base, gcsafe.} =
  ## Base handler for peer connectivity events. The event is raised for the
  ## peer's send connection in particular.
  case event.kind
  of PubSubPeerEventKind.Disconnected:
    discard
  of PubSubPeerEventKind.Connected:
    # Send the keys of the local topic table to the peer through `sendSubs`
    # (the trailing `true` is presumably the "subscribe" flag - confirm
    # against `sendSubs`).
    let localTopics = toSeq(p.topics.keys)
    p.sendSubs(peer, localTopics, true)
proc getOrCreatePeer*( proc getOrCreatePeer*(
p: PubSub, p: PubSub,
peer: PeerID, peer: PeerID,
@ -126,13 +135,15 @@ proc getOrCreatePeer*(
if peer in p.peers: if peer in p.peers:
return p.peers[peer] return p.peers[peer]
proc getConn(): Future[(Connection, RPCMsg)] {.async.} = proc getConn(): Future[Connection] =
let conn = await p.switch.dial(peer, protos) p.switch.dial(peer, protos)
return (conn, RPCMsg.withSubs(toSeq(p.topics.keys), true))
proc onEvent(peer: PubsubPeer, event: PubsubPeerEvent) {.gcsafe.} =
p.onPubSubPeerEvent(peer, event)
# create new pubsub peer # create new pubsub peer
let pubSubPeer = newPubSubPeer(peer, getConn, protos[0]) let pubSubPeer = newPubSubPeer(peer, getConn, onEvent, protos[0])
trace "created new pubsub peer", peerId = $peer trace "created new pubsub peer", peer
p.peers[peer] = pubSubPeer p.peers[peer] = pubSubPeer
pubSubPeer.observers = p.observers pubSubPeer.observers = p.observers

View File

@ -31,17 +31,25 @@ type
onRecv*: proc(peer: PubSubPeer; msgs: var RPCMsg) {.gcsafe, raises: [Defect].} onRecv*: proc(peer: PubSubPeer; msgs: var RPCMsg) {.gcsafe, raises: [Defect].}
onSend*: proc(peer: PubSubPeer; msgs: var RPCMsg) {.gcsafe, raises: [Defect].} onSend*: proc(peer: PubSubPeer; msgs: var RPCMsg) {.gcsafe, raises: [Defect].}
GetConn* = proc(): Future[(Connection, RPCMsg)] {.gcsafe.} PubSubPeerEventKind* {.pure.} = enum
Connected
Disconnected
PubsubPeerEvent* = object
kind*: PubSubPeerEventKind
GetConn* = proc(): Future[Connection] {.gcsafe.}
OnEvent* = proc(peer: PubSubPeer, event: PubsubPeerEvent) {.gcsafe.}
PubSubPeer* = ref object of RootObj PubSubPeer* = ref object of RootObj
getConn*: GetConn # callback to establish a new send connection getConn*: GetConn # callback to establish a new send connection
onEvent*: OnEvent # Connectivity updates for peer
codec*: string # the protocol that this peer joined from codec*: string # the protocol that this peer joined from
sendConn: Connection # cached send connection sendConn*: Connection # cached send connection
connections*: seq[Connection] # connections to this peer connections*: seq[Connection] # connections to this peer
peerId*: PeerID peerId*: PeerID
handler*: RPCHandler handler*: RPCHandler
observers*: ref seq[PubSubObserver] # ref as in smart_ptr observers*: ref seq[PubSubObserver] # ref as in smart_ptr
dialLock: AsyncLock
score*: float64 score*: float64
iWantBudget*: int iWantBudget*: int
@ -54,7 +62,7 @@ type
chronicles.formatIt(PubSubPeer): $it.peerId chronicles.formatIt(PubSubPeer): $it.peerId
func hash*(p: PubSubPeer): Hash = func hash*(p: PubSubPeer): Hash =
# int is either 32/64, so intptr basically, pubsubpeer is a ref # int is either 32/64, so intptr basically, pubsubpeer is a ref
cast[pointer](p).hash cast[pointer](p).hash
@ -117,10 +125,6 @@ proc handle*(p: PubSubPeer, conn: Connection) {.async.} =
await p.handler(p, rmsg.get()) await p.handler(p, rmsg.get())
finally: finally:
await conn.close() await conn.close()
if p.sendConn == conn:
p.sendConn = nil
except CancelledError: except CancelledError:
# This is a top-level procedure which will work as a separate task, so it # This is a top-level procedure which will work as a separate task, so it
# does not need to propagate CancelledError. # does not need to propagate CancelledError.
@ -132,88 +136,54 @@ proc handle*(p: PubSubPeer, conn: Connection) {.async.} =
debug "exiting pubsub read loop", debug "exiting pubsub read loop",
conn, peer = p, closed = conn.closed conn, peer = p, closed = conn.closed
proc getSendConn(p: PubSubPeer): Future[Connection] {.async.} = proc connectOnce(p: PubSubPeer): Future[void] {.async.} =
## get a cached send connection or create a new one - will return nil if
## getting a new connection fails
##
block: # check if there's an existing connection that can be reused
let current = p.sendConn
if not current.isNil:
if not (current.closed() or current.atEof):
# The existing send connection looks like it might work - reuse it
trace "Reusing existing connection", current
return current
# Send connection is set but broken - get rid of it
p.sendConn = nil
# Careful, p.sendConn might change after here!
await current.close() # TODO this might be unnecessary
try: try:
# Testing has demonstrated that when we perform concurrent meshsub dials let newConn = await p.getConn()
# and later close one of them, other implementations such as rust-libp2p
# become deaf to our messages (potentially due to the clean-up associated
# with closing connections). To prevent this, we use a lock that ensures
# that only a single dial will be performed for each peer and send the
# subscription table every time we reconnect.
#
# Nevertheless, this approach is still quite problematic because the gossip
# sends and their respective dials may be started from the mplex read loop.
# This may cause the read loop to get stuck which ultimately results in a
# deadlock when the other side tries to send us any other message that must
# be routed through mplex (it will be stuck on `pushTo`). Such messages
# naturally arise in the process of dialing itself.
#
# See https://github.com/status-im/nim-libp2p/issues/337
#
# One possible long-term solution is to avoid "blocking" the mplex read
# loop by making the gossip send non-blocking through the use of a queue.
await p.dialLock.acquire()
# Another concurrent dial may have populated p.sendConn
if p.sendConn != nil:
let current = p.sendConn
if not (current.closed() or current.atEof):
# The existing send connection looks like it might work - reuse it
debug "Reusing existing connection", current
return current
else:
p.sendConn = nil
# Grab a new send connection
let (newConn, handshake) = await p.getConn() # ...and here
if newConn.isNil: if newConn.isNil:
debug "Failed to get a new send connection" raise (ref CatchableError)(msg: "Cannot establish send connection")
return nil
trace "Sending handshake", newConn, handshake = shortLog(handshake) # When the send channel goes up, subscriptions need to be sent to the
await newConn.writeLp(encodeRpcMsg(handshake)) # remote peer - if we had multiple channels up and one goes down, all
# stop working so we make an effort to only keep a single channel alive
trace "Caching new send connection", newConn trace "Get new send connection", p, newConn
p.sendConn = newConn p.sendConn = newConn
# Start a read loop on the new connection.
# All the errors are handled inside `handle()` procedure.
asyncSpawn p.handle(newConn)
return newConn
finally:
if p.dialLock.locked:
p.dialLock.release()
proc connectImpl*(p: PubSubPeer) {.async.} = if p.onEvent != nil:
p.onEvent(p, PubsubPeerEvent(kind: PubSubPeerEventKind.Connected))
await handle(p, newConn)
finally:
if p.sendConn != nil:
trace "Removing send connection", p, conn = p.sendConn
await p.sendConn.close()
p.sendConn = nil
if p.onEvent != nil:
p.onEvent(p, PubsubPeerEvent(kind: PubSubPeerEventKind.Disconnected))
proc connectImpl(p: PubSubPeer) {.async.} =
try: try:
discard await getSendConn(p) # Keep trying to establish a connection while it's possible to do so - the
# send connection might get disconnected due to a timeout or an unrelated
# issue so we try to get a new one
while true:
await connectOnce(p)
except CatchableError as exc: except CatchableError as exc:
debug "Could not connect to pubsub peer", err = exc.msg debug "Could not establish send connection", msg = exc.msg
proc connect*(p: PubSubPeer) = proc connect*(p: PubSubPeer) =
asyncCheck(connectImpl(p)) asyncSpawn connectImpl(p)
proc sendImpl(p: PubSubPeer, msg: RPCMsg) {.async.} = proc sendImpl(p: PubSubPeer, msg: RPCMsg) {.async.} =
doAssert(not isNil(p), "pubsubpeer nil!") doAssert(not isNil(p), "pubsubpeer nil!")
let conn = p.sendConn
if conn == nil:
trace "No send connection, skipping message", p, msg
return
trace "sending msg to peer", peer = p, rpcMsg = shortLog(msg) trace "sending msg to peer", peer = p, rpcMsg = shortLog(msg)
# trigger send hooks # trigger send hooks
@ -225,13 +195,7 @@ proc sendImpl(p: PubSubPeer, msg: RPCMsg) {.async.} =
info "empty message, skipping" info "empty message, skipping"
return return
var conn: Connection
try: try:
conn = await p.getSendConn()
if conn == nil:
trace "Couldn't get send connection, dropping message", peer = p
return
trace "sending encoded msgs to peer", conn, encoded = shortLog(encoded) trace "sending encoded msgs to peer", conn, encoded = shortLog(encoded)
await conn.writeLp(encoded) await conn.writeLp(encoded)
trace "sent pubsub message to remote", conn trace "sent pubsub message to remote", conn
@ -244,31 +208,26 @@ proc sendImpl(p: PubSubPeer, msg: RPCMsg) {.async.} =
except CatchableError as exc: except CatchableError as exc:
# Because we detach the send call from the currently executing task using # Because we detach the send call from the currently executing task using
# asyncCheck, no exceptions may leak out of it # asyncSpawn, no exceptions may leak out of it
trace "Unable to send to remote", conn, exc = exc.msg trace "Unable to send to remote", conn, exc = exc.msg
# Next time sendConn is used, it will have its close flag set and thus # Next time sendConn is used, it will have its close flag set and thus
# will be recycled # will be recycled
if not isNil(conn):
await conn.close() # This will clean up the send connection
if exc is CancelledError: # TODO not handled await conn.close() # This will clean up the send connection
debug "Send cancelled", peer = p
# We'll ask for a new send connection whenever possible
if p.sendConn == conn:
p.sendConn = nil
proc send*(p: PubSubPeer, msg: RPCMsg) = proc send*(p: PubSubPeer, msg: RPCMsg) =
asyncCheck sendImpl(p, msg) asyncSpawn sendImpl(p, msg)
proc `$`*(p: PubSubPeer): string = proc `$`*(p: PubSubPeer): string =
$p.peerId $p.peerId
proc newPubSubPeer*(peerId: PeerID, proc newPubSubPeer*(peerId: PeerID,
getConn: GetConn, getConn: GetConn,
onEvent: OnEvent,
codec: string): PubSubPeer = codec: string): PubSubPeer =
new result PubSubPeer(
result.getConn = getConn getConn: getConn,
result.codec = codec onEvent: onEvent,
result.peerId = peerId codec: codec,
result.dialLock = newAsyncLock() peerId: peerId,
)

View File

@ -17,11 +17,10 @@ type
proc noop(data: seq[byte]) {.async, gcsafe.} = discard proc noop(data: seq[byte]) {.async, gcsafe.} = discard
proc getPubSubPeer(p: TestGossipSub, peerId: PeerID): auto = proc getPubSubPeer(p: TestGossipSub, peerId: PeerID): auto =
proc getConn(): Future[(Connection, RPCMsg)] {.async.} = proc getConn(): Future[Connection] =
let conn = await p.switch.dial(peerId, GossipSubCodec) p.switch.dial(peerId, GossipSubCodec)
return (conn, RPCMsg.withSubs(toSeq(p.topics.keys), true))
newPubSubPeer(peerId, getConn, GossipSubCodec) newPubSubPeer(peerId, getConn, nil, GossipSubCodec)
proc randomPeerInfo(): PeerInfo = proc randomPeerInfo(): PeerInfo =
PeerInfo.init(PrivateKey.random(ECDSA, rng[]).get()) PeerInfo.init(PrivateKey.random(ECDSA, rng[]).get())
@ -56,6 +55,7 @@ suite "GossipSub internal":
let peerInfo = randomPeerInfo() let peerInfo = randomPeerInfo()
conn.peerInfo = peerInfo conn.peerInfo = peerInfo
let peer = gossipSub.getPubSubPeer(peerInfo.peerId) let peer = gossipSub.getPubSubPeer(peerInfo.peerId)
peer.sendConn = conn
gossipSub.onNewPeer(peer) gossipSub.onNewPeer(peer)
gossipSub.peers[peerInfo.peerId] = peer gossipSub.peers[peerInfo.peerId] = peer
gossipSub.gossipsub[topic].incl(peer) gossipSub.gossipsub[topic].incl(peer)

View File

@ -17,11 +17,10 @@ type
proc noop(data: seq[byte]) {.async, gcsafe.} = discard proc noop(data: seq[byte]) {.async, gcsafe.} = discard
proc getPubSubPeer(p: TestGossipSub, peerId: PeerID): auto = proc getPubSubPeer(p: TestGossipSub, peerId: PeerID): auto =
proc getConn(): Future[(Connection, RPCMsg)] {.async.} = proc getConn(): Future[Connection] =
let conn = await p.switch.dial(peerId, GossipSubCodec) p.switch.dial(peerId, GossipSubCodec)
return (conn, RPCMsg.withSubs(toSeq(p.topics.keys), true))
newPubSubPeer(peerId, getConn, GossipSubCodec) newPubSubPeer(peerId, getConn, nil, GossipSubCodec)
proc randomPeerInfo(): PeerInfo = proc randomPeerInfo(): PeerInfo =
PeerInfo.init(PrivateKey.random(ECDSA, rng[]).get()) PeerInfo.init(PrivateKey.random(ECDSA, rng[]).get())
@ -47,6 +46,7 @@ suite "GossipSub internal":
let peerInfo = randomPeerInfo() let peerInfo = randomPeerInfo()
conn.peerInfo = peerInfo conn.peerInfo = peerInfo
let peer = gossipSub.getPubSubPeer(peerInfo.peerId) let peer = gossipSub.getPubSubPeer(peerInfo.peerId)
peer.sendConn = conn
gossipSub.peers[peerInfo.peerId] = peer gossipSub.peers[peerInfo.peerId] = peer
gossipSub.mesh[topic].incl(peer) gossipSub.mesh[topic].incl(peer)