Fix deadlock in peerpool

This is a bit complicated, so:

1) PeerPool was subscribing to `event.Feed`, which is a global event
emitter for ethereum.
2) The p2p.Server was publishing on `event.Feed`, and this triggered,
in the same goroutine, a publish on `event.Feed`.
3) PeerPool was listening to `event.Feed`, reacting to it, and, in the
same goroutine, triggering some code on p2p.Server that would publish
on `event.Feed`.

This meant that if the channel was unbuffered, it would deadlock, as
PeerPool would not be consuming while it was publishing (effectively the
same goroutine publishes and listens, through a lot of indirection and
unbuffered channels, p2p.Server->event.Feed).
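
To make that concrete, here is a minimal standalone sketch of the cycle
(made-up names, not the actual PeerPool/event.Feed code): a single
goroutine is the only consumer of a channel and, while handling an
event, ends up publishing back onto it.

package main

func main() {
	feed := make(chan string) // unbuffered: the worst case described above

	// The only consumer of feed.
	go func() {
		for ev := range feed {
			_ = ev
			// Handling the event ends up, through some indirection,
			// publishing back onto the same channel. Nobody else is
			// reading feed, so this send can never complete.
			feed <- "follow-up"
		}
	}()

	feed <- "initial" // delivered to the consumer

	// The consumer is now stuck on its own send; with every goroutine
	// blocked, the Go runtime reports "all goroutines are asleep - deadlock!".
	select {}
}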

The channel, though, is buffered with size 10, which means that most of
the time this is fine.

The issue is that peerpool is not the only producer on this channel.
So it's possible that, while it is processing an event, the buffer fills
up; it then hangs trying to publish, nobody is left listening on the
channel, and EVERYTHING hangs.
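
Again as a hedged standalone sketch (made-up names, a tiny buffer
instead of 10): other producers keep the buffer full, so once the
consumer blocks on its own publish, nothing drains the channel anymore.

package main

func main() {
	events := make(chan int, 2) // the real channel is buffered with size 10

	// Other producers have already filled the buffer.
	events <- 1
	events <- 2

	done := make(chan struct{})
	go func() {
		defer close(done)
		// The only consumer.
		for ev := range events {
			_ = ev
			// Handling the event publishes back onto the same channel.
			// Every slot freed by the receive above is promptly re-taken,
			// by the pending producer below or by an earlier self-publish,
			// so this send ends up blocking on a full buffer; since this
			// goroutine is the only consumer, it then never unblocks.
			events <- -1
		}
	}()

	events <- 3 // yet another producer, waiting for a free slot
	<-done      // never reached: producers and consumer are all wedged
}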

At least that's what I think; it needs to be tested, but it's
definitely an issue.

I kept the code changes to a minimum; this code is a bit hairy, but
it's fairly critical, so I don't want to make too many changes.
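
For context on the fix in the diff below: retryDiscv5 and stopDiscv5
become buffered channels of size 1 that are written to with a
non-blocking send (select with a default case), and the peer-event
handling moves into its own goroutine, so the main loop never blocks
while something is trying to publish. The signalling pattern on its
own, as a standalone sketch (not the actual code):

package main

import "fmt"

func main() {
	signal := make(chan struct{}, 1) // at most one pending wake-up

	// queue never blocks: if a wake-up is already pending,
	// the extra request is simply coalesced with it.
	queue := func() {
		select {
		case signal <- struct{}{}:
		default:
		}
	}

	queue()
	queue() // coalesced with the previous one
	queue()

	<-signal
	fmt.Println("woken up once; no caller ever blocked")
}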
Andrea Maria Piana 2021-01-29 13:44:25 +01:00
parent ee8f333dbf
commit 24a30d7a7c
1 changed file with 50 additions and 22 deletions

@@ -30,6 +30,7 @@ var (
 type PoolEvent string
 
 const (
+	immediately = 0 * time.Minute
 	// expirationPeriod is an amount of time while peer is considered as a connectable
 	expirationPeriod = 60 * time.Minute
 	// discoveryRestartTimeout defines how often loop will try to start discovery server
@@ -249,8 +250,29 @@ func (p *PeerPool) restartDiscovery(server *p2p.Server) error {
 // @TODO(adam): split it into peers and discovery management loops. This should
 // simplify the whole logic and allow to remove `timeout` field from `PeerPool`.
 func (p *PeerPool) handleServerPeers(server *p2p.Server, events <-chan *p2p.PeerEvent) {
-	var retryDiscv5 <-chan time.Time
-	var stopDiscv5 <-chan time.Time
+	retryDiscv5 := make(chan struct{}, 1)
+	stopDiscv5 := make(chan struct{}, 1)
+
+	queueRetry := func(d time.Duration) {
+		go func() {
+			time.Sleep(d)
+			select {
+			case retryDiscv5 <- struct{}{}:
+			default:
+			}
+		}()
+
+	}
+
+	queueStop := func() {
+		go func() {
+			select {
+			case stopDiscv5 <- struct{}{}:
+			default:
+			}
+		}()
+
+	}
 
 	for {
 		p.mu.RLock()
@@ -267,40 +289,48 @@ func (p *PeerPool) handleServerPeers(server *p2p.Server, events <-chan *p2p.PeerEvent) {
 			p.stopDiscovery(server)
 		case <-retryDiscv5:
 			if err := p.restartDiscovery(server); err != nil {
-				retryDiscv5 = time.After(discoveryRestartTimeout)
 				log.Error("starting discv5 failed", "error", err, "retry", discoveryRestartTimeout)
+				queueRetry(discoveryRestartTimeout)
 			}
 		case <-stopDiscv5:
 			p.handleStopTopics(server)
 		case req := <-p.updateTopic:
 			if p.updateTopicLimits(server, req) == nil {
 				if !p.discovery.Running() {
-					retryDiscv5 = time.After(0)
+					queueRetry(immediately)
 				}
 			}
 		case event := <-events:
-			switch event.Type {
-			case p2p.PeerEventTypeDrop:
-				log.Debug("confirm peer dropped", "ID", event.Peer)
-				if p.handleDroppedPeer(server, event.Peer) {
-					retryDiscv5 = time.After(0)
-				}
-			case p2p.PeerEventTypeAdd: // skip other events
-				log.Debug("confirm peer added", "ID", event.Peer)
-				p.handleAddedPeer(server, event.Peer)
-				stopDiscv5 = time.After(p.opts.TopicStopSearchDelay)
-			default:
-				continue
-			}
-			SendDiscoverySummary(server.PeersInfo())
+			// NOTE: handlePeerEventType needs to be called asynchronously
+			// as it publishes on the <-events channel, leading to a deadlock
+			// if events channel is full.
+			go p.handlePeerEventType(server, event, queueRetry, queueStop)
 		}
 	}
 }
 
-// handleAddedPeer notifies all topics about added peer.
-func (p *PeerPool) handleAddedPeer(server *p2p.Server, nodeID enode.ID) {
+func (p *PeerPool) handlePeerEventType(server *p2p.Server, event *p2p.PeerEvent, queueRetry func(time.Duration), queueStop func()) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
+
+	switch event.Type {
+	case p2p.PeerEventTypeDrop:
+		log.Debug("confirm peer dropped", "ID", event.Peer)
+		if p.handleDroppedPeer(server, event.Peer) {
+			queueRetry(immediately)
+		}
+	case p2p.PeerEventTypeAdd: // skip other events
+		log.Debug("confirm peer added", "ID", event.Peer)
+		p.handleAddedPeer(server, event.Peer)
+		queueStop()
+	default:
+		return
+	}
+	SendDiscoverySummary(server.PeersInfo())
+}
+
+// handleAddedPeer notifies all topics about added peer.
+func (p *PeerPool) handleAddedPeer(server *p2p.Server, nodeID enode.ID) {
 	for _, t := range p.topics {
 		t.ConfirmAdded(server, nodeID)
 		if p.opts.AllowStop && t.MaxReached() {
@@ -344,8 +374,6 @@ func (p *PeerPool) allTopicsStopped() (all bool) {
 // handleDroppedPeer notifies every topic about dropped peer and returns true if any peer have connections
 // below min limit
 func (p *PeerPool) handleDroppedPeer(server *p2p.Server, nodeID enode.ID) (any bool) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
 	for _, t := range p.topics {
 		confirmed := t.ConfirmDropped(server, nodeID)
 		if confirmed {