Handle deadlock in peerpool
There was another deadlock in the peer pool. Because we made the event handler asynchronous, a new deadlock appeared: the loop takes the global peerpool lock before processing events, but the handlers also take the global lock, effectively resulting in the same situation we had before, i.e. the loop is not running. THE LOOP MUST BE RUNNING AT ALL TIMES, OTHERWISE THE SERVER HANGS.
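For context, here is a minimal, self-contained sketch of the locking split the diff below introduces; the type and function names are illustrative only, not the actual status-go types. The idea: the event loop reads the timeout under a dedicated timeoutMu and never touches the pool-wide mutex, so it keeps draining events even while an asynchronous handler holds the global lock.

package main

import (
	"fmt"
	"sync"
	"time"
)

// pool is a stand-in for PeerPool: one pool-wide mutex for the heavy state,
// and a dedicated mutex that guards only the discovery timeout.
type pool struct {
	mu        sync.RWMutex // held by the asynchronous handlers
	timeoutMu sync.RWMutex // the only lock the event loop ever takes
	timeout   <-chan time.Time
}

// handleEvent stands in for the asynchronous event handler: it may hold the
// pool-wide lock for a long time (e.g. while calling back into the server).
func (p *pool) handleEvent(ev int) {
	p.mu.Lock()
	defer p.mu.Unlock()
	time.Sleep(10 * time.Millisecond) // simulate a slow, server-bound handler
	fmt.Println("handled event", ev)
}

// loop mirrors the shape of the event loop: it must never block on p.mu,
// otherwise the events channel stops being drained and the publisher hangs.
func (p *pool) loop(events <-chan int, quit <-chan struct{}) {
	for {
		p.timeoutMu.RLock()
		timeout := p.timeout
		p.timeoutMu.RUnlock()

		select {
		case <-quit:
			return
		case <-timeout:
			return
		case ev := <-events:
			go p.handleEvent(ev) // handlers run asynchronously
		}
	}
}

func main() {
	p := &pool{}
	p.timeoutMu.Lock()
	p.timeout = time.After(200 * time.Millisecond)
	p.timeoutMu.Unlock()

	events := make(chan int)
	quit := make(chan struct{})
	defer close(quit)
	go p.loop(events, quit)

	// The loop always consumes promptly, even while handlers hold p.mu.
	for i := 0; i < 5; i++ {
		events <- i
	}
	time.Sleep(300 * time.Millisecond)
}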
commit 4685b9eaa9
parent b4e5bf417b
@@ -38,7 +38,7 @@ func SubscribeServerEvents(ctx context.Context, node *node.Node) error {
 	// https://github.com/status-im/status-go/blob/e60f425b45d00d3880b42fdd77b460ec465a9f55/vendor/github.com/ethereum/go-ethereum/p2p/server.go#L746
 	// If there's back-pressure on the peer event feed
 	// https://github.com/status-im/status-go/blob/e60f425b45d00d3880b42fdd77b460ec465a9f55/vendor/github.com/ethereum/go-ethereum/p2p/server.go#L783
-	// The event channel above might become while updateNodeMetrics
+	// The event channel above might become full while updateNodeMetrics
 	// is called, which means is never consumed, the server blocks on publishing on
 	// it, and the two will deadlock (server waits for the channel above to be consumed,
 	// this code waits for the server to respond to peerCount, which is in the same
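The comments in this hunk describe a classic feed/back-pressure deadlock. A toy reproduction follows; the channel names and the peer-count request are made up for illustration and are not the go-ethereum API, assuming only that a single server goroutine both publishes events and answers requests.

package main

import (
	"fmt"
	"time"
)

func main() {
	events := make(chan int) // subscribed peer-event channel (unbuffered, no slack)
	peerCountReq := make(chan chan int)

	// "server": a single goroutine that publishes events and, in between,
	// answers peer-count requests -- the same loop the comments point at.
	go func() {
		for i := 0; ; i++ {
			events <- i             // blocks until the subscriber drains it
			reply := <-peerCountReq // only reached once the event was consumed
			reply <- i + 1
		}
	}()

	reply := make(chan int)

	// Round 1: drain the event first, then ask for the peer count. Works.
	<-events
	peerCountReq <- reply
	fmt.Println("peer count:", <-reply)

	// Round 2: ask for the peer count WITHOUT draining the pending event.
	// The server is stuck on `events <- i`, so it never reaches the request:
	// the server waits for the event channel to be consumed, we wait for the
	// server to answer -- the deadlock described above.
	select {
	case peerCountReq <- reply:
		fmt.Println("peer count:", <-reply)
	case <-time.After(time.Second):
		fmt.Println("deadlock: server blocked publishing, subscriber blocked on peerCount")
	}
}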
@@ -103,6 +103,7 @@ type PeerPool struct {
 	cache *Cache
 
 	mu sync.RWMutex
+	timeoutMu sync.RWMutex
 	topics []TopicPoolInterface
 	serverSubscription event.Subscription
 	events chan *p2p.PeerEvent
@@ -123,6 +124,8 @@ func NewPeerPool(discovery discovery.Discovery, config map[discv5.Topic]params.L
 }
 
 func (p *PeerPool) setDiscoveryTimeout() {
+	p.timeoutMu.Lock()
+	defer p.timeoutMu.Unlock()
 	if p.opts.AllowStop && p.opts.DiscServerTimeout > 0 {
 		p.timeout = time.After(p.opts.DiscServerTimeout)
 	}
@@ -217,9 +220,9 @@ func (p *PeerPool) stopDiscovery(server *p2p.Server) {
 		t.StopSearch(server)
 	}
 
-	p.mu.Lock()
+	p.timeoutMu.Lock()
 	p.timeout = nil
-	p.mu.Unlock()
+	p.timeoutMu.Unlock()
 
 	signal.SendDiscoveryStopped()
 }
@@ -275,9 +278,15 @@ func (p *PeerPool) handleServerPeers(server *p2p.Server, events <-chan *p2p.Peer
 	}
 
 	for {
-		p.mu.RLock()
+		// We use a separate lock for timeout, as this loop should
+		// always be running, otherwise the p2p.Server will hang.
+		// Because the handler of events might potentially hang on the
+		// server, deadlocking if this loop is waiting for the global lock.
+		// NOTE: this code probably needs to be refactored and simplified
+		// as it's difficult to follow the asynchronous nature of it.
+		p.timeoutMu.RLock()
 		timeout := p.timeout
-		p.mu.RUnlock()
+		p.timeoutMu.RUnlock()
 
 		select {
 		case <-p.quit: