From 4685b9eaa963dbaf62702f6bf711debf44798c96 Mon Sep 17 00:00:00 2001 From: Andrea Maria Piana Date: Mon, 1 Feb 2021 14:39:41 +0100 Subject: [PATCH] Handle deadlock in peerpool There was another deadlock in the peer pool. Because we made the event handler asynchrnous, another deadlock popped up, as the loop locks the global peerpool lock before processing events. But the handlers also take the global look, effectively resulting in the same situation we had before, i.e the loop is not running. THE LOOP MUST BE RUNNING AT ALL TIMES OTHERWISE THE SERVER HANGS. --- metrics/node/subscribe.go | 2 +- peers/peerpool.go | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/metrics/node/subscribe.go b/metrics/node/subscribe.go index c40478c38..87cca0e83 100644 --- a/metrics/node/subscribe.go +++ b/metrics/node/subscribe.go @@ -38,7 +38,7 @@ func SubscribeServerEvents(ctx context.Context, node *node.Node) error { // https://github.com/status-im/status-go/blob/e60f425b45d00d3880b42fdd77b460ec465a9f55/vendor/github.com/ethereum/go-ethereum/p2p/server.go#L746 // If there's back-pressure on the peer event feed // https://github.com/status-im/status-go/blob/e60f425b45d00d3880b42fdd77b460ec465a9f55/vendor/github.com/ethereum/go-ethereum/p2p/server.go#L783 - // The event channel above might become while updateNodeMetrics + // The event channel above might become full while updateNodeMetrics // is called, which means is never consumed, the server blocks on publishing on // it, and the two will deadlock (server waits for the channel above to be consumed, // this code waits for the server to respond to peerCount, which is in the same diff --git a/peers/peerpool.go b/peers/peerpool.go index 81b69213a..143135218 100644 --- a/peers/peerpool.go +++ b/peers/peerpool.go @@ -103,6 +103,7 @@ type PeerPool struct { cache *Cache mu sync.RWMutex + timeoutMu sync.RWMutex topics []TopicPoolInterface serverSubscription event.Subscription events chan *p2p.PeerEvent @@ -123,6 +124,8 @@ func NewPeerPool(discovery discovery.Discovery, config map[discv5.Topic]params.L } func (p *PeerPool) setDiscoveryTimeout() { + p.timeoutMu.Lock() + defer p.timeoutMu.Unlock() if p.opts.AllowStop && p.opts.DiscServerTimeout > 0 { p.timeout = time.After(p.opts.DiscServerTimeout) } @@ -217,9 +220,9 @@ func (p *PeerPool) stopDiscovery(server *p2p.Server) { t.StopSearch(server) } - p.mu.Lock() + p.timeoutMu.Lock() p.timeout = nil - p.mu.Unlock() + p.timeoutMu.Unlock() signal.SendDiscoveryStopped() } @@ -275,9 +278,15 @@ func (p *PeerPool) handleServerPeers(server *p2p.Server, events <-chan *p2p.Peer } for { - p.mu.RLock() + // We use a separate lock for timeout, as this loop should + // always be running, otherwise the p2p.Server will hang. + // Because the handler of events might potentially hang on the + // server, deadlocking if this loop is waiting for the global lock. + // NOTE: this code probably needs to be refactored and simplified + // as it's difficult to follow the asynchronous nature of it. + p.timeoutMu.RLock() timeout := p.timeout - p.mu.RUnlock() + p.timeoutMu.RUnlock() select { case <-p.quit: