diff --git a/metrics/node/subscribe.go b/metrics/node/subscribe.go
index c40478c38..87cca0e83 100644
--- a/metrics/node/subscribe.go
+++ b/metrics/node/subscribe.go
@@ -38,7 +38,7 @@ func SubscribeServerEvents(ctx context.Context, node *node.Node) error {
 	// https://github.com/status-im/status-go/blob/e60f425b45d00d3880b42fdd77b460ec465a9f55/vendor/github.com/ethereum/go-ethereum/p2p/server.go#L746
 	// If there's back-pressure on the peer event feed
 	// https://github.com/status-im/status-go/blob/e60f425b45d00d3880b42fdd77b460ec465a9f55/vendor/github.com/ethereum/go-ethereum/p2p/server.go#L783
-	// The event channel above might become while updateNodeMetrics
+	// The event channel above might become full while updateNodeMetrics
 	// is called, which means is never consumed, the server blocks on publishing on
 	// it, and the two will deadlock (server waits for the channel above to be consumed,
 	// this code waits for the server to respond to peerCount, which is in the same
diff --git a/peers/peerpool.go b/peers/peerpool.go
index 81b69213a..143135218 100644
--- a/peers/peerpool.go
+++ b/peers/peerpool.go
@@ -103,6 +103,7 @@ type PeerPool struct {
 	cache *Cache
 
 	mu                 sync.RWMutex
+	timeoutMu          sync.RWMutex
 	topics             []TopicPoolInterface
 	serverSubscription event.Subscription
 	events             chan *p2p.PeerEvent
@@ -123,6 +124,8 @@ func NewPeerPool(discovery discovery.Discovery, config map[discv5.Topic]params.L
 }
 
 func (p *PeerPool) setDiscoveryTimeout() {
+	p.timeoutMu.Lock()
+	defer p.timeoutMu.Unlock()
 	if p.opts.AllowStop && p.opts.DiscServerTimeout > 0 {
 		p.timeout = time.After(p.opts.DiscServerTimeout)
 	}
@@ -217,9 +220,9 @@ func (p *PeerPool) stopDiscovery(server *p2p.Server) {
 		t.StopSearch(server)
 	}
 
-	p.mu.Lock()
+	p.timeoutMu.Lock()
 	p.timeout = nil
-	p.mu.Unlock()
+	p.timeoutMu.Unlock()
 
 	signal.SendDiscoveryStopped()
 }
@@ -275,9 +278,15 @@ func (p *PeerPool) handleServerPeers(server *p2p.Server, events <-chan *p2p.Peer
 	}
 
 	for {
-		p.mu.RLock()
+		// We use a separate lock for the timeout because this loop must
+		// always keep running; otherwise the p2p.Server will hang.
+		// The event handler might itself block on the server, which would
+		// deadlock if this loop were waiting for the global lock.
+		// NOTE: this code probably needs to be refactored and simplified,
+		// as its asynchronous nature is difficult to follow.
+		p.timeoutMu.RLock()
 		timeout := p.timeout
-		p.mu.RUnlock()
+		p.timeoutMu.RUnlock()
 
 		select {
 		case <-p.quit:
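
For context, below is a minimal, self-contained sketch of the locking pattern this patch introduces: the event-draining loop reads the timeout under its own narrow lock, so it keeps running even while the global lock is held. Pool, eventLoop, and the string events are hypothetical reductions of PeerPool and handleServerPeers, not the actual status-go types.

// Minimal sketch of the locking pattern from the patch above. Pool,
// eventLoop, and the string events are hypothetical stand-ins for
// PeerPool and handleServerPeers, not the real status-go types.
package main

import (
	"fmt"
	"sync"
	"time"
)

type Pool struct {
	mu        sync.RWMutex // global lock; may be held for a long time
	timeoutMu sync.RWMutex // guards only the timeout channel
	timeout   <-chan time.Time
}

func (p *Pool) setDiscoveryTimeout(d time.Duration) {
	p.timeoutMu.Lock()
	defer p.timeoutMu.Unlock()
	p.timeout = time.After(d)
}

// eventLoop mirrors the shape of handleServerPeers: it reads the
// timeout under the narrow timeoutMu, never the global mu, so it can
// keep draining events even while mu is held elsewhere.
func (p *Pool) eventLoop(events <-chan string, quit <-chan struct{}) {
	for {
		p.timeoutMu.RLock()
		timeout := p.timeout
		p.timeoutMu.RUnlock()

		select {
		case <-quit:
			return
		case <-timeout:
			fmt.Println("discovery timed out")
			return
		case e := <-events:
			fmt.Println("event:", e)
		}
	}
}

func main() {
	p := &Pool{}
	p.setDiscoveryTimeout(100 * time.Millisecond)

	// Hold the global lock for the whole run, standing in for a
	// handler that blocks on the server while holding it.
	p.mu.Lock()
	defer p.mu.Unlock()

	events := make(chan string, 1)
	events <- "peer added"
	p.eventLoop(events, make(chan struct{}))
}

If eventLoop instead read p.timeout under p.mu.RLock, this program would deadlock: main holds the write lock, the loop blocks acquiring the read lock, and the events channel is never drained, which is the same shape as the p2p.Server hang the patch describes.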