From f1d710af006d7e9ed6046ab410976bd20c1e3c97 Mon Sep 17 00:00:00 2001
From: Felix Lange <fjl@twurst.com>
Date: Wed, 8 Apr 2015 17:37:11 +0200
Subject: [PATCH] p2p: fix Peer shutdown deadlocks

There were multiple synchronization issues in the disconnect handling,
all caused by the odd special-casing of Peer.readLoop errors. Remove the
special handling of read errors and make readLoop part of the Peer
WaitGroup.

Thanks to @Gustav-Simonsson for pointing at arrows in a diagram
and playing rubber-duck.
---
 p2p/handshake.go  |   2 +-
 p2p/peer.go       | 131 ++++++++++++++++++++++++----------------------
 p2p/peer_error.go |  10 ++--
 p2p/peer_test.go  |  74 ++++++++++++++++++++++----
 4 files changed, 139 insertions(+), 78 deletions(-)

diff --git a/p2p/handshake.go b/p2p/handshake.go
index 031064407..5a259cd76 100644
--- a/p2p/handshake.go
+++ b/p2p/handshake.go
@@ -409,7 +409,7 @@ func readProtocolHandshake(r MsgReader, our *protoHandshake) (*protoHandshake, e
 		// spec and we send it ourself if Server.addPeer fails.
 		var reason DiscReason
 		rlp.Decode(msg.Payload, &reason)
-		return nil, discRequestedError(reason)
+		return nil, reason
 	}
 	if msg.Code != handshakeMsg {
 		return nil, fmt.Errorf("expected handshake, got %x", msg.Code)
diff --git a/p2p/peer.go b/p2p/peer.go
index 6b97ea58d..a82ee4bca 100644
--- a/p2p/peer.go
+++ b/p2p/peer.go
@@ -44,7 +44,7 @@ type Peer struct {
 	rw      *conn
 	running map[string]*protoRW
 
-	protoWG  sync.WaitGroup
+	wg       sync.WaitGroup
 	protoErr chan error
 	closed   chan struct{}
 	disc     chan DiscReason
@@ -102,58 +102,50 @@ func (p *Peer) String() string {
 
 func newPeer(fd net.Conn, conn *conn, protocols []Protocol) *Peer {
 	logtag := fmt.Sprintf("Peer %.8x %v", conn.ID[:], fd.RemoteAddr())
+	protomap := matchProtocols(protocols, conn.Caps, conn)
 	p := &Peer{
 		Logger:   logger.NewLogger(logtag),
 		conn:     fd,
 		rw:       conn,
-		running:  matchProtocols(protocols, conn.Caps, conn),
+		running:  protomap,
 		disc:     make(chan DiscReason),
-		protoErr: make(chan error),
+		protoErr: make(chan error, len(protomap)+1), // protocols + pingLoop
 		closed:   make(chan struct{}),
 	}
 	return p
 }
 
 func (p *Peer) run() DiscReason {
-	var readErr = make(chan error, 1)
-	defer p.closeProtocols()
-	defer close(p.closed)
+	readErr := make(chan error, 1)
+	p.wg.Add(2)
+	go p.readLoop(readErr)
+	go p.pingLoop()
 
 	p.startProtocols()
-	go func() { readErr <- p.readLoop() }()
-
-	ping := time.NewTicker(pingInterval)
-	defer ping.Stop()
 
 	// Wait for an error or disconnect.
 	var reason DiscReason
-loop:
-	for {
-		select {
-		case <-ping.C:
-			go func() {
-				if err := SendItems(p.rw, pingMsg); err != nil {
-					p.protoErr <- err
-					return
-				}
-			}()
-		case err := <-readErr:
-			// We rely on protocols to abort if there is a write error. It
-			// might be more robust to handle them here as well.
-			p.DebugDetailf("Read error: %v\n", err)
-			p.conn.Close()
-			return DiscNetworkError
-		case err := <-p.protoErr:
-			reason = discReasonForError(err)
-			break loop
-		case reason = <-p.disc:
-			break loop
+	select {
+	case err := <-readErr:
+		if r, ok := err.(DiscReason); ok {
+			reason = r
+			break
 		}
+		// Note: We rely on protocols to abort if there is a write
+		// error. It might be more robust to handle them here as well.
+		p.DebugDetailf("Read error: %v\n", err)
+		p.conn.Close()
+		reason = DiscNetworkError
+	case err := <-p.protoErr:
+		reason = discReasonForError(err)
+	case reason = <-p.disc:
 	}
-	p.politeDisconnect(reason)
 
-	// Wait for readLoop. It will end because conn is now closed.
-	<-readErr
+	close(p.closed)
+	p.wg.Wait()
+	if reason != DiscNetworkError {
+		p.politeDisconnect(reason)
+	}
 	p.Debugf("Disconnected: %v\n", reason)
 	return reason
 }
@@ -174,18 +166,37 @@ func (p *Peer) politeDisconnect(reason DiscReason) {
 	p.conn.Close()
 }
 
-func (p *Peer) readLoop() error {
+func (p *Peer) pingLoop() {
+	ping := time.NewTicker(pingInterval)
+	defer p.wg.Done()
+	defer ping.Stop()
+	for {
+		select {
+		case <-ping.C:
+			if err := SendItems(p.rw, pingMsg); err != nil {
+				p.protoErr <- err
+				return
+			}
+		case <-p.closed:
+			return
+		}
+	}
+}
+
+func (p *Peer) readLoop(errc chan<- error) {
+	defer p.wg.Done()
 	for {
 		p.conn.SetDeadline(time.Now().Add(frameReadTimeout))
 		msg, err := p.rw.ReadMsg()
 		if err != nil {
-			return err
+			errc <- err
+			return
 		}
 		if err = p.handle(msg); err != nil {
-			return err
+			errc <- err
+			return
 		}
 	}
-	return nil
 }
 
 func (p *Peer) handle(msg Msg) error {
@@ -195,12 +206,11 @@ func (p *Peer) handle(msg Msg) error {
 		go SendItems(p.rw, pongMsg)
 	case msg.Code == discMsg:
 		var reason [1]DiscReason
-		// no need to discard or for error checking, we'll close the
-		// connection after this.
+		// This is the last message. We don't need to discard or
+		// check errors because, the connection will be closed after it.
 		rlp.Decode(msg.Payload, &reason)
 		p.Debugf("Disconnect requested: %v\n", reason[0])
-		p.Disconnect(DiscRequested)
-		return discRequestedError(reason[0])
+		return DiscRequested
 	case msg.Code < baseProtocolLength:
 		// ignore other base protocol messages
 		return msg.Discard()
@@ -210,7 +220,12 @@ func (p *Peer) handle(msg Msg) error {
 		if err != nil {
 			return fmt.Errorf("msg code out of range: %v", msg.Code)
 		}
-		proto.in <- msg
+		select {
+		case proto.in <- msg:
+			return nil
+		case <-p.closed:
+			return io.EOF
+		}
 	}
 	return nil
 }
@@ -234,10 +249,11 @@ outer:
 }
 
 func (p *Peer) startProtocols() {
+	p.wg.Add(len(p.running))
 	for _, proto := range p.running {
 		proto := proto
+		proto.closed = p.closed
 		p.DebugDetailf("Starting protocol %s/%d\n", proto.Name, proto.Version)
-		p.protoWG.Add(1)
 		go func() {
 			err := proto.Run(p, proto)
 			if err == nil {
@@ -246,11 +262,8 @@ func (p *Peer) startProtocols() {
 			} else {
 				p.DebugDetailf("Protocol %s/%d error: %v\n", proto.Name, proto.Version, err)
 			}
-			select {
-			case p.protoErr <- err:
-			case <-p.closed:
-			}
-			p.protoWG.Done()
+			p.protoErr <- err
+			p.wg.Done()
 		}()
 	}
 }
@@ -266,13 +279,6 @@ func (p *Peer) getProto(code uint64) (*protoRW, error) {
 	return nil, newPeerError(errInvalidMsgCode, "%d", code)
 }
 
-func (p *Peer) closeProtocols() {
-	for _, p := range p.running {
-		close(p.in)
-	}
-	p.protoWG.Wait()
-}
-
 // writeProtoMsg sends the given message on behalf of the given named protocol.
 // this exists because of Server.Broadcast.
 func (p *Peer) writeProtoMsg(protoName string, msg Msg) error {
@@ -289,8 +295,8 @@ func (p *Peer) writeProtoMsg(protoName string, msg Msg) error {
 
 type protoRW struct {
 	Protocol
-
 	in     chan Msg
+	closed <-chan struct{}
 	offset uint64
 	w      MsgWriter
 }
@@ -304,10 +310,11 @@ func (rw *protoRW) WriteMsg(msg Msg) error {
 }
 
 func (rw *protoRW) ReadMsg() (Msg, error) {
-	msg, ok := <-rw.in
-	if !ok {
-		return msg, io.EOF
+	select {
+	case msg := <-rw.in:
+		msg.Code -= rw.offset
+		return msg, nil
+	case <-rw.closed:
+		return Msg{}, io.EOF
 	}
-	msg.Code -= rw.offset
-	return msg, nil
 }
diff --git a/p2p/peer_error.go b/p2p/peer_error.go
index 0ff4f4b43..402131630 100644
--- a/p2p/peer_error.go
+++ b/p2p/peer_error.go
@@ -98,15 +98,13 @@ func (d DiscReason) String() string {
 	return discReasonToString[d]
 }
 
-type discRequestedError DiscReason
-
-func (err discRequestedError) Error() string {
-	return fmt.Sprintf("disconnect requested: %v", DiscReason(err))
+func (d DiscReason) Error() string {
+	return d.String()
 }
 
 func discReasonForError(err error) DiscReason {
-	if reason, ok := err.(discRequestedError); ok {
-		return DiscReason(reason)
+	if reason, ok := err.(DiscReason); ok {
+		return reason
 	}
 	peerError, ok := err.(*peerError)
 	if !ok {
diff --git a/p2p/peer_test.go b/p2p/peer_test.go
index 3c4c71c0c..fb76818a0 100644
--- a/p2p/peer_test.go
+++ b/p2p/peer_test.go
@@ -2,8 +2,9 @@ package p2p
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
-	"io"
+	"math/rand"
 	"net"
 	"reflect"
 	"testing"
@@ -27,7 +28,7 @@ var discard = Protocol{
 	},
 }
 
-func testPeer(protos []Protocol) (io.Closer, *conn, *Peer, <-chan DiscReason) {
+func testPeer(protos []Protocol) (func(), *conn, *Peer, <-chan DiscReason) {
 	fd1, _ := net.Pipe()
 	hs1 := &protoHandshake{ID: randomID(), Version: baseProtocolVersion}
 	hs2 := &protoHandshake{ID: randomID(), Version: baseProtocolVersion}
@@ -41,7 +42,11 @@ func testPeer(protos []Protocol) (io.Closer, *conn, *Peer, <-chan DiscReason) {
 	errc := make(chan DiscReason, 1)
 	go func() { errc <- peer.run() }()
 
-	return p1, &conn{p2, hs2}, peer, errc
+	closer := func() {
+		p1.Close()
+		fd1.Close()
+	}
+	return closer, &conn{p2, hs2}, peer, errc
 }
 
 func TestPeerProtoReadMsg(t *testing.T) {
@@ -67,7 +72,7 @@ func TestPeerProtoReadMsg(t *testing.T) {
 	}
 
 	closer, rw, _, errc := testPeer([]Protocol{proto})
-	defer closer.Close()
+	defer closer()
 
 	Send(rw, baseProtocolLength+2, []uint{1})
 	Send(rw, baseProtocolLength+3, []uint{2})
@@ -99,7 +104,7 @@ func TestPeerProtoEncodeMsg(t *testing.T) {
 		},
 	}
 	closer, rw, _, _ := testPeer([]Protocol{proto})
-	defer closer.Close()
+	defer closer()
 
 	if err := ExpectMsg(rw, 17, []string{"foo", "bar"}); err != nil {
 		t.Error(err)
@@ -110,7 +115,7 @@ func TestPeerWriteForBroadcast(t *testing.T) {
 	defer testlog(t).detach()
 
 	closer, rw, peer, peerErr := testPeer([]Protocol{discard})
-	defer closer.Close()
+	defer closer()
 
 	emptymsg := func(code uint64) Msg {
 		return Msg{Code: code, Size: 0, Payload: bytes.NewReader(nil)}
@@ -150,7 +155,7 @@ func TestPeerPing(t *testing.T) {
 	defer testlog(t).detach()
 
 	closer, rw, _, _ := testPeer(nil)
-	defer closer.Close()
+	defer closer()
 	if err := SendItems(rw, pingMsg); err != nil {
 		t.Fatal(err)
 	}
@@ -163,19 +168,70 @@ func TestPeerDisconnect(t *testing.T) {
 	defer testlog(t).detach()
 
 	closer, rw, _, disc := testPeer(nil)
-	defer closer.Close()
+	defer closer()
 	if err := SendItems(rw, discMsg, DiscQuitting); err != nil {
 		t.Fatal(err)
 	}
 	if err := ExpectMsg(rw, discMsg, []interface{}{DiscRequested}); err != nil {
 		t.Error(err)
 	}
-	closer.Close() // make test end faster
+	closer()
 	if reason := <-disc; reason != DiscRequested {
 		t.Errorf("run returned wrong reason: got %v, want %v", reason, DiscRequested)
 	}
 }
 
+// This test is supposed to verify that Peer can reliably handle
+// multiple causes of disconnection occurring at the same time.
+func TestPeerDisconnectRace(t *testing.T) {
+	defer testlog(t).detach()
+	maybe := func() bool { return rand.Intn(1) == 1 }
+
+	for i := 0; i < 1000; i++ {
+		protoclose := make(chan error)
+		protodisc := make(chan DiscReason)
+		closer, rw, p, disc := testPeer([]Protocol{
+			{
+				Name:   "closereq",
+				Run:    func(p *Peer, rw MsgReadWriter) error { return <-protoclose },
+				Length: 1,
+			},
+			{
+				Name:   "disconnect",
+				Run:    func(p *Peer, rw MsgReadWriter) error { p.Disconnect(<-protodisc); return nil },
+				Length: 1,
+			},
+		})
+
+		// Simulate incoming messages.
+		go SendItems(rw, baseProtocolLength+1)
+		go SendItems(rw, baseProtocolLength+2)
+		// Close the network connection.
+		go closer()
+		// Make protocol "closereq" return.
+		protoclose <- errors.New("protocol closed")
+		// Make protocol "disconnect" call peer.Disconnect
+		protodisc <- DiscAlreadyConnected
+		// In some cases, simulate something else calling peer.Disconnect.
+		if maybe() {
+			go p.Disconnect(DiscInvalidIdentity)
+		}
+		// In some cases, simulate remote requesting a disconnect.
+		if maybe() {
+			go SendItems(rw, discMsg, DiscQuitting)
+		}
+
+		select {
+		case <-disc:
+		case <-time.After(2 * time.Second):
+			// Peer.run should return quickly. If it doesn't the Peer
+			// goroutines are probably deadlocked. Call panic in order to
+			// show the stacks.
+			panic("Peer.run took to long to return.")
+		}
+	}
+}
+
 func TestNewPeer(t *testing.T) {
 	name := "nodename"
 	caps := []Cap{{"foo", 2}, {"bar", 3}}