fix req/resp protocol (#1621)

per spec, we must half-close the request stream - not doing so may prevent
the other end from ever starting to process our request, leading to
timeouts.

In particular, this fixes many sync problems that have been seen on
medalla.

* remove safeClose - close no longer raises
* use per-chunk timeouts in request processing
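
A minimal, self-contained sketch of the corrected request flow, using only
chronos - Stream, writeChunk, close and readResponse below are simplified
stand-ins for illustration, not the actual nim-libp2p / eth2_network API:

import chronos

type Stream = ref object
  requestDone: bool   # set once the requester half-closes its write side

proc writeChunk(s: Stream, payload: seq[byte]) {.async.} =
  discard # pretend to write one length-prefixed, snappy-framed chunk

proc close(s: Stream) {.async.} =
  # stands in for half-closing our write side: signals end-of-request
  s.requestDone = true

proc readResponse(s: Stream): Future[seq[byte]] {.async.} =
  # the responder only answers once it has seen end-of-request
  while not s.requestDone:
    await sleepAsync(1.milliseconds)
  return @[byte 1, 2, 3]

proc makeRequest(s: Stream, request: seq[byte]): Future[seq[byte]] {.async.} =
  try:
    await s.writeChunk(request)
    # Half-close to mark the end of the request - without this, the peer
    # may never start processing it and the read below eventually times out.
    await s.close()
    return await s.readResponse()
  finally:
    await s.close()

echo waitFor makeRequest(Stream(), @[byte 0])
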
Jacek Sieka 2020-09-10 21:40:09 +02:00 committed by GitHub
parent 697bd23c9b
commit a087909683
3 changed files with 30 additions and 15 deletions


@@ -250,10 +250,6 @@ declarePublicCounter nbc_timeout_dials,
 declarePublicGauge nbc_peers,
   "Number of active libp2p peers"
 
-proc safeClose(conn: Connection) {.async.} =
-  if not conn.closed:
-    await close(conn)
-
 const
   snappy_implementation {.strdefine.} = "libp2p"
@@ -470,7 +466,7 @@ proc sendNotificationMsg(peer: Peer, protocolId: string, requestBytes: Bytes) {.
   try:
     await stream.writeChunk(none ResponseCode, requestBytes)
   finally:
-    await safeClose(stream)
+    await stream.close()
 
 proc sendResponseChunkBytes(response: UntypedResponse, payload: Bytes) {.async.} =
   inc response.writtenChunks
@@ -505,14 +501,16 @@ proc makeEth2Request(peer: Peer, protocolId: string, requestBytes: Bytes,
   try:
     # Send the request
     await stream.writeChunk(none ResponseCode, requestBytes)
+    # Half-close the stream to mark the end of the request - if this is not
+    # done, the other peer might never send us the response.
+    await stream.close()
 
     # Read the response
-    return awaitWithTimeout(
-      readResponse(when useNativeSnappy: libp2pInput(stream) else: stream,
-                   peer, ResponseMsg),
-      deadline, neterr(ReadResponseTimeout))
+    return
+      await readResponse(when useNativeSnappy: libp2pInput(stream) else: stream,
+                         peer, ResponseMsg, timeout)
   finally:
-    await safeClose(stream)
+    await stream.close()
 
 proc init*[MsgType](T: type MultipleChunksResponse[MsgType],
                     peer: Peer, conn: Connection): T =
@@ -676,7 +674,7 @@ proc handleIncomingStream(network: Eth2Node,
     debug "Error processing an incoming request", err = err.msg, msgName
   finally:
-    await safeClose(conn)
+    await conn.close()
 
 proc handleOutgoingPeer(peer: Peer): Future[bool] {.async.} =
   let network = peer.network


@@ -155,17 +155,34 @@ proc readResponseChunk(conn: Connection, peer: Peer,
     return neterr UnexpectedEOF
 
 proc readResponse(conn: Connection, peer: Peer,
-                  MsgType: type): Future[NetRes[MsgType]] {.gcsafe, async.} =
+                  MsgType: type, timeout: Duration): Future[NetRes[MsgType]] {.gcsafe, async.} =
   when MsgType is seq:
     type E = ElemType(MsgType)
     var results: MsgType
     while true:
-      let nextRes = await conn.readResponseChunk(peer, E)
+      # Because we interleave networking with response processing, it may
+      # happen that reading all chunks takes longer than a strict deadline
+      # timeout would allow, so we allow each chunk a new timeout instead.
+      # The problem is exacerbated by the large number of round-trips to the
+      # poll loop that each future along the way causes.
+      trace "reading chunk", conn
+      let nextFut = conn.readResponseChunk(peer, E)
+      if not await nextFut.withTimeout(timeout):
+        return neterr(ReadResponseTimeout)
+      let nextRes = nextFut.read()
       if nextRes.isErr:
         if nextRes.error.kind == PotentiallyExpectedEOF:
+          trace "EOF chunk", conn, err = nextRes.error
           return ok results
+        trace "Error chunk", conn, err = nextRes.error
         return err nextRes.error
       else:
+        trace "Got chunk", conn
         results.add nextRes.value
   else:
-    return await conn.readResponseChunk(peer, MsgType)
+    let nextFut = conn.readResponseChunk(peer, MsgType)
+    if not await nextFut.withTimeout(timeout):
+      return neterr(ReadResponseTimeout)
+    return nextFut.read()
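
The per-chunk timeout pattern above can be illustrated with a self-contained,
chronos-only sketch - nextChunk here is a hypothetical stand-in for
readResponseChunk:

import chronos

proc nextChunk(i: int): Future[int] {.async.} =
  # simulate per-chunk network latency
  await sleepAsync(20.milliseconds)
  return i

proc readAllChunks(count: int, timeout: Duration): Future[seq[int]] {.async.} =
  var results: seq[int]
  for i in 0 ..< count:
    let fut = nextChunk(i)
    # each chunk gets a fresh timeout instead of sharing one overall
    # deadline, so a long but live transfer is not cut off midway
    if not await fut.withTimeout(timeout):
      raise newException(ValueError, "chunk timed out")
    results.add fut.read()
  return results

echo waitFor readAllChunks(5, 100.milliseconds)

With a single overall deadline, the combined latency of all chunks would have
to fit one budget; here only an individual stalled chunk trips the timeout.
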


@@ -633,7 +633,7 @@ proc getBlocks*[A, B](man: SyncManager[A, B], peer: A,
       debug "Error, while reading getBlocks response",
             peer = peer, slot = req.slot, count = req.count,
             step = req.step, peer_speed = peer.netKbps(),
-            topics = "syncman", error = res.error()
+            topics = "syncman", error = $res.error()
     result = res
 
 template headAge(): uint64 =