From 90c83a32b586c7d4add8d8ca0096025ecb886a77 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 19 Jun 2017 20:02:01 +0200 Subject: [PATCH] agent: fix 'consul leave' shutdown race (#2880) When the agent is triggered to shutdown via an external 'consul leave' command delivered via the HTTP API then the client expects to receive a response when the agent is down. This creates a race on when to shutdown the agent itself like the RPC server, the checks and the state and the external endpoints like DNS and HTTP. Ideally, the external endpoints should be shutdown before the internal state but if the goal is to respond reliably that the agent is down then this is not possible. This patch splits the agent shutdown into two parts implemented in a single method to keep it simple and unambiguos for the caller. The first stage shuts down the internal state, checks, RPC server, ... synchronously and then triggers the shutdown of the external endpoints asychronously. This way the caller is guaranteed that the internal state services are down when Shutdown returns and there remains enough time to send a response. Fixes #2880 --- agent/agent.go | 62 +++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 6595600e61..ea2eec52af 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1095,6 +1095,18 @@ func (a *Agent) Leave() error { // Shutdown is used to hard stop the agent. Should be // preceded by a call to Leave to do it gracefully. func (a *Agent) Shutdown() error { + // Execute the shutdown in two stages since it may have been + // triggered through the HTTP server via 'consul leave'. In this + // case the HTTP server needs to be up after the agent is down in + // order to deliver the response. The 'consul leave' handler waits + // for the Shutdown() method to complete before delivering the + // response. + // + // This could also be implemented through multiple methods to + // decouple this in a different way but then this logic would have + // to be implemented everywhere. This way the logic remains in a + // single place. + a.shutdownLock.Lock() defer a.shutdownLock.Unlock() @@ -1103,28 +1115,7 @@ func (a *Agent) Shutdown() error { } a.logger.Println("[INFO] agent: Requesting shutdown") - // Stop all API endpoints - for _, srv := range a.dnsServers { - a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net) - srv.Shutdown() - } - for _, srv := range a.httpServers { - // http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2" - // the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is - // no longer sufficient to check for an HTTPS server. - a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) - - // graceful shutdown - ctx, cancel := context.WithTimeout(context.Background(), time.Second) - defer cancel() - srv.Shutdown(ctx) - if ctx.Err() == context.DeadlineExceeded { - a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) - } - } - a.logger.Println("[INFO] agent: Waiting for endpoints to shut down") - a.wgServers.Wait() - a.logger.Print("[INFO] agent: Endpoints down") + // stage 1: shutdown agent w/o HTTP and DNS endpoints // Stop all the checks a.checkLock.Lock() @@ -1159,9 +1150,32 @@ func (a *Agent) Shutdown() error { a.logger.Println("[WARN] agent: could not delete pid file ", pidErr) } - a.logger.Println("[INFO] agent: shutdown complete") + // stage 2: shutdown HTTP and DNS endpoints and trigger final shutdown notification + + go func() { + // Stop all API endpoints + for _, srv := range a.dnsServers { + a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net) + srv.Shutdown() + } + for _, srv := range a.httpServers { + a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + srv.Shutdown(ctx) + if ctx.Err() == context.DeadlineExceeded { + a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) + } + } + a.logger.Println("[INFO] agent: Waiting for endpoints to shut down") + a.wgServers.Wait() + a.logger.Print("[INFO] agent: Endpoints down") + + a.logger.Println("[INFO] agent: shutdown complete") + close(a.shutdownCh) + }() + a.shutdown = true - close(a.shutdownCh) return err }