agent: fix 'consul leave' shutdown race (#2880)

When the agent is triggered to shutdown via an external 'consul leave'
command delivered via the HTTP API then the client expects to receive a
response when the agent is down. This creates a race on when to shutdown
the agent itself like the RPC server, the checks and the state and the
external endpoints like DNS and HTTP.

This patch splits the shutdown process into two parts:

 * shutdown the agent
 * shutdown the endpoints (http and dns)

They can be executed multiple times, concurrently and in any order but
should be executed first agent, then endpoints to provide consistent
behavior across all use cases. Both calls have to be executed for a
proper shutdown.

This could be partially hidden in a single function but would introduce
some magic that happens behind the scenes which one has to know of but
isn't obvious.

Fixes #2880
This commit is contained in:
Frank Schroeder 2017-06-20 09:29:20 +02:00 committed by Frank Schröder
parent 7abe308c66
commit ea5b0f2c7c
4 changed files with 47 additions and 32 deletions

View File

@ -1112,9 +1112,10 @@ func (a *Agent) Leave() error {
return a.delegate.Leave()
}
// Shutdown is used to hard stop the agent. Should be
// preceded by a call to Leave to do it gracefully.
func (a *Agent) Shutdown() error {
// ShutdownAgent is used to hard stop the agent. Should be preceded by
// Leave to do it gracefully. Should be followed by ShutdownEndpoints to
// terminate the HTTP and DNS servers as well.
func (a *Agent) ShutdownAgent() error {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
@ -1123,29 +1124,6 @@ func (a *Agent) Shutdown() error {
}
a.logger.Println("[INFO] agent: Requesting shutdown")
// Stop all API endpoints
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
for _, srv := range a.httpServers {
// http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2"
// the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is
// no longer sufficient to check for an HTTPS server.
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
// graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
// Stop all the checks
a.checkLock.Lock()
defer a.checkLock.Unlock()
@ -1155,11 +1133,9 @@ func (a *Agent) Shutdown() error {
for _, chk := range a.checkTTLs {
chk.Stop()
}
for _, chk := range a.checkHTTPs {
chk.Stop()
}
for _, chk := range a.checkTCPs {
chk.Stop()
}
@ -1185,6 +1161,38 @@ func (a *Agent) Shutdown() error {
return err
}
// ShutdownEndpoints terminates the HTTP and DNS servers. Should be
// preceeded by ShutdownAgent.
func (a *Agent) ShutdownEndpoints() {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
if len(a.dnsServers) == 0 || len(a.httpServers) == 0 {
return
}
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
a.dnsServers = nil
for _, srv := range a.httpServers {
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.httpServers = nil
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
}
// ReloadCh is used to return a channel that can be
// used for triggering reloads and returning a response.
func (a *Agent) ReloadCh() chan chan error {

View File

@ -199,7 +199,7 @@ func (s *HTTPServer) AgentLeave(resp http.ResponseWriter, req *http.Request) (in
if err := s.agent.Leave(); err != nil {
return nil, err
}
return nil, s.agent.Shutdown()
return nil, s.agent.ShutdownAgent()
}
func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request) (interface{}, error) {

View File

@ -156,7 +156,8 @@ func (a *TestAgent) Start() *TestAgent {
fmt.Println(id, a.Name, "Error starting agent:", err)
runtime.Goexit()
} else {
agent.Shutdown()
agent.ShutdownAgent()
agent.ShutdownEndpoints()
wait := time.Duration(rand.Int31n(2000)) * time.Millisecond
fmt.Println(id, a.Name, "retrying in", wait)
time.Sleep(wait)
@ -221,7 +222,10 @@ func (a *TestAgent) Shutdown() error {
os.RemoveAll(a.DataDir)
}
}()
return a.Agent.Shutdown()
// shutdown agent before endpoints
defer a.Agent.ShutdownEndpoints()
return a.Agent.ShutdownAgent()
}
func (a *TestAgent) HTTPAddr() string {

View File

@ -691,7 +691,10 @@ func (cmd *AgentCommand) run(args []string) int {
cmd.UI.Error(fmt.Sprintf("Error starting agent: %s", err))
return 1
}
defer agent.Shutdown()
// shutdown agent before endpoints
defer agent.ShutdownEndpoints()
defer agent.ShutdownAgent()
if !config.DisableUpdateCheck {
cmd.startupUpdateCheck(config)