mirror of
https://github.com/status-im/consul.git
synced 2025-01-22 19:50:36 +00:00
agent: fix 'consul leave' shutdown race (#2880)
When the agent is triggered to shutdown via an external 'consul leave' command delivered via the HTTP API then the client expects to receive a response when the agent is down. This creates a race on when to shutdown the agent itself like the RPC server, the checks and the state and the external endpoints like DNS and HTTP. This patch splits the shutdown process into two parts: * shutdown the agent * shutdown the endpoints (http and dns) They can be executed multiple times, concurrently and in any order but should be executed first agent, then endpoints to provide consistent behavior across all use cases. Both calls have to be executed for a proper shutdown. This could be partially hidden in a single function but would introduce some magic that happens behind the scenes which one has to know of but isn't obvious. Fixes #2880
This commit is contained in:
parent
7abe308c66
commit
ea5b0f2c7c
@ -1112,9 +1112,10 @@ func (a *Agent) Leave() error {
|
||||
return a.delegate.Leave()
|
||||
}
|
||||
|
||||
// Shutdown is used to hard stop the agent. Should be
|
||||
// preceded by a call to Leave to do it gracefully.
|
||||
func (a *Agent) Shutdown() error {
|
||||
// ShutdownAgent is used to hard stop the agent. Should be preceded by
|
||||
// Leave to do it gracefully. Should be followed by ShutdownEndpoints to
|
||||
// terminate the HTTP and DNS servers as well.
|
||||
func (a *Agent) ShutdownAgent() error {
|
||||
a.shutdownLock.Lock()
|
||||
defer a.shutdownLock.Unlock()
|
||||
|
||||
@ -1123,29 +1124,6 @@ func (a *Agent) Shutdown() error {
|
||||
}
|
||||
a.logger.Println("[INFO] agent: Requesting shutdown")
|
||||
|
||||
// Stop all API endpoints
|
||||
for _, srv := range a.dnsServers {
|
||||
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
|
||||
srv.Shutdown()
|
||||
}
|
||||
for _, srv := range a.httpServers {
|
||||
// http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2"
|
||||
// the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is
|
||||
// no longer sufficient to check for an HTTPS server.
|
||||
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
|
||||
|
||||
// graceful shutdown
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
defer cancel()
|
||||
srv.Shutdown(ctx)
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
|
||||
}
|
||||
}
|
||||
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
|
||||
a.wgServers.Wait()
|
||||
a.logger.Print("[INFO] agent: Endpoints down")
|
||||
|
||||
// Stop all the checks
|
||||
a.checkLock.Lock()
|
||||
defer a.checkLock.Unlock()
|
||||
@ -1155,11 +1133,9 @@ func (a *Agent) Shutdown() error {
|
||||
for _, chk := range a.checkTTLs {
|
||||
chk.Stop()
|
||||
}
|
||||
|
||||
for _, chk := range a.checkHTTPs {
|
||||
chk.Stop()
|
||||
}
|
||||
|
||||
for _, chk := range a.checkTCPs {
|
||||
chk.Stop()
|
||||
}
|
||||
@ -1185,6 +1161,38 @@ func (a *Agent) Shutdown() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// ShutdownEndpoints terminates the HTTP and DNS servers. Should be
|
||||
// preceeded by ShutdownAgent.
|
||||
func (a *Agent) ShutdownEndpoints() {
|
||||
a.shutdownLock.Lock()
|
||||
defer a.shutdownLock.Unlock()
|
||||
|
||||
if len(a.dnsServers) == 0 || len(a.httpServers) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for _, srv := range a.dnsServers {
|
||||
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
|
||||
srv.Shutdown()
|
||||
}
|
||||
a.dnsServers = nil
|
||||
|
||||
for _, srv := range a.httpServers {
|
||||
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
defer cancel()
|
||||
srv.Shutdown(ctx)
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
|
||||
}
|
||||
}
|
||||
a.httpServers = nil
|
||||
|
||||
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
|
||||
a.wgServers.Wait()
|
||||
a.logger.Print("[INFO] agent: Endpoints down")
|
||||
}
|
||||
|
||||
// ReloadCh is used to return a channel that can be
|
||||
// used for triggering reloads and returning a response.
|
||||
func (a *Agent) ReloadCh() chan chan error {
|
||||
|
@ -199,7 +199,7 @@ func (s *HTTPServer) AgentLeave(resp http.ResponseWriter, req *http.Request) (in
|
||||
if err := s.agent.Leave(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return nil, s.agent.Shutdown()
|
||||
return nil, s.agent.ShutdownAgent()
|
||||
}
|
||||
|
||||
func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
|
||||
|
@ -156,7 +156,8 @@ func (a *TestAgent) Start() *TestAgent {
|
||||
fmt.Println(id, a.Name, "Error starting agent:", err)
|
||||
runtime.Goexit()
|
||||
} else {
|
||||
agent.Shutdown()
|
||||
agent.ShutdownAgent()
|
||||
agent.ShutdownEndpoints()
|
||||
wait := time.Duration(rand.Int31n(2000)) * time.Millisecond
|
||||
fmt.Println(id, a.Name, "retrying in", wait)
|
||||
time.Sleep(wait)
|
||||
@ -221,7 +222,10 @@ func (a *TestAgent) Shutdown() error {
|
||||
os.RemoveAll(a.DataDir)
|
||||
}
|
||||
}()
|
||||
return a.Agent.Shutdown()
|
||||
|
||||
// shutdown agent before endpoints
|
||||
defer a.Agent.ShutdownEndpoints()
|
||||
return a.Agent.ShutdownAgent()
|
||||
}
|
||||
|
||||
func (a *TestAgent) HTTPAddr() string {
|
||||
|
@ -691,7 +691,10 @@ func (cmd *AgentCommand) run(args []string) int {
|
||||
cmd.UI.Error(fmt.Sprintf("Error starting agent: %s", err))
|
||||
return 1
|
||||
}
|
||||
defer agent.Shutdown()
|
||||
|
||||
// shutdown agent before endpoints
|
||||
defer agent.ShutdownEndpoints()
|
||||
defer agent.ShutdownAgent()
|
||||
|
||||
if !config.DisableUpdateCheck {
|
||||
cmd.startupUpdateCheck(config)
|
||||
|
Loading…
x
Reference in New Issue
Block a user