Kill check processes after the timeout is reached (#3567)

* Kill check processes after the timeout is reached

Kill the subprocess spawned by a script check once the timeout is reached. Previously Consul just marked the check critical and left the subprocess around.

Fixes #3565.

* Set err to non-nil when timeout occurs

* Fix check timeout test

* Kill entire process subtree on check timeout

* Add a docs note about windows subprocess termination
This commit is contained in:
Kyle Havlovitz 2017-10-11 11:57:39 -07:00 committed by James Phillips
parent aaf6b17a4d
commit 106b8b0b33
4 changed files with 47 additions and 15 deletions

View File

@ -104,13 +104,16 @@ func (c *CheckMonitor) check() {
// Create the command
var cmd *exec.Cmd
var err error
var cmdDisplay string
if len(c.ScriptArgs) > 0 {
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
cmd, err = ExecSubprocess(c.ScriptArgs)
} else {
cmdDisplay = c.Script
cmd, err = ExecScript(c.Script)
}
if err != nil {
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
@ -119,28 +122,39 @@ func (c *CheckMonitor) check() {
output, _ := circbuf.NewBuffer(CheckBufSize)
cmd.Stdout = output
cmd.Stderr = output
SetSysProcAttr(cmd)
// Start the check
if err := cmd.Start(); err != nil {
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
// Wait for the check to complete
errCh := make(chan error, 2)
waitCh := make(chan error, 1)
go func() {
errCh <- cmd.Wait()
waitCh <- cmd.Wait()
}()
go func() {
timeout := 30 * time.Second
if c.Timeout > 0 {
time.Sleep(c.Timeout)
} else {
time.Sleep(30 * time.Second)
timeout = c.Timeout
}
select {
case <-time.After(timeout):
if err := KillCommandSubtree(cmd); err != nil {
c.Logger.Printf("[WARN] Timed out running check '%s': error killing process: %v", cmdDisplay, err)
} else {
c.Logger.Printf("[WARN] Timed out (%s) running check '%s'", timeout.String(), cmdDisplay)
}
err = fmt.Errorf("Timed out running check '%s'", cmdDisplay)
<-waitCh
case err = <-waitCh:
// The process returned before the timeout, proceed normally
}
errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
}()
err = <-errCh
// Get the output, add a message about truncation
outputStr := string(output.Bytes())
@ -150,7 +164,7 @@ func (c *CheckMonitor) check() {
}
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
c.CheckID, c.Script, outputStr)
c.CheckID, cmdDisplay, outputStr)
// Check if the check passed
if err == nil {

View File

@ -5,6 +5,7 @@ package agent
import (
"os"
"os/exec"
"syscall"
)
// ExecScript returns a command to execute a script through a shell.
@ -15,3 +16,12 @@ func ExecScript(script string) (*exec.Cmd, error) {
}
return exec.Command(shell, "-c", script), nil
}
func SetSysProcAttr(cmd *exec.Cmd) {
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
}
// KillCommandSubtree kills the command process and any child processes
func KillCommandSubtree(cmd *exec.Cmd) error {
return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}

View File

@ -22,3 +22,9 @@ func ExecScript(script string) (*exec.Cmd, error) {
}
return cmd, nil
}
func SetSysProcAttr(cmd *exec.Cmd) {}
func KillCommandSubtree(cmd *exec.Cmd) error {
return cmd.Process.Kill()
}

View File

@ -24,9 +24,11 @@ There are five different kinds of checks:
a script check is limited to 4KB. Output larger than this will be truncated.
By default, Script checks will be configured with a timeout equal to 30 seconds.
It is possible to configure a custom Script check timeout value by specifying the
`timeout` field in the check definition. In Consul 0.9.0 and later, the agent
must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
set to `true` in order to enable script checks.
`timeout` field in the check definition. On Windows, Consul will wait for any child processes
spawned by the script to finish once the timeout is reached (instead of killing them immediately,
as on other platforms). In Consul 0.9.0 and later, the agent must be configured with
[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
in order to enable script checks.
* HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
every 30 seconds) to the specified URL. The status of the service depends on