Kill check processes after the timeout is reached (#3567)

* Kill check processes after the timeout is reached

Kill the subprocess spawned by a script check once the timeout is reached. Previously Consul just marked the check critical and left the subprocess around.

Fixes #3565.

* Set err to non-nil when timeout occurs

* Fix check timeout test

* Kill entire process subtree on check timeout

* Add a docs note about windows subprocess termination
This commit is contained in:
Kyle Havlovitz 2017-10-11 11:57:39 -07:00 committed by James Phillips
parent aaf6b17a4d
commit 106b8b0b33
4 changed files with 47 additions and 15 deletions

View File

@ -104,13 +104,16 @@ func (c *CheckMonitor) check() {
// Create the command // Create the command
var cmd *exec.Cmd var cmd *exec.Cmd
var err error var err error
var cmdDisplay string
if len(c.ScriptArgs) > 0 { if len(c.ScriptArgs) > 0 {
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
cmd, err = ExecSubprocess(c.ScriptArgs) cmd, err = ExecSubprocess(c.ScriptArgs)
} else { } else {
cmdDisplay = c.Script
cmd, err = ExecScript(c.Script) cmd, err = ExecScript(c.Script)
} }
if err != nil { if err != nil {
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err) c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
@ -119,28 +122,39 @@ func (c *CheckMonitor) check() {
output, _ := circbuf.NewBuffer(CheckBufSize) output, _ := circbuf.NewBuffer(CheckBufSize)
cmd.Stdout = output cmd.Stdout = output
cmd.Stderr = output cmd.Stderr = output
SetSysProcAttr(cmd)
// Start the check // Start the check
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err) c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
// Wait for the check to complete // Wait for the check to complete
errCh := make(chan error, 2) waitCh := make(chan error, 1)
go func() { go func() {
errCh <- cmd.Wait() waitCh <- cmd.Wait()
}() }()
go func() {
if c.Timeout > 0 { timeout := 30 * time.Second
time.Sleep(c.Timeout) if c.Timeout > 0 {
timeout = c.Timeout
}
select {
case <-time.After(timeout):
if err := KillCommandSubtree(cmd); err != nil {
c.Logger.Printf("[WARN] Timed out running check '%s': error killing process: %v", cmdDisplay, err)
} else { } else {
time.Sleep(30 * time.Second) c.Logger.Printf("[WARN] Timed out (%s) running check '%s'", timeout.String(), cmdDisplay)
} }
errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
}() err = fmt.Errorf("Timed out running check '%s'", cmdDisplay)
err = <-errCh <-waitCh
case err = <-waitCh:
// The process returned before the timeout, proceed normally
}
// Get the output, add a message about truncation // Get the output, add a message about truncation
outputStr := string(output.Bytes()) outputStr := string(output.Bytes())
@ -150,7 +164,7 @@ func (c *CheckMonitor) check() {
} }
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s", c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
c.CheckID, c.Script, outputStr) c.CheckID, cmdDisplay, outputStr)
// Check if the check passed // Check if the check passed
if err == nil { if err == nil {

View File

@ -5,6 +5,7 @@ package agent
import ( import (
"os" "os"
"os/exec" "os/exec"
"syscall"
) )
// ExecScript returns a command to execute a script through a shell. // ExecScript returns a command to execute a script through a shell.
@ -15,3 +16,12 @@ func ExecScript(script string) (*exec.Cmd, error) {
} }
return exec.Command(shell, "-c", script), nil return exec.Command(shell, "-c", script), nil
} }
func SetSysProcAttr(cmd *exec.Cmd) {
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
}
// KillCommandSubtree kills the command process and any child processes
func KillCommandSubtree(cmd *exec.Cmd) error {
return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}

View File

@ -22,3 +22,9 @@ func ExecScript(script string) (*exec.Cmd, error) {
} }
return cmd, nil return cmd, nil
} }
func SetSysProcAttr(cmd *exec.Cmd) {}
func KillCommandSubtree(cmd *exec.Cmd) error {
return cmd.Process.Kill()
}

View File

@ -24,9 +24,11 @@ There are five different kinds of checks:
a script check is limited to 4KB. Output larger than this will be truncated. a script check is limited to 4KB. Output larger than this will be truncated.
By default, Script checks will be configured with a timeout equal to 30 seconds. By default, Script checks will be configured with a timeout equal to 30 seconds.
It is possible to configure a custom Script check timeout value by specifying the It is possible to configure a custom Script check timeout value by specifying the
`timeout` field in the check definition. In Consul 0.9.0 and later, the agent `timeout` field in the check definition. On Windows, Consul will wait for any child processes
must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) spawned by the script to finish once the timeout is reached (instead of killing them immediately,
set to `true` in order to enable script checks. as on other platforms). In Consul 0.9.0 and later, the agent must be configured with
[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
in order to enable script checks.
* HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g. * HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
every 30 seconds) to the specified URL. The status of the service depends on every 30 seconds) to the specified URL. The status of the service depends on