Test for adopted process Stop race and fix

This commit is contained in:
Paul Banks 2018-06-06 13:45:39 +01:00 committed by Jack Pearkes
parent 03131398c4
commit b2ff583392
3 changed files with 330 additions and 19 deletions

View File

@ -8,6 +8,7 @@ import (
"reflect" "reflect"
"strconv" "strconv"
"sync" "sync"
"syscall"
"time" "time"
"github.com/hashicorp/consul/lib/file" "github.com/hashicorp/consul/lib/file"
@ -108,7 +109,11 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
// attempts keeps track of the number of restart attempts we've had and // attempts keeps track of the number of restart attempts we've had and
// is used to calculate the wait time using an exponential backoff. // is used to calculate the wait time using an exponential backoff.
var attemptsDeadline time.Time var attemptsDeadline time.Time
var attempts uint var attempts uint32
// Assume the process is adopted, we reset this when we start a new process
// ourselves below and use it to decide on a strategy for waiting.
adopted := true
for { for {
if process == nil { if process == nil {
@ -121,7 +126,11 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
// Calculate the exponential backoff and wait if we have to // Calculate the exponential backoff and wait if we have to
if attempts > DaemonRestartBackoffMin { if attempts > DaemonRestartBackoffMin {
waitTime := (1 << (attempts - DaemonRestartBackoffMin)) * time.Second exponent := (attempts - DaemonRestartBackoffMin)
if exponent > 31 {
exponent = 31
}
waitTime := (1 << exponent) * time.Second
if waitTime > DaemonRestartMaxWait { if waitTime > DaemonRestartMaxWait {
waitTime = DaemonRestartMaxWait waitTime = DaemonRestartMaxWait
} }
@ -159,6 +168,7 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
process, err = p.start() process, err = p.start()
if err == nil { if err == nil {
p.process = process p.process = process
adopted = false
} }
p.lock.Unlock() p.lock.Unlock()
@ -169,25 +179,34 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
} }
// Wait for the process to exit. Note that if we restored this proxy var ps *os.ProcessState
// then Wait will always fail because we likely aren't the parent var err error
// process. Therefore, we do an extra sanity check after to use other
// syscalls to verify the process is truly dead.
ps, err := process.Wait()
if _, err := findProcess(process.Pid); err == nil {
select {
case <-time.After(1 * time.Second):
// We want a busy loop, but not too busy. 1 second between
// detecting a process death seems reasonable.
case <-stopCh: if adopted {
// If we receive a stop request we want to exit immediately. // assign to err outside scope
return _, err = findProcess(process.Pid)
if err == nil {
// Process appears to be running still, wait a bit before we poll again.
// We want a busy loop, but not too busy. 1 second between detecting a
// process death seems reasonable.
//
// SUBTELTY: we must NOT select on stopCh here since the Stop function
// assumes that as soon as this method returns and closes exitedCh, that
// the process is no longer running. If we are polling then we don't
// know that is true until we've polled again so we have to keep polling
// until the process goes away even if we know the Daemon is stopping.
time.Sleep(1 * time.Second)
// Restart the loop, process is still set so we effectively jump back to
// the findProcess call above.
continue
} }
} else {
continue // Wait for child to exit
ps, err = process.Wait()
} }
// Process exited somehow.
process = nil process = nil
if err != nil { if err != nil {
p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err) p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err)
@ -219,6 +238,10 @@ func (p *Daemon) start() (*os.Process, error) {
cmd.Args = []string{cmd.Path} cmd.Args = []string{cmd.Path}
} }
// Start it in a new sessions (and hence process group) so that killing agent
// (even with Ctrl-C) won't kill proxy.
cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
// Start it // Start it
p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:]) p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:])
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {

View File

@ -5,6 +5,8 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"strconv"
"syscall"
"testing" "testing"
"time" "time"
@ -102,6 +104,150 @@ func TestDaemonRestart(t *testing.T) {
waitFile() waitFile()
} }
func TestDaemonLaunchesNewProcessGroup(t *testing.T) {
t.Parallel()
require := require.New(t)
td, closer := testTempDir(t)
defer closer()
path := filepath.Join(td, "file")
pidPath := filepath.Join(td, "child.pid")
// Start the parent process wrapping a start-stop test. The parent is acting
// as our "agent". We need an extra indirection to be able to kill the "agent"
// and still be running the test process.
parentCmd := helperProcess("parent", pidPath, "start-stop", path)
// We MUST run this as a separate process group otherwise the Kill below will
// kill this test process (and possibly your shell/editor that launched it!)
parentCmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
require.NoError(parentCmd.Start())
// Wait for the pid file to exist so we know parent is running
retry.Run(t, func(r *retry.R) {
_, err := os.Stat(pidPath)
if err == nil {
return
}
r.Fatalf("error: %s", err)
})
// And wait for the actual file to be sure the child is running (it should be
// since parent doesn't write PID until child starts but the child might not
// have completed the write to disk yet which causes flakiness below).
retry.Run(t, func(r *retry.R) {
_, err := os.Stat(path)
if err == nil {
return
}
r.Fatalf("error: %s", err)
})
// Get the child PID
bs, err := ioutil.ReadFile(pidPath)
require.NoError(err)
pid, err := strconv.Atoi(string(bs))
require.NoError(err)
proc, err := os.FindProcess(pid)
require.NoError(err)
// Always cleanup child process after
defer func() {
if proc != nil {
proc.Kill()
}
}()
// Now kill the parent's whole process group and wait for it
pgid, err := syscall.Getpgid(parentCmd.Process.Pid)
require.NoError(err)
// Yep the minus PGid is how you kill a whole process group in unix... no idea
// how this works on windows. We TERM no KILL since we rely on the child
// catching the signal and deleting it's file to detect correct behaviour.
require.NoError(syscall.Kill(-pgid, syscall.SIGTERM))
_, err = parentCmd.Process.Wait()
require.NoError(err)
// The child should still be running so file should still be there
_, err = os.Stat(path)
require.NoError(err, "child should still be running")
// TEST PART 2 - verify that adopting an existing process works and picks up
// monitoring even though it's not a child. We can't do this accurately with
// Restart test since even if we create a new `Daemon` object the test process
// is still the parent. We need the indirection of the `parent` test helper to
// actually verify "adoption" on restart works.
// Start a new parent that will "adopt" the existing child even though it will
// not be an actual child process.
fosterCmd := helperProcess("parent", pidPath, "start-stop", path)
// Don't care about it being same process group this time as we will just kill
// it normally.
require.NoError(fosterCmd.Start())
defer func() {
// Clean up the daemon and wait for it to prevent it becoming a zombie.
fosterCmd.Process.Kill()
fosterCmd.Wait()
}()
// The child should still be running so file should still be there
_, err = os.Stat(path)
require.NoError(err, "child should still be running")
{
// Get the child PID - it shouldn't have changed and should be running
bs2, err := ioutil.ReadFile(pidPath)
require.NoError(err)
pid2, err := strconv.Atoi(string(bs2))
require.NoError(err)
// Defer a cleanup (til end of test function)
proc, err := os.FindProcess(pid)
require.NoError(err)
defer func() { proc.Kill() }()
require.Equal(pid, pid2)
t.Logf("Child PID was %d and still %d", pid, pid2)
}
// Now killing the child directly should still be restarted by the Daemon
require.NoError(proc.Kill())
proc = nil
retry.Run(t, func(r *retry.R) {
// Get the child PID - it should have changed
bs, err := ioutil.ReadFile(pidPath)
r.Check(err)
newPid, err := strconv.Atoi(string(bs))
r.Check(err)
if newPid == pid {
r.Fatalf("Child PID file not changed, Daemon not restarting it")
}
t.Logf("Child PID was %d and is now %d", pid, newPid)
})
// I had to run through this test in debugger a lot of times checking ps state
// by hand at different points to convince myself it was doing the right
// thing. It doesn't help that with verbose logs on it seems that the stdio
// from the `parent` process can sometimes miss lines out due to timing. For
// example the `[INFO] agent/proxy: daemon exited...` log from Daemon that
// indicates that the child was detected to have failed and is restarting is
// never output on my Mac at full speed. But if I run in debugger and have it
// pause at the step after the child is killed above, then it shows. The
// `[DEBUG] agent/proxy: starting proxy:` for the restart does always come
// through though which is odd. I assume this is some odd quirk of timing
// between processes and stdio or something but it makes debugging this stuff
// even harder!
// Let defer clean up the child process(es)
}
func TestDaemonStop_kill(t *testing.T) { func TestDaemonStop_kill(t *testing.T) {
t.Parallel() t.Parallel()
@ -132,7 +278,85 @@ func TestDaemonStop_kill(t *testing.T) {
// Stop the process // Stop the process
require.NoError(d.Stop()) require.NoError(d.Stop())
// State the file so that we can get the mtime // Stat the file so that we can get the mtime
fi, err := os.Stat(path)
require.NoError(err)
mtime := fi.ModTime()
// The mtime shouldn't change
time.Sleep(100 * time.Millisecond)
fi, err = os.Stat(path)
require.NoError(err)
require.Equal(mtime, fi.ModTime())
}
func TestDaemonStop_killAdopted(t *testing.T) {
t.Parallel()
require := require.New(t)
td, closer := testTempDir(t)
defer closer()
path := filepath.Join(td, "file")
// In this test we want to ensure that gracefull/ungraceful stop works with
// processes that were adopted by current process but not started by it. (i.e.
// we have to poll them not use Wait).
//
// We could use `parent` indirection to get a child that is actually not
// started by this process but that's a lot of hoops to jump through on top of
// an already complex multi-process test case.
//
// For now we rely on an implementation detail of Daemon which is potentially
// brittle but beats lots of extra complexity here. Currently, if
// Daemon.process is non-nil, the keepAlive loop will explicitly assume it's
// not a child and so will use polling to monitor it. If we ever change that
// it might invalidate this test and we would either need more indirection
// here, or an alternative explicit signal on Daemon like Daemon.forcePoll to
// ensure we are exercising that code path.
// Start the "child" process
childCmd := helperProcess("stop-kill", path)
require.NoError(childCmd.Start())
go func() { childCmd.Wait() }() // Prevent it becoming a zombie when killed
defer func() { childCmd.Process.Kill() }()
// Create the Daemon
d := &Daemon{
Command: helperProcess("stop-kill", path),
ProxyToken: "hello",
Logger: testLogger,
gracefulWait: 200 * time.Millisecond,
// Can't just set process as it will bypass intializing stopCh etc.
}
// Adopt the pid from a fake state snapshot (this correctly initialises Daemon
// for adoption)
fakeSnap := map[string]interface{}{
"Pid": childCmd.Process.Pid,
"CommandPath": childCmd.Path,
"CommandArgs": childCmd.Args,
"CommandDir": childCmd.Dir,
"CommandEnv": childCmd.Env,
"ProxyToken": d.ProxyToken,
}
require.NoError(d.UnmarshalSnapshot(fakeSnap))
require.NoError(d.Start())
// Wait for the file to exist (child was already running so this doesn't
// gaurantee that Daemon is in "polling" state)
retry.Run(t, func(r *retry.R) {
_, err := os.Stat(path)
if err == nil {
return
}
r.Fatalf("error: %s", err)
})
// Stop the process
require.NoError(d.Stop())
// Stat the file so that we can get the mtime
fi, err := os.Stat(path) fi, err := os.Stat(path)
require.NoError(err) require.NoError(err)
mtime := fi.ModTime() mtime := fi.ModTime()
@ -404,6 +628,10 @@ func TestDaemonUnmarshalSnapshot(t *testing.T) {
d2 := &Daemon{Logger: testLogger} d2 := &Daemon{Logger: testLogger}
require.NoError(d2.UnmarshalSnapshot(snap)) require.NoError(d2.UnmarshalSnapshot(snap))
// Verify the daemon is still running
_, err = os.Stat(path)
require.NoError(err)
// Stop the process // Stop the process
require.NoError(d2.Stop()) require.NoError(d2.Stop())

View File

@ -7,6 +7,8 @@ import (
"os" "os"
"os/exec" "os/exec"
"os/signal" "os/signal"
"strconv"
"syscall"
"testing" "testing"
"time" "time"
) )
@ -74,7 +76,7 @@ func TestHelperProcess(t *testing.T) {
// and deletes it only when it is stopped. // and deletes it only when it is stopped.
case "start-stop": case "start-stop":
ch := make(chan os.Signal, 1) ch := make(chan os.Signal, 1)
signal.Notify(ch, os.Interrupt) signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
defer signal.Stop(ch) defer signal.Stop(ch)
path := args[0] path := args[0]
@ -155,6 +157,64 @@ func TestHelperProcess(t *testing.T) {
<-make(chan struct{}) <-make(chan struct{})
// Parent runs the given process in a Daemon and then sleeps until the test
// code kills it. It exists to test that the Daemon-managed child process
// survives it's parent exiting which we can't test directly without exiting
// the test process so we need an extra level of indirection. The test code
// using this must pass a file path as the first argument for the child
// processes PID to be written and then must take care to clean up that PID
// later or the child will be left running forever.
//
// If the PID file already exists, it will "adopt" the child rather than
// launch a new one.
case "parent":
// We will write the PID for the child to the file in the first argument
// then pass rest of args through to command.
pidFile := args[0]
d := &Daemon{
Command: helperProcess(args[1:]...),
Logger: testLogger,
PidPath: pidFile,
}
_, err := os.Stat(pidFile)
if err == nil {
// pidFile exists, read it and "adopt" the process
bs, err := ioutil.ReadFile(pidFile)
if err != nil {
log.Printf("Error: %s", err)
os.Exit(1)
}
pid, err := strconv.Atoi(string(bs))
if err != nil {
log.Printf("Error: %s", err)
os.Exit(1)
}
// Make a fake snapshot to load
snapshot := map[string]interface{}{
"Pid": pid,
"CommandPath": d.Command.Path,
"CommandArgs": d.Command.Args,
"CommandDir": d.Command.Dir,
"CommandEnv": d.Command.Env,
"ProxyToken": "",
}
d.UnmarshalSnapshot(snapshot)
}
if err := d.Start(); err != nil {
log.Printf("Error: %s", err)
os.Exit(1)
}
log.Println("Started child")
// Wait "forever" (calling test chooses when we exit with signal/Wait to
// minimise coordination).
for {
time.Sleep(time.Hour)
}
default: default:
fmt.Fprintf(os.Stderr, "Unknown command: %q\n", cmd) fmt.Fprintf(os.Stderr, "Unknown command: %q\n", cmd)
os.Exit(2) os.Exit(2)