mirror of
https://github.com/status-im/consul.git
synced 2025-01-11 06:16:08 +00:00
Test for adopted process Stop race and fix
This commit is contained in:
parent
03131398c4
commit
b2ff583392
@ -8,6 +8,7 @@ import (
|
||||
"reflect"
|
||||
"strconv"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/consul/lib/file"
|
||||
@ -108,7 +109,11 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
|
||||
// attempts keeps track of the number of restart attempts we've had and
|
||||
// is used to calculate the wait time using an exponential backoff.
|
||||
var attemptsDeadline time.Time
|
||||
var attempts uint
|
||||
var attempts uint32
|
||||
|
||||
// Assume the process is adopted, we reset this when we start a new process
|
||||
// ourselves below and use it to decide on a strategy for waiting.
|
||||
adopted := true
|
||||
|
||||
for {
|
||||
if process == nil {
|
||||
@ -121,7 +126,11 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
|
||||
|
||||
// Calculate the exponential backoff and wait if we have to
|
||||
if attempts > DaemonRestartBackoffMin {
|
||||
waitTime := (1 << (attempts - DaemonRestartBackoffMin)) * time.Second
|
||||
exponent := (attempts - DaemonRestartBackoffMin)
|
||||
if exponent > 31 {
|
||||
exponent = 31
|
||||
}
|
||||
waitTime := (1 << exponent) * time.Second
|
||||
if waitTime > DaemonRestartMaxWait {
|
||||
waitTime = DaemonRestartMaxWait
|
||||
}
|
||||
@ -159,6 +168,7 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
|
||||
process, err = p.start()
|
||||
if err == nil {
|
||||
p.process = process
|
||||
adopted = false
|
||||
}
|
||||
p.lock.Unlock()
|
||||
|
||||
@ -169,25 +179,34 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
|
||||
|
||||
}
|
||||
|
||||
// Wait for the process to exit. Note that if we restored this proxy
|
||||
// then Wait will always fail because we likely aren't the parent
|
||||
// process. Therefore, we do an extra sanity check after to use other
|
||||
// syscalls to verify the process is truly dead.
|
||||
ps, err := process.Wait()
|
||||
if _, err := findProcess(process.Pid); err == nil {
|
||||
select {
|
||||
case <-time.After(1 * time.Second):
|
||||
// We want a busy loop, but not too busy. 1 second between
|
||||
// detecting a process death seems reasonable.
|
||||
var ps *os.ProcessState
|
||||
var err error
|
||||
|
||||
case <-stopCh:
|
||||
// If we receive a stop request we want to exit immediately.
|
||||
return
|
||||
if adopted {
|
||||
// assign to err outside scope
|
||||
_, err = findProcess(process.Pid)
|
||||
if err == nil {
|
||||
// Process appears to be running still, wait a bit before we poll again.
|
||||
// We want a busy loop, but not too busy. 1 second between detecting a
|
||||
// process death seems reasonable.
|
||||
//
|
||||
// SUBTELTY: we must NOT select on stopCh here since the Stop function
|
||||
// assumes that as soon as this method returns and closes exitedCh, that
|
||||
// the process is no longer running. If we are polling then we don't
|
||||
// know that is true until we've polled again so we have to keep polling
|
||||
// until the process goes away even if we know the Daemon is stopping.
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
// Restart the loop, process is still set so we effectively jump back to
|
||||
// the findProcess call above.
|
||||
continue
|
||||
}
|
||||
|
||||
continue
|
||||
} else {
|
||||
// Wait for child to exit
|
||||
ps, err = process.Wait()
|
||||
}
|
||||
|
||||
// Process exited somehow.
|
||||
process = nil
|
||||
if err != nil {
|
||||
p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err)
|
||||
@ -219,6 +238,10 @@ func (p *Daemon) start() (*os.Process, error) {
|
||||
cmd.Args = []string{cmd.Path}
|
||||
}
|
||||
|
||||
// Start it in a new sessions (and hence process group) so that killing agent
|
||||
// (even with Ctrl-C) won't kill proxy.
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
|
||||
|
||||
// Start it
|
||||
p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:])
|
||||
if err := cmd.Start(); err != nil {
|
||||
|
@ -5,6 +5,8 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -102,6 +104,150 @@ func TestDaemonRestart(t *testing.T) {
|
||||
waitFile()
|
||||
}
|
||||
|
||||
func TestDaemonLaunchesNewProcessGroup(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
require := require.New(t)
|
||||
td, closer := testTempDir(t)
|
||||
defer closer()
|
||||
|
||||
path := filepath.Join(td, "file")
|
||||
pidPath := filepath.Join(td, "child.pid")
|
||||
|
||||
// Start the parent process wrapping a start-stop test. The parent is acting
|
||||
// as our "agent". We need an extra indirection to be able to kill the "agent"
|
||||
// and still be running the test process.
|
||||
parentCmd := helperProcess("parent", pidPath, "start-stop", path)
|
||||
|
||||
// We MUST run this as a separate process group otherwise the Kill below will
|
||||
// kill this test process (and possibly your shell/editor that launched it!)
|
||||
parentCmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
|
||||
|
||||
require.NoError(parentCmd.Start())
|
||||
|
||||
// Wait for the pid file to exist so we know parent is running
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
_, err := os.Stat(pidPath)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.Fatalf("error: %s", err)
|
||||
})
|
||||
|
||||
// And wait for the actual file to be sure the child is running (it should be
|
||||
// since parent doesn't write PID until child starts but the child might not
|
||||
// have completed the write to disk yet which causes flakiness below).
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
_, err := os.Stat(path)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.Fatalf("error: %s", err)
|
||||
})
|
||||
|
||||
// Get the child PID
|
||||
bs, err := ioutil.ReadFile(pidPath)
|
||||
require.NoError(err)
|
||||
pid, err := strconv.Atoi(string(bs))
|
||||
require.NoError(err)
|
||||
proc, err := os.FindProcess(pid)
|
||||
require.NoError(err)
|
||||
|
||||
// Always cleanup child process after
|
||||
defer func() {
|
||||
if proc != nil {
|
||||
proc.Kill()
|
||||
}
|
||||
}()
|
||||
|
||||
// Now kill the parent's whole process group and wait for it
|
||||
pgid, err := syscall.Getpgid(parentCmd.Process.Pid)
|
||||
|
||||
require.NoError(err)
|
||||
// Yep the minus PGid is how you kill a whole process group in unix... no idea
|
||||
// how this works on windows. We TERM no KILL since we rely on the child
|
||||
// catching the signal and deleting it's file to detect correct behaviour.
|
||||
require.NoError(syscall.Kill(-pgid, syscall.SIGTERM))
|
||||
|
||||
_, err = parentCmd.Process.Wait()
|
||||
require.NoError(err)
|
||||
|
||||
// The child should still be running so file should still be there
|
||||
_, err = os.Stat(path)
|
||||
require.NoError(err, "child should still be running")
|
||||
|
||||
// TEST PART 2 - verify that adopting an existing process works and picks up
|
||||
// monitoring even though it's not a child. We can't do this accurately with
|
||||
// Restart test since even if we create a new `Daemon` object the test process
|
||||
// is still the parent. We need the indirection of the `parent` test helper to
|
||||
// actually verify "adoption" on restart works.
|
||||
|
||||
// Start a new parent that will "adopt" the existing child even though it will
|
||||
// not be an actual child process.
|
||||
fosterCmd := helperProcess("parent", pidPath, "start-stop", path)
|
||||
// Don't care about it being same process group this time as we will just kill
|
||||
// it normally.
|
||||
require.NoError(fosterCmd.Start())
|
||||
defer func() {
|
||||
// Clean up the daemon and wait for it to prevent it becoming a zombie.
|
||||
fosterCmd.Process.Kill()
|
||||
fosterCmd.Wait()
|
||||
}()
|
||||
|
||||
// The child should still be running so file should still be there
|
||||
_, err = os.Stat(path)
|
||||
require.NoError(err, "child should still be running")
|
||||
|
||||
{
|
||||
// Get the child PID - it shouldn't have changed and should be running
|
||||
bs2, err := ioutil.ReadFile(pidPath)
|
||||
require.NoError(err)
|
||||
pid2, err := strconv.Atoi(string(bs2))
|
||||
require.NoError(err)
|
||||
// Defer a cleanup (til end of test function)
|
||||
proc, err := os.FindProcess(pid)
|
||||
require.NoError(err)
|
||||
defer func() { proc.Kill() }()
|
||||
|
||||
require.Equal(pid, pid2)
|
||||
t.Logf("Child PID was %d and still %d", pid, pid2)
|
||||
}
|
||||
|
||||
// Now killing the child directly should still be restarted by the Daemon
|
||||
require.NoError(proc.Kill())
|
||||
proc = nil
|
||||
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
// Get the child PID - it should have changed
|
||||
bs, err := ioutil.ReadFile(pidPath)
|
||||
r.Check(err)
|
||||
|
||||
newPid, err := strconv.Atoi(string(bs))
|
||||
r.Check(err)
|
||||
if newPid == pid {
|
||||
r.Fatalf("Child PID file not changed, Daemon not restarting it")
|
||||
}
|
||||
t.Logf("Child PID was %d and is now %d", pid, newPid)
|
||||
})
|
||||
|
||||
// I had to run through this test in debugger a lot of times checking ps state
|
||||
// by hand at different points to convince myself it was doing the right
|
||||
// thing. It doesn't help that with verbose logs on it seems that the stdio
|
||||
// from the `parent` process can sometimes miss lines out due to timing. For
|
||||
// example the `[INFO] agent/proxy: daemon exited...` log from Daemon that
|
||||
// indicates that the child was detected to have failed and is restarting is
|
||||
// never output on my Mac at full speed. But if I run in debugger and have it
|
||||
// pause at the step after the child is killed above, then it shows. The
|
||||
// `[DEBUG] agent/proxy: starting proxy:` for the restart does always come
|
||||
// through though which is odd. I assume this is some odd quirk of timing
|
||||
// between processes and stdio or something but it makes debugging this stuff
|
||||
// even harder!
|
||||
|
||||
// Let defer clean up the child process(es)
|
||||
}
|
||||
|
||||
func TestDaemonStop_kill(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@ -132,7 +278,85 @@ func TestDaemonStop_kill(t *testing.T) {
|
||||
// Stop the process
|
||||
require.NoError(d.Stop())
|
||||
|
||||
// State the file so that we can get the mtime
|
||||
// Stat the file so that we can get the mtime
|
||||
fi, err := os.Stat(path)
|
||||
require.NoError(err)
|
||||
mtime := fi.ModTime()
|
||||
|
||||
// The mtime shouldn't change
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
fi, err = os.Stat(path)
|
||||
require.NoError(err)
|
||||
require.Equal(mtime, fi.ModTime())
|
||||
}
|
||||
|
||||
func TestDaemonStop_killAdopted(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
require := require.New(t)
|
||||
td, closer := testTempDir(t)
|
||||
defer closer()
|
||||
|
||||
path := filepath.Join(td, "file")
|
||||
|
||||
// In this test we want to ensure that gracefull/ungraceful stop works with
|
||||
// processes that were adopted by current process but not started by it. (i.e.
|
||||
// we have to poll them not use Wait).
|
||||
//
|
||||
// We could use `parent` indirection to get a child that is actually not
|
||||
// started by this process but that's a lot of hoops to jump through on top of
|
||||
// an already complex multi-process test case.
|
||||
//
|
||||
// For now we rely on an implementation detail of Daemon which is potentially
|
||||
// brittle but beats lots of extra complexity here. Currently, if
|
||||
// Daemon.process is non-nil, the keepAlive loop will explicitly assume it's
|
||||
// not a child and so will use polling to monitor it. If we ever change that
|
||||
// it might invalidate this test and we would either need more indirection
|
||||
// here, or an alternative explicit signal on Daemon like Daemon.forcePoll to
|
||||
// ensure we are exercising that code path.
|
||||
|
||||
// Start the "child" process
|
||||
childCmd := helperProcess("stop-kill", path)
|
||||
require.NoError(childCmd.Start())
|
||||
go func() { childCmd.Wait() }() // Prevent it becoming a zombie when killed
|
||||
defer func() { childCmd.Process.Kill() }()
|
||||
|
||||
// Create the Daemon
|
||||
d := &Daemon{
|
||||
Command: helperProcess("stop-kill", path),
|
||||
ProxyToken: "hello",
|
||||
Logger: testLogger,
|
||||
gracefulWait: 200 * time.Millisecond,
|
||||
// Can't just set process as it will bypass intializing stopCh etc.
|
||||
}
|
||||
// Adopt the pid from a fake state snapshot (this correctly initialises Daemon
|
||||
// for adoption)
|
||||
fakeSnap := map[string]interface{}{
|
||||
"Pid": childCmd.Process.Pid,
|
||||
"CommandPath": childCmd.Path,
|
||||
"CommandArgs": childCmd.Args,
|
||||
"CommandDir": childCmd.Dir,
|
||||
"CommandEnv": childCmd.Env,
|
||||
"ProxyToken": d.ProxyToken,
|
||||
}
|
||||
require.NoError(d.UnmarshalSnapshot(fakeSnap))
|
||||
require.NoError(d.Start())
|
||||
|
||||
// Wait for the file to exist (child was already running so this doesn't
|
||||
// gaurantee that Daemon is in "polling" state)
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
_, err := os.Stat(path)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.Fatalf("error: %s", err)
|
||||
})
|
||||
|
||||
// Stop the process
|
||||
require.NoError(d.Stop())
|
||||
|
||||
// Stat the file so that we can get the mtime
|
||||
fi, err := os.Stat(path)
|
||||
require.NoError(err)
|
||||
mtime := fi.ModTime()
|
||||
@ -404,6 +628,10 @@ func TestDaemonUnmarshalSnapshot(t *testing.T) {
|
||||
d2 := &Daemon{Logger: testLogger}
|
||||
require.NoError(d2.UnmarshalSnapshot(snap))
|
||||
|
||||
// Verify the daemon is still running
|
||||
_, err = os.Stat(path)
|
||||
require.NoError(err)
|
||||
|
||||
// Stop the process
|
||||
require.NoError(d2.Stop())
|
||||
|
||||
|
@ -7,6 +7,8 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@ -74,7 +76,7 @@ func TestHelperProcess(t *testing.T) {
|
||||
// and deletes it only when it is stopped.
|
||||
case "start-stop":
|
||||
ch := make(chan os.Signal, 1)
|
||||
signal.Notify(ch, os.Interrupt)
|
||||
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
|
||||
defer signal.Stop(ch)
|
||||
|
||||
path := args[0]
|
||||
@ -155,6 +157,64 @@ func TestHelperProcess(t *testing.T) {
|
||||
|
||||
<-make(chan struct{})
|
||||
|
||||
// Parent runs the given process in a Daemon and then sleeps until the test
|
||||
// code kills it. It exists to test that the Daemon-managed child process
|
||||
// survives it's parent exiting which we can't test directly without exiting
|
||||
// the test process so we need an extra level of indirection. The test code
|
||||
// using this must pass a file path as the first argument for the child
|
||||
// processes PID to be written and then must take care to clean up that PID
|
||||
// later or the child will be left running forever.
|
||||
//
|
||||
// If the PID file already exists, it will "adopt" the child rather than
|
||||
// launch a new one.
|
||||
case "parent":
|
||||
// We will write the PID for the child to the file in the first argument
|
||||
// then pass rest of args through to command.
|
||||
pidFile := args[0]
|
||||
|
||||
d := &Daemon{
|
||||
Command: helperProcess(args[1:]...),
|
||||
Logger: testLogger,
|
||||
PidPath: pidFile,
|
||||
}
|
||||
|
||||
_, err := os.Stat(pidFile)
|
||||
if err == nil {
|
||||
// pidFile exists, read it and "adopt" the process
|
||||
bs, err := ioutil.ReadFile(pidFile)
|
||||
if err != nil {
|
||||
log.Printf("Error: %s", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
pid, err := strconv.Atoi(string(bs))
|
||||
if err != nil {
|
||||
log.Printf("Error: %s", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
// Make a fake snapshot to load
|
||||
snapshot := map[string]interface{}{
|
||||
"Pid": pid,
|
||||
"CommandPath": d.Command.Path,
|
||||
"CommandArgs": d.Command.Args,
|
||||
"CommandDir": d.Command.Dir,
|
||||
"CommandEnv": d.Command.Env,
|
||||
"ProxyToken": "",
|
||||
}
|
||||
d.UnmarshalSnapshot(snapshot)
|
||||
}
|
||||
|
||||
if err := d.Start(); err != nil {
|
||||
log.Printf("Error: %s", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
log.Println("Started child")
|
||||
|
||||
// Wait "forever" (calling test chooses when we exit with signal/Wait to
|
||||
// minimise coordination).
|
||||
for {
|
||||
time.Sleep(time.Hour)
|
||||
}
|
||||
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "Unknown command: %q\n", cmd)
|
||||
os.Exit(2)
|
||||
|
Loading…
x
Reference in New Issue
Block a user