diff --git a/agent/proxy/daemon.go b/agent/proxy/daemon.go index 15950c196c..e9bf318a28 100644 --- a/agent/proxy/daemon.go +++ b/agent/proxy/daemon.go @@ -165,7 +165,25 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) { } + // Wait for the process to exit. Note that if we restored this proxy + // then Wait will always fail because we likely aren't the parent + // process. Therefore, we do an extra sanity check after to use other + // syscalls to verify the process is truly dead. ps, err := process.Wait() + if _, err := findProcess(process.Pid); err == nil { + select { + case <-time.After(1 * time.Second): + // We want a busy loop, but not too busy. 1 second between + // detecting a process death seems reasonable. + + case <-stopCh: + // If we receive a stop request we want to exit immediately. + return + } + + continue + } + process = nil if err != nil { p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err) @@ -336,15 +354,10 @@ func (p *Daemon) UnmarshalSnapshot(m map[string]interface{}) error { Env: s.CommandEnv, } - // For the pid, we want to find the process. - proc, err := os.FindProcess(s.Pid) - if err != nil { - return err - } - // FindProcess on many systems returns no error even if the process // is now dead. We perform an extra check that the process is alive. - if err := processAlive(proc); err != nil { + proc, err := findProcess(s.Pid) + if err != nil { return err } diff --git a/agent/proxy/process_unix.go b/agent/proxy/process_unix.go index 6db64c59c7..deb45d080d 100644 --- a/agent/proxy/process_unix.go +++ b/agent/proxy/process_unix.go @@ -3,21 +3,28 @@ package proxy import ( + "fmt" "os" "syscall" ) -// processAlive for non-Windows. Note that this very likely doesn't +// findProcess for non-Windows. Note that this very likely doesn't // work for all non-Windows platforms Go supports and we should expand // support as we experience it. -func processAlive(p *os.Process) error { +func findProcess(pid int) (*os.Process, error) { + // FindProcess never fails on unix-like systems. + p, err := os.FindProcess(pid) + if err != nil { + return nil, err + } + // On Unix-like systems, we can verify a process is alive by sending // a 0 signal. This will do nothing to the process but will still // return errors if the process is gone. - err := p.Signal(syscall.Signal(0)) + err = p.Signal(syscall.Signal(0)) if err == nil || err == syscall.EPERM { - return nil + return p, nil } - return err + return nil, fmt.Errorf("process is dead") } diff --git a/agent/proxy/process_windows.go b/agent/proxy/process_windows.go index 0268958e91..0a00d81ee0 100644 --- a/agent/proxy/process_windows.go +++ b/agent/proxy/process_windows.go @@ -3,17 +3,12 @@ package proxy import ( - "fmt" "os" ) -func processAlive(p *os.Process) error { +func findProcess(pid int) (*os.Process, error) { // On Windows, os.FindProcess will error if the process is not alive, // so we don't have to do any further checking. The nature of it being // non-nil means it seems to be healthy. - if p == nil { - return fmt.Errof("process no longer alive") - } - - return nil + return os.FindProcess(pid) } diff --git a/agent/proxy/snapshot.go b/agent/proxy/snapshot.go index 8f81a182a2..dbe03fd83d 100644 --- a/agent/proxy/snapshot.go +++ b/agent/proxy/snapshot.go @@ -151,8 +151,13 @@ func (m *Manager) Restore(path string) error { return err } + // Unmarshal the proxy. If there is an error we just continue on and + // ignore it. Errors restoring proxies should be exceptionally rare + // and only under scenarios where the proxy isn't running anymore or + // we won't have permission to access it. We log and continue. if err := p.UnmarshalSnapshot(sp.Config); err != nil { - return err + m.Logger.Printf("[WARN] agent/proxy: error restoring proxy %q: %s", id, err) + continue } proxies[id] = p