mirror of
https://github.com/status-im/consul.git
synced 2025-01-15 08:14:54 +00:00
b06ddc9187
* Rename agent/proxy package to reflect that it is limited to managed proxy processes Rationale: we have several other components of the agent that relate to Connect proxies for example the ProxyConfigManager component needed for Envoy work. Those things are pretty separate from the focus of this package so far which is only concerned with managing external proxy processes so it's nota good fit to put code for that in here, yet there is a naming clash if we have other packages related to proxy functionality that are not in the `agent/proxy` package. Happy to bikeshed the name. I started by calling it `managedproxy` but `managedproxy.Manager` is especially unpleasant. `proxyprocess` seems good in that it's more specific about purpose but less clearly connected with the concept of "managed proxies". The names in use are cleaner though e.g. `proxyprocess.Manager`. This rename was completed automatically using golang.org/x/tools/cmd/gomvpkg. Depends on #4541 * Fix missed windows tagged files
470 lines
13 KiB
Go
470 lines
13 KiB
Go
package proxyprocess
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"os/exec"
|
|
"reflect"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hashicorp/consul/lib/file"
|
|
"github.com/mitchellh/mapstructure"
|
|
)
|
|
|
|
// Constants related to restart timers with the daemon mode proxies. At some
|
|
// point we will probably want to expose these knobs to an end user, but
|
|
// reasonable defaults are chosen.
|
|
const (
|
|
DaemonRestartHealthy = 10 * time.Second // time before considering healthy
|
|
DaemonRestartBackoffMin = 3 // 3 attempts before backing off
|
|
DaemonRestartMaxWait = 1 * time.Minute // maximum backoff wait time
|
|
)
|
|
|
|
// Daemon is a long-running proxy process. It is expected to keep running
|
|
// and to use blocking queries to detect changes in configuration, certs,
|
|
// and more.
|
|
//
|
|
// Consul will ensure that if the daemon crashes, that it is restarted.
|
|
type Daemon struct {
|
|
// Command is the command to execute to start this daemon. This must
|
|
// be a Cmd that isn't yet started.
|
|
Command *exec.Cmd
|
|
|
|
// ProxyID is the ID of the proxy service. This is required for API
|
|
// requests (along with the token) and is passed via env var.
|
|
ProxyID string
|
|
|
|
// ProxyToken is the special local-only ACL token that allows a proxy
|
|
// to communicate to the Connect-specific endpoints.
|
|
ProxyToken string
|
|
|
|
// Logger is where logs will be sent around the management of this
|
|
// daemon. The actual logs for the daemon itself will be sent to
|
|
// a file.
|
|
Logger *log.Logger
|
|
|
|
// PidPath is the path where a pid file will be created storing the
|
|
// pid of the active process. If this is empty then a pid-file won't
|
|
// be created. Under erroneous conditions, the pid file may not be
|
|
// created but the error will be logged to the Logger.
|
|
PidPath string
|
|
|
|
// For tests, they can set this to change the default duration to wait
|
|
// for a graceful quit.
|
|
gracefulWait time.Duration
|
|
|
|
// process is the started process
|
|
lock sync.Mutex
|
|
stopped bool
|
|
stopCh chan struct{}
|
|
exitedCh chan struct{}
|
|
process *os.Process
|
|
}
|
|
|
|
// Start starts the daemon and keeps it running.
|
|
//
|
|
// This function returns after the process is successfully started.
|
|
func (p *Daemon) Start() error {
|
|
p.lock.Lock()
|
|
defer p.lock.Unlock()
|
|
|
|
// A stopped proxy cannot be restarted
|
|
if p.stopped {
|
|
return fmt.Errorf("stopped")
|
|
}
|
|
|
|
// If we're already running, that is okay
|
|
if p.process != nil {
|
|
return nil
|
|
}
|
|
|
|
// Setup our stop channel
|
|
stopCh := make(chan struct{})
|
|
exitedCh := make(chan struct{})
|
|
p.stopCh = stopCh
|
|
p.exitedCh = exitedCh
|
|
|
|
// Start the loop.
|
|
go p.keepAlive(stopCh, exitedCh)
|
|
|
|
return nil
|
|
}
|
|
|
|
// keepAlive starts and keeps the configured process alive until it
|
|
// is stopped via Stop.
|
|
func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) {
|
|
defer close(exitedCh)
|
|
|
|
p.lock.Lock()
|
|
process := p.process
|
|
p.lock.Unlock()
|
|
|
|
// attemptsDeadline is the time at which we consider the daemon to have
|
|
// been alive long enough that we can reset the attempt counter.
|
|
//
|
|
// attempts keeps track of the number of restart attempts we've had and
|
|
// is used to calculate the wait time using an exponential backoff.
|
|
var attemptsDeadline time.Time
|
|
var attempts uint32
|
|
|
|
// Assume the process is adopted, we reset this when we start a new process
|
|
// ourselves below and use it to decide on a strategy for waiting.
|
|
adopted := true
|
|
|
|
for {
|
|
if process == nil {
|
|
// If we're passed the attempt deadline then reset the attempts
|
|
if !attemptsDeadline.IsZero() && time.Now().After(attemptsDeadline) {
|
|
attempts = 0
|
|
}
|
|
// Set ourselves a deadline - we have to make it at least this long before
|
|
// we come around the loop to consider it to have been a "successful"
|
|
// daemon startup and rest the counter above. Note that if the daemon
|
|
// fails before this, we reset the deadline to zero below so that backoff
|
|
// sleeps in the loop don't count as "success" time.
|
|
attemptsDeadline = time.Now().Add(DaemonRestartHealthy)
|
|
attempts++
|
|
|
|
// Calculate the exponential backoff and wait if we have to
|
|
if attempts > DaemonRestartBackoffMin {
|
|
exponent := (attempts - DaemonRestartBackoffMin)
|
|
if exponent > 31 {
|
|
exponent = 31
|
|
}
|
|
waitTime := (1 << exponent) * time.Second
|
|
if waitTime > DaemonRestartMaxWait {
|
|
waitTime = DaemonRestartMaxWait
|
|
}
|
|
|
|
if waitTime > 0 {
|
|
// If we are waiting, reset the success deadline so we don't
|
|
// accidentally interpret backoff sleep as successful runtime.
|
|
attemptsDeadline = time.Time{}
|
|
|
|
p.Logger.Printf(
|
|
"[WARN] agent/proxy: waiting %s before restarting daemon",
|
|
waitTime)
|
|
|
|
timer := time.NewTimer(waitTime)
|
|
select {
|
|
case <-timer.C:
|
|
// Timer is up, good!
|
|
|
|
case <-stopCh:
|
|
// During our backoff wait, we've been signalled to
|
|
// quit, so just quit.
|
|
timer.Stop()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
p.lock.Lock()
|
|
|
|
// If we gracefully stopped then don't restart.
|
|
if p.stopped {
|
|
p.lock.Unlock()
|
|
return
|
|
}
|
|
|
|
// Process isn't started currently. We're restarting. Start it
|
|
// and save the process if we have it.
|
|
var err error
|
|
process, err = p.start()
|
|
if err == nil {
|
|
p.process = process
|
|
adopted = false
|
|
}
|
|
p.lock.Unlock()
|
|
|
|
if err != nil {
|
|
p.Logger.Printf("[ERR] agent/proxy: error restarting daemon: %s", err)
|
|
continue
|
|
}
|
|
|
|
}
|
|
|
|
var ps *os.ProcessState
|
|
var err error
|
|
|
|
if adopted {
|
|
// assign to err outside scope
|
|
_, err = findProcess(process.Pid)
|
|
if err == nil {
|
|
// Process appears to be running still, wait a bit before we poll again.
|
|
// We want a busy loop, but not too busy. 1 second between detecting a
|
|
// process death seems reasonable.
|
|
//
|
|
// SUBTELTY: we must NOT select on stopCh here since the Stop function
|
|
// assumes that as soon as this method returns and closes exitedCh, that
|
|
// the process is no longer running. If we are polling then we don't
|
|
// know that is true until we've polled again so we have to keep polling
|
|
// until the process goes away even if we know the Daemon is stopping.
|
|
time.Sleep(1 * time.Second)
|
|
|
|
// Restart the loop, process is still set so we effectively jump back to
|
|
// the findProcess call above.
|
|
continue
|
|
}
|
|
} else {
|
|
// Wait for child to exit
|
|
ps, err = process.Wait()
|
|
}
|
|
|
|
// Process exited somehow.
|
|
process = nil
|
|
if err != nil {
|
|
p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err)
|
|
} else if ps != nil && !ps.Exited() {
|
|
p.Logger.Printf("[INFO] agent/proxy: daemon left running")
|
|
} else if status, ok := exitStatus(ps); ok {
|
|
p.Logger.Printf("[INFO] agent/proxy: daemon exited with exit code: %d", status)
|
|
}
|
|
}
|
|
}
|
|
|
|
// start starts and returns the process. This will create a copy of the
|
|
// configured *exec.Command with the modifications documented on Daemon
|
|
// such as setting the proxy token environmental variable.
|
|
func (p *Daemon) start() (*os.Process, error) {
|
|
cmd := *p.Command
|
|
|
|
// Add the proxy token to the environment. We first copy the env because it is
|
|
// a slice and therefore the "copy" above will only copy the slice reference.
|
|
// We allocate an exactly sized slice.
|
|
//
|
|
// Note that anything we add to the Env here is NOT persisted in the snapshot
|
|
// which only looks at p.Command.Env so it needs to be reconstructible exactly
|
|
// from data in the snapshot otherwise.
|
|
cmd.Env = make([]string, len(p.Command.Env), len(p.Command.Env)+2)
|
|
copy(cmd.Env, p.Command.Env)
|
|
cmd.Env = append(cmd.Env,
|
|
fmt.Sprintf("%s=%s", EnvProxyID, p.ProxyID),
|
|
fmt.Sprintf("%s=%s", EnvProxyToken, p.ProxyToken))
|
|
|
|
// Update the Daemon env
|
|
|
|
// Args must always contain a 0 entry which is usually the executed binary.
|
|
// To be safe and a bit more robust we default this, but only to prevent
|
|
// a panic below.
|
|
if len(cmd.Args) == 0 {
|
|
cmd.Args = []string{cmd.Path}
|
|
}
|
|
|
|
// Perform system-specific setup. In particular, Unix-like systems
|
|
// shuld set sid so that killing the agent doesn't kill the daemon.
|
|
configureDaemon(&cmd)
|
|
|
|
// Start it
|
|
p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:])
|
|
if err := cmd.Start(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Write the pid file. This might error and that's okay.
|
|
if p.PidPath != "" {
|
|
pid := strconv.FormatInt(int64(cmd.Process.Pid), 10)
|
|
if err := file.WriteAtomic(p.PidPath, []byte(pid)); err != nil {
|
|
p.Logger.Printf(
|
|
"[DEBUG] agent/proxy: error writing pid file %q: %s",
|
|
p.PidPath, err)
|
|
}
|
|
}
|
|
|
|
return cmd.Process, nil
|
|
}
|
|
|
|
// Stop stops the daemon.
|
|
//
|
|
// This will attempt a graceful stop (SIGINT) before force killing the
|
|
// process (SIGKILL). In either case, the process won't be automatically
|
|
// restarted unless Start is called again.
|
|
//
|
|
// This is safe to call multiple times. If the daemon is already stopped,
|
|
// then this returns no error.
|
|
func (p *Daemon) Stop() error {
|
|
p.lock.Lock()
|
|
|
|
// If we're already stopped or never started, then no problem.
|
|
if p.stopped || p.process == nil {
|
|
// In the case we never even started, calling Stop makes it so
|
|
// that we can't ever start in the future, either, so mark this.
|
|
p.stopped = true
|
|
p.lock.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// Note that we've stopped
|
|
p.stopped = true
|
|
close(p.stopCh)
|
|
process := p.process
|
|
p.lock.Unlock()
|
|
|
|
gracefulWait := p.gracefulWait
|
|
if gracefulWait == 0 {
|
|
gracefulWait = 5 * time.Second
|
|
}
|
|
|
|
// Defer removing the pid file. Even under error conditions we
|
|
// delete the pid file since Stop means that the manager is no
|
|
// longer managing this proxy and therefore nothing else will ever
|
|
// clean it up.
|
|
if p.PidPath != "" {
|
|
defer func() {
|
|
if err := os.Remove(p.PidPath); err != nil && !os.IsNotExist(err) {
|
|
p.Logger.Printf(
|
|
"[DEBUG] agent/proxy: error removing pid file %q: %s",
|
|
p.PidPath, err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
// First, try a graceful stop
|
|
err := process.Signal(os.Interrupt)
|
|
if err == nil {
|
|
select {
|
|
case <-p.exitedCh:
|
|
// Success!
|
|
return nil
|
|
|
|
case <-time.After(gracefulWait):
|
|
// Interrupt didn't work
|
|
p.Logger.Printf("[DEBUG] agent/proxy: graceful wait of %s passed, "+
|
|
"killing", gracefulWait)
|
|
}
|
|
} else if isProcessAlreadyFinishedErr(err) {
|
|
// This can happen due to races between signals and polling.
|
|
return nil
|
|
} else {
|
|
p.Logger.Printf("[DEBUG] agent/proxy: sigint failed, killing: %s", err)
|
|
}
|
|
|
|
// Graceful didn't work (e.g. on windows where SIGINT isn't implemented),
|
|
// forcibly kill
|
|
err = process.Kill()
|
|
if err != nil && isProcessAlreadyFinishedErr(err) {
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
// Close implements Proxy by stopping the run loop but not killing the process.
|
|
// One Close is called, Stop has no effect.
|
|
func (p *Daemon) Close() error {
|
|
p.lock.Lock()
|
|
defer p.lock.Unlock()
|
|
|
|
// If we're already stopped or never started, then no problem.
|
|
if p.stopped || p.process == nil {
|
|
p.stopped = true
|
|
return nil
|
|
}
|
|
|
|
// Note that we've stopped
|
|
p.stopped = true
|
|
close(p.stopCh)
|
|
|
|
return nil
|
|
}
|
|
|
|
// Equal implements Proxy to check for equality.
|
|
func (p *Daemon) Equal(raw Proxy) bool {
|
|
p2, ok := raw.(*Daemon)
|
|
if !ok {
|
|
return false
|
|
}
|
|
|
|
// We compare equality on a subset of the command configuration
|
|
return p.ProxyToken == p2.ProxyToken &&
|
|
p.ProxyID == p2.ProxyID &&
|
|
p.Command.Path == p2.Command.Path &&
|
|
p.Command.Dir == p2.Command.Dir &&
|
|
reflect.DeepEqual(p.Command.Args, p2.Command.Args) &&
|
|
reflect.DeepEqual(p.Command.Env, p2.Command.Env)
|
|
}
|
|
|
|
// MarshalSnapshot implements Proxy
|
|
func (p *Daemon) MarshalSnapshot() map[string]interface{} {
|
|
p.lock.Lock()
|
|
defer p.lock.Unlock()
|
|
|
|
// If we're stopped or have no process, then nothing to snapshot.
|
|
if p.stopped || p.process == nil {
|
|
return nil
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"Pid": p.process.Pid,
|
|
"CommandPath": p.Command.Path,
|
|
"CommandArgs": p.Command.Args,
|
|
"CommandDir": p.Command.Dir,
|
|
"CommandEnv": p.Command.Env,
|
|
"ProxyToken": p.ProxyToken,
|
|
"ProxyID": p.ProxyID,
|
|
}
|
|
}
|
|
|
|
// UnmarshalSnapshot implements Proxy
|
|
func (p *Daemon) UnmarshalSnapshot(m map[string]interface{}) error {
|
|
var s daemonSnapshot
|
|
if err := mapstructure.Decode(m, &s); err != nil {
|
|
return err
|
|
}
|
|
|
|
p.lock.Lock()
|
|
defer p.lock.Unlock()
|
|
|
|
// Set the basic fields
|
|
p.ProxyToken = s.ProxyToken
|
|
p.ProxyID = s.ProxyID
|
|
p.Command = &exec.Cmd{
|
|
Path: s.CommandPath,
|
|
Args: s.CommandArgs,
|
|
Dir: s.CommandDir,
|
|
Env: s.CommandEnv,
|
|
}
|
|
|
|
// FindProcess on many systems returns no error even if the process
|
|
// is now dead. We perform an extra check that the process is alive.
|
|
proc, err := findProcess(s.Pid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// "Start it"
|
|
stopCh := make(chan struct{})
|
|
exitedCh := make(chan struct{})
|
|
p.stopCh = stopCh
|
|
p.exitedCh = exitedCh
|
|
p.process = proc
|
|
go p.keepAlive(stopCh, exitedCh)
|
|
|
|
return nil
|
|
}
|
|
|
|
// daemonSnapshot is the structure of the marshalled data for snapshotting.
|
|
//
|
|
// Note we don't have to store the ProxyId because this is stored directly
|
|
// within the manager snapshot and is restored automatically.
|
|
type daemonSnapshot struct {
|
|
// Pid of the process. This is the only value actually required to
|
|
// regain management control. The remainder values are for Equal.
|
|
Pid int
|
|
|
|
// Command information
|
|
CommandPath string
|
|
CommandArgs []string
|
|
CommandDir string
|
|
CommandEnv []string
|
|
|
|
// NOTE(mitchellh): longer term there are discussions/plans to only
|
|
// store the hash of the token but for now we need the full token in
|
|
// case the process dies and has to be restarted.
|
|
ProxyToken string
|
|
|
|
ProxyID string
|
|
}
|