command/lock: Add -child-exitcode, return 2 on child error (#3329)

* Exit 2 if -child-exit-code and the child returned with an error.
* There is no platform independent way to check the exact return code of
* the child, so on error always return 2.
* Closes #947
* Closes #1503
This commit is contained in:
James Phillips 2017-07-26 22:09:19 -07:00 committed by GitHub
parent 9f10566314
commit 08a8d9f2a7
3 changed files with 64 additions and 14 deletions

View File

@ -41,8 +41,7 @@ type LockCommand struct {
child *os.Process child *os.Process
childLock sync.Mutex childLock sync.Mutex
verbose bool
verbose bool
} }
func (c *LockCommand) Help() string { func (c *LockCommand) Help() string {
@ -75,14 +74,18 @@ func (c *LockCommand) Run(args []string) int {
} }
func (c *LockCommand) run(args []string, lu **LockUnlock) int { func (c *LockCommand) run(args []string, lu **LockUnlock) int {
var childDone chan struct{}
var limit int var limit int
var monitorRetry int var monitorRetry int
var name string var name string
var passStdin bool var passStdin bool
var propagateChildCode bool
var timeout time.Duration var timeout time.Duration
f := c.BaseCommand.NewFlagSet(c) f := c.BaseCommand.NewFlagSet(c)
f.BoolVar(&propagateChildCode, "child-exit-code", false,
"Exit 2 if the child process exited with an error if this is true, "+
"otherwise this doesn't propagate an error from the child. The "+
"default value is false.")
f.IntVar(&limit, "n", 1, f.IntVar(&limit, "n", 1,
"Optional limit on the number of concurrent lock holders. The underlying "+ "Optional limit on the number of concurrent lock holders. The underlying "+
"implementation switches from a lock to a semaphore when the value is "+ "implementation switches from a lock to a semaphore when the value is "+
@ -100,7 +103,7 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
"Pass stdin to the child process.") "Pass stdin to the child process.")
f.DurationVar(&timeout, "timeout", 0, f.DurationVar(&timeout, "timeout", 0,
"Maximum amount of time to wait to acquire the lock, specified as a "+ "Maximum amount of time to wait to acquire the lock, specified as a "+
"timestamp like \"1s\" or \"3h\". The default value is 0.") "duration like \"1s\" or \"3h\". The default value is 0.")
f.BoolVar(&c.verbose, "verbose", false, f.BoolVar(&c.verbose, "verbose", false,
"Enable verbose (debugging) output.") "Enable verbose (debugging) output.")
@ -185,6 +188,8 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
} }
// Check if we were shutdown but managed to still acquire the lock // Check if we were shutdown but managed to still acquire the lock
var childCode int
var childErr chan error
select { select {
case <-c.ShutdownCh: case <-c.ShutdownCh:
c.UI.Error("Shutdown triggered during lock acquisition") c.UI.Error("Shutdown triggered during lock acquisition")
@ -193,11 +198,9 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
} }
// Start the child process // Start the child process
childDone = make(chan struct{}) childErr = make(chan error, 1)
go func() { go func() {
if err := c.startChild(script, childDone, passStdin); err != nil { childErr <- c.startChild(script, passStdin)
c.UI.Error(fmt.Sprintf("%s", err))
}
}() }()
// Monitor for shutdown, child termination, or lock loss // Monitor for shutdown, child termination, or lock loss
@ -210,7 +213,10 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
if c.verbose { if c.verbose {
c.UI.Info("Lock lost, killing child") c.UI.Info("Lock lost, killing child")
} }
case <-childDone: case err := <-childErr:
if err != nil {
childCode = 2
}
if c.verbose { if c.verbose {
c.UI.Info("Child terminated, releasing lock") c.UI.Info("Child terminated, releasing lock")
} }
@ -220,8 +226,9 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
// Prevent starting a new child. The lock is never released // Prevent starting a new child. The lock is never released
// after this point. // after this point.
c.childLock.Lock() c.childLock.Lock()
// Kill any existing child // Kill any existing child
if err := c.killChild(childDone); err != nil { if err := c.killChild(childErr); err != nil {
c.UI.Error(fmt.Sprintf("%s", err)) c.UI.Error(fmt.Sprintf("%s", err))
} }
@ -243,6 +250,13 @@ RELEASE:
} else if c.verbose { } else if c.verbose {
c.UI.Info("Cleanup succeeded") c.UI.Info("Cleanup succeeded")
} }
// If we detected an error from the child process then we propagate
// that.
if propagateChildCode {
return childCode
}
return 0 return 0
} }
@ -321,8 +335,7 @@ func (c *LockCommand) setupSemaphore(client *api.Client, limit int, prefix, name
// startChild is a long running routine used to start and // startChild is a long running routine used to start and
// wait for the child process to exit. // wait for the child process to exit.
func (c *LockCommand) startChild(script string, doneCh chan struct{}, passStdin bool) error { func (c *LockCommand) startChild(script string, passStdin bool) error {
defer close(doneCh)
if c.verbose { if c.verbose {
c.UI.Info(fmt.Sprintf("Starting handler '%s'", script)) c.UI.Info(fmt.Sprintf("Starting handler '%s'", script))
} }
@ -373,7 +386,7 @@ func (c *LockCommand) startChild(script string, doneCh chan struct{}, passStdin
// termination. // termination.
// On Windows, the child is always hard terminated with a SIGKILL, even // On Windows, the child is always hard terminated with a SIGKILL, even
// on the first attempt. // on the first attempt.
func (c *LockCommand) killChild(childDone chan struct{}) error { func (c *LockCommand) killChild(childErr chan error) error {
// Get the child process // Get the child process
child := c.child child := c.child
@ -395,7 +408,7 @@ func (c *LockCommand) killChild(childDone chan struct{}) error {
// Wait for termination, or until a timeout // Wait for termination, or until a timeout
select { select {
case <-childDone: case <-childErr:
if c.verbose { if c.verbose {
c.UI.Info("Child terminated") c.UI.Info("Child terminated")
} }

View File

@ -257,3 +257,33 @@ func TestLockCommand_MonitorRetry_Semaphore_Arg(t *testing.T) {
t.Fatalf("bad: %#v", opts) t.Fatalf("bad: %#v", opts)
} }
} }
func TestLockCommand_ChildExitCode(t *testing.T) {
t.Parallel()
a := agent.NewTestAgent(t.Name(), nil)
defer a.Shutdown()
t.Run("clean exit", func(t *testing.T) {
_, c := testLockCommand(t)
args := []string{"-http-addr=" + a.HTTPAddr(), "-child-exit-code", "test/prefix", "exit 0"}
if got, want := c.Run(args), 0; got != want {
t.Fatalf("got %d want %d", got, want)
}
})
t.Run("error exit", func(t *testing.T) {
_, c := testLockCommand(t)
args := []string{"-http-addr=" + a.HTTPAddr(), "-child-exit-code", "test/prefix", "exit 1"}
if got, want := c.Run(args), 2; got != want {
t.Fatalf("got %d want %d", got, want)
}
})
t.Run("not propagated", func(t *testing.T) {
_, c := testLockCommand(t)
args := []string{"-http-addr=" + a.HTTPAddr(), "test/prefix", "exit 1"}
if got, want := c.Run(args), 0; got != want {
t.Fatalf("got %d want %d", got, want)
}
})
}

View File

@ -52,6 +52,10 @@ Windows has no POSIX compatible notion for `SIGTERM`.
#### Command Options #### Command Options
* `-child-exit-code` - Exit 2 if the child process exited with an error
if this is true, otherwise this doesn't propagate an error from the
child. The default value is false.
* `-monitor-retry` - Retry up to this number of times if Consul returns a 500 error * `-monitor-retry` - Retry up to this number of times if Consul returns a 500 error
while monitoring the lock. This allows riding out brief periods of unavailability while monitoring the lock. This allows riding out brief periods of unavailability
without causing leader elections, but increases the amount of time required without causing leader elections, but increases the amount of time required
@ -67,6 +71,9 @@ Windows has no POSIX compatible notion for `SIGTERM`.
* `-pass-stdin` - Pass stdin to child process. * `-pass-stdin` - Pass stdin to child process.
* `timeout` - Maximum amount of time to wait to acquire the lock, specified
as a duration like `1s` or `3h`. The default value is 0.
* `-try` - Attempt to acquire the lock up to the given timeout. The timeout is a * `-try` - Attempt to acquire the lock up to the given timeout. The timeout is a
positive decimal number, with unit suffix, such as "500ms". Valid time units positive decimal number, with unit suffix, such as "500ms". Valid time units
are "ns", "us" (or "µs"), "ms", "s", "m", "h". are "ns", "us" (or "µs"), "ms", "s", "m", "h".