Merge pull request #3296 from hashicorp/ensure_registration_race

Fix race condition between removing a service and adding a check for …
This commit is contained in:
preetapan 2017-07-18 18:36:47 -05:00 committed by GitHub
commit fb43953894
3 changed files with 31 additions and 12 deletions

View File

@ -1790,7 +1790,11 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType,
}
// Add to the local state for anti-entropy
a.state.AddCheck(check, token)
err := a.state.AddCheck(check, token)
if err != nil {
a.cancelCheckMonitors(check.CheckID)
return err
}
// Persist the check
if persist && !a.config.DevMode {
@ -1814,6 +1818,21 @@ func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
a.checkLock.Lock()
defer a.checkLock.Unlock()
a.cancelCheckMonitors(checkID)
if persist {
if err := a.purgeCheck(checkID); err != nil {
return err
}
if err := a.purgeCheckState(checkID); err != nil {
return err
}
}
a.logger.Printf("[DEBUG] agent: removed check %q", checkID)
return nil
}
func (a *Agent) cancelCheckMonitors(checkID types.CheckID) {
// Stop any monitors
delete(a.checkReapAfter, checkID)
if check, ok := a.checkMonitors[checkID]; ok {
@ -1836,16 +1855,6 @@ func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
check.Stop()
delete(a.checkDockers, checkID)
}
if persist {
if err := a.purgeCheck(checkID); err != nil {
return err
}
if err := a.purgeCheckState(checkID); err != nil {
return err
}
}
a.logger.Printf("[DEBUG] agent: removed check %q", checkID)
return nil
}
// updateTTLCheck is used to update the status of a TTL check via the Agent API.

View File

@ -247,18 +247,25 @@ func (l *localState) checkToken(checkID types.CheckID) string {
// AddCheck is used to add a health check to the local state.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
func (l *localState) AddCheck(check *structs.HealthCheck, token string) {
func (l *localState) AddCheck(check *structs.HealthCheck, token string) error {
// Set the node name
check.Node = l.config.NodeName
l.Lock()
defer l.Unlock()
// if there is a serviceID associated with the check, make sure it exists before adding it
// NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor
if check.ServiceID != "" && l.services[check.ServiceID] == nil {
return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
}
l.checks[check.CheckID] = check
l.checkStatus[check.CheckID] = syncStatus{}
l.checkTokens[check.CheckID] = token
delete(l.checkCriticalTime, check.CheckID)
l.changeMade()
return nil
}
// RemoveCheck is used to remove a health check from the local state.

View File

@ -1475,6 +1475,9 @@ func TestAgent_checkCriticalTime(t *testing.T) {
cfg := TestConfig()
l := NewLocalState(cfg, nil)
svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000}
l.AddService(svc, "")
// Add a passing check and make sure it's not critical.
checkID := types.CheckID("redis:1")
chk := &structs.HealthCheck{