Fix race condition between removing a service and adding a check for the same service, which was causing orphaned checks

This commit is contained in:
Preetha Appan 2017-07-18 14:09:19 -05:00
parent fe51640263
commit 9f048afe29
3 changed files with 80 additions and 6 deletions

View File

@ -14,6 +14,8 @@ import (
"testing"
"time"
"sync"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/consul/structs"
"github.com/hashicorp/consul/api"
@ -626,6 +628,71 @@ func TestAgent_RemoveServiceRemovesAllChecks(t *testing.T) {
}
}
func TestAgent_AddCheck_RemoveServiceRace(t *testing.T) {
t.Parallel()
cfg := TestConfig()
cfg.NodeName = "node1"
a := NewTestAgent(t.Name(), cfg)
defer a.Shutdown()
// NOTE - trying a few times reproduces the race condition where a check can be added while a service is being deleted.
// This test may return a false positive if Addcheck happens to execute before RemoveService
for i := 0; i < 50; i++ {
addServiceAndChecks(a, t)
//try removing the service and adding a check for that service in two different go routines
var wg sync.WaitGroup
wg.Add(2)
chk1 := &structs.CheckType{CheckID: "chk1", Name: "chk1", TTL: time.Minute}
hchk1 := &structs.HealthCheck{Node: "node1", CheckID: "chk1", Name: "chk1", Status: "critical", ServiceID: "redis", ServiceName: "redis"}
go func() {
a.AddCheck(hchk1, chk1, false, "")
wg.Done()
}()
go func() {
a.RemoveService("redis", false)
wg.Done()
}()
wg.Wait()
checks := a.state.Checks()
if len(checks) > 0 {
t.Fatalf("Expected checks map to be empty")
}
}
}
func addServiceAndChecks(a *TestAgent, t *testing.T) {
svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000}
chk1 := &structs.CheckType{CheckID: "chk1", Name: "chk1", TTL: time.Minute}
chk2 := &structs.CheckType{CheckID: "chk2", Name: "chk2", TTL: 2 * time.Minute}
hchk1 := &structs.HealthCheck{Node: "node1", CheckID: "chk1", Name: "chk1", Status: "critical", ServiceID: "redis", ServiceName: "redis"}
hchk2 := &structs.HealthCheck{Node: "node1", CheckID: "chk2", Name: "chk2", Status: "critical", ServiceID: "redis", ServiceName: "redis"}
// register service with chk1
if err := a.AddService(svc, []*structs.CheckType{chk1}, false, ""); err != nil {
t.Fatal("Failed to register service", err)
}
// verify chk1 exists
if a.state.Checks()["chk1"] == nil {
t.Fatal("Could not find health check chk1")
}
// update the service with chk2
if err := a.AddService(svc, []*structs.CheckType{chk2}, false, ""); err != nil {
t.Fatal("Failed to update service", err)
}
// check that both checks are there
if got, want := a.state.Checks()["chk1"], hchk1; !verify.Values(t, "", got, want) {
t.FailNow()
}
if got, want := a.state.Checks()["chk2"], hchk2; !verify.Values(t, "", got, want) {
t.FailNow()
}
}
func TestAgent_AddCheck(t *testing.T) {
t.Parallel()
cfg := TestConfig()

View File

@ -253,12 +253,17 @@ func (l *localState) AddCheck(check *structs.HealthCheck, token string) {
l.Lock()
defer l.Unlock()
l.checks[check.CheckID] = check
l.checkStatus[check.CheckID] = syncStatus{}
l.checkTokens[check.CheckID] = token
delete(l.checkCriticalTime, check.CheckID)
l.changeMade()
allow := true
if check.ServiceID != "" { //if there is a serviceID associated with the check, make sure it exists
_, allow = l.services[check.ServiceID]
}
if allow {
l.checks[check.CheckID] = check
l.checkStatus[check.CheckID] = syncStatus{}
l.checkTokens[check.CheckID] = token
delete(l.checkCriticalTime, check.CheckID)
l.changeMade()
}
}
// RemoveCheck is used to remove a health check from the local state.

View File

@ -1475,6 +1475,8 @@ func TestAgent_checkCriticalTime(t *testing.T) {
cfg := TestConfig()
l := NewLocalState(cfg, nil)
svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000}
l.AddService(svc, "")
// Add a passing check and make sure it's not critical.
checkID := types.CheckID("redis:1")
chk := &structs.HealthCheck{