From 29e18c7494545120cf67946781a0ee0dd76a1d6d Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:09 +0200 Subject: [PATCH 01/36] agent: decouple anti-entropy from local state The anti-entropy code manages background synchronizations of the local state on a regular basis or on demand when either the state has changed or a new consul server has been added. This patch moves the anti-entropy code into its own package and decouples it from the local state code since they are performing two different functions. To simplify code-review this revision does not make any optimizations, renames or refactorings. This will happen in subsequent commits. --- agent/ae/ae.go | 146 ++++++++++++++++++++++++++++++++++++++++ agent/ae/ae_test.go | 55 +++++++++++++++ agent/agent.go | 48 +++++++++++-- agent/agent_endpoint.go | 2 +- agent/local.go | 114 +++---------------------------- agent/local_test.go | 40 ++--------- agent/util.go | 24 ------- agent/util_test.go | 18 ----- 8 files changed, 257 insertions(+), 190 deletions(-) create mode 100644 agent/ae/ae.go create mode 100644 agent/ae/ae_test.go diff --git a/agent/ae/ae.go b/agent/ae/ae.go new file mode 100644 index 0000000000..d5f56c37a9 --- /dev/null +++ b/agent/ae/ae.go @@ -0,0 +1,146 @@ +// Package ae provides an anti-entropy mechanism for the local state. +package ae + +import ( + "log" + "math" + "sync/atomic" + "time" + + "github.com/hashicorp/consul/lib" +) + +const ( + // This scale factor means we will add a minute after we cross 128 nodes, + // another at 256, another at 512, etc. By 8192 nodes, we will scale up + // by a factor of 8. + // + // If you update this, you may need to adjust the tuning of + // CoordinateUpdatePeriod and CoordinateUpdateMaxBatchSize. + aeScaleThreshold = 128 + + syncStaggerIntv = 3 * time.Second + syncRetryIntv = 15 * time.Second +) + +// aeScale is used to scale the time interval at which anti-entropy updates take +// place. It is used to prevent saturation as the cluster size grows. +func aeScale(d time.Duration, n int) time.Duration { + // Don't scale until we cross the threshold + if n <= aeScaleThreshold { + return d + } + + mult := math.Ceil(math.Log2(float64(n))-math.Log2(aeScaleThreshold)) + 1.0 + return time.Duration(mult) * d +} + +type StateSyncer struct { + // paused is used to check if we are paused. Must be the first + // element due to a go bug. + // todo(fs): which bug? still relevant? + paused int32 + + // State contains the data that needs to be synchronized. + State interface { + UpdateSyncState() error + SyncChanges() error + } + + // Interval is the time between two sync runs. + Interval time.Duration + + // ClusterSize returns the number of members in the cluster. + // todo(fs): we use this for staggering but what about a random number? + ClusterSize func() int + + // ShutdownCh is closed when the application is shutting down. + ShutdownCh chan struct{} + + // ConsulCh contains data when a new consul server has been added to the cluster. + ConsulCh chan struct{} + + // TriggerCh contains data when a sync should run immediately. 
+ TriggerCh chan struct{} + + Logger *log.Logger +} + +// Pause is used to pause state synchronization, this can be +// used to make batch changes +func (ae *StateSyncer) Pause() { + atomic.AddInt32(&ae.paused, 1) +} + +// Resume is used to resume state synchronization +func (ae *StateSyncer) Resume() { + paused := atomic.AddInt32(&ae.paused, -1) + if paused < 0 { + panic("unbalanced State.Resume() detected") + } + ae.changeMade() +} + +// Paused is used to check if we are paused +func (ae *StateSyncer) Paused() bool { + return atomic.LoadInt32(&ae.paused) > 0 +} + +func (ae *StateSyncer) changeMade() { + select { + case ae.TriggerCh <- struct{}{}: + default: + } +} + +// antiEntropy is a long running method used to perform anti-entropy +// between local and remote state. +func (ae *StateSyncer) Run() { +SYNC: + // Sync our state with the servers + for { + err := ae.State.UpdateSyncState() + if err == nil { + break + } + ae.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + select { + case <-ae.ConsulCh: + // Stagger the retry on leader election, avoid a thundering heard + select { + case <-time.After(lib.RandomStagger(aeScale(syncStaggerIntv, ae.ClusterSize()))): + case <-ae.ShutdownCh: + return + } + case <-time.After(syncRetryIntv + lib.RandomStagger(aeScale(syncRetryIntv, ae.ClusterSize()))): + case <-ae.ShutdownCh: + return + } + } + + // Force-trigger AE to pickup any changes + ae.changeMade() + + // Schedule the next full sync, with a random stagger + aeIntv := aeScale(ae.Interval, ae.ClusterSize()) + aeIntv = aeIntv + lib.RandomStagger(aeIntv) + aeTimer := time.After(aeIntv) + + // Wait for sync events + for { + select { + case <-aeTimer: + goto SYNC + case <-ae.TriggerCh: + // Skip the sync if we are paused + if ae.Paused() { + continue + } + if err := ae.State.SyncChanges(); err != nil { + ae.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) + } + case <-ae.ShutdownCh: + return + } + } +} diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go new file mode 100644 index 0000000000..5d91f2f9ba --- /dev/null +++ b/agent/ae/ae_test.go @@ -0,0 +1,55 @@ +package ae + +import ( + "testing" + "time" +) + +func TestAE_scale(t *testing.T) { + t.Parallel() + intv := time.Minute + if v := aeScale(intv, 100); v != intv { + t.Fatalf("Bad: %v", v) + } + if v := aeScale(intv, 200); v != 2*intv { + t.Fatalf("Bad: %v", v) + } + if v := aeScale(intv, 1000); v != 4*intv { + t.Fatalf("Bad: %v", v) + } + if v := aeScale(intv, 10000); v != 8*intv { + t.Fatalf("Bad: %v", v) + } +} + +func TestAE_nestedPauseResume(t *testing.T) { + t.Parallel() + l := new(StateSyncer) + if l.Paused() != false { + t.Fatal("syncer should be unPaused after init") + } + l.Pause() + if l.Paused() != true { + t.Fatal("syncer should be Paused after first call to Pause()") + } + l.Pause() + if l.Paused() != true { + t.Fatal("syncer should STILL be Paused after second call to Pause()") + } + l.Resume() + if l.Paused() != true { + t.Fatal("syncer should STILL be Paused after FIRST call to Resume()") + } + l.Resume() + if l.Paused() != false { + t.Fatal("syncer should NOT be Paused after SECOND call to Resume()") + } + + defer func() { + err := recover() + if err == nil { + t.Fatal("unbalanced Resume() should cause a panic()") + } + }() + l.Resume() +} diff --git a/agent/agent.go b/agent/agent.go index c2abbb76dc..1238359f3a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -20,6 +20,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/agent/ae" 
"github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/consul" "github.com/hashicorp/consul/agent/structs" @@ -109,6 +110,10 @@ type Agent struct { // services and checks. Used for anti-entropy. state *localState + // sync manages the synchronization of the local + // and the remote state. + sync *ae.StateSyncer + // checkReapAfter maps the check ID to a timeout after which we should // reap its associated service checkReapAfter map[types.CheckID]time.Duration @@ -241,8 +246,27 @@ func (a *Agent) Start() error { return fmt.Errorf("Failed to setup node ID: %v", err) } + // create a notif channel to trigger state sychronizations + // when a consul server was added to the cluster. + consulCh := make(chan struct{}, 1) + + // create a notif channel to trigger state synchronizations + // when the state has changed. + triggerCh := make(chan struct{}, 1) + // create the local state - a.state = NewLocalState(c, a.logger, a.tokens) + a.state = NewLocalState(c, a.logger, a.tokens, triggerCh) + + // create the state synchronization manager which performs + // regular and on-demand state synchronizations (anti-entropy). + a.sync = &ae.StateSyncer{ + State: a.state, + Interval: c.AEInterval, + ShutdownCh: a.shutdownCh, + ConsulCh: consulCh, + TriggerCh: triggerCh, + Logger: a.logger, + } // create the config for the rpc server/client consulCfg, err := a.consulConfig() @@ -250,8 +274,16 @@ func (a *Agent) Start() error { return err } - // link consul client/server with the state - consulCfg.ServerUp = a.state.ConsulServerUp + // ServerUp is used to inform that a new consul server is now + // up. This can be used to speed up the sync process if we are blocking + // waiting to discover a consul server + // todo(fs): IMO, the non-blocking nature of this call should be hidden in the syncer + consulCfg.ServerUp = func() { + select { + case consulCh <- struct{}{}: + default: + } + } // Setup either the client or the server. if c.ServerMode { @@ -262,6 +294,7 @@ func (a *Agent) Start() error { a.delegate = server a.state.delegate = server + a.sync.ClusterSize = func() int { return len(server.LANMembers()) } } else { client, err := consul.NewClientLogger(consulCfg, a.logger) if err != nil { @@ -270,6 +303,7 @@ func (a *Agent) Start() error { a.delegate = client a.state.delegate = client + a.sync.ClusterSize = func() int { return len(client.LANMembers()) } } // Load checks/services/metadata. @@ -1264,18 +1298,18 @@ func (a *Agent) WANMembers() []serf.Member { // StartSync is called once Services and Checks are registered. // This is called to prevent a race between clients and the anti-entropy routines func (a *Agent) StartSync() { - // Start the anti entropy routine - go a.state.antiEntropy(a.shutdownCh) + go a.sync.Run() + a.logger.Printf("[INFO] agent: starting state syncer") } // PauseSync is used to pause anti-entropy while bulk changes are make func (a *Agent) PauseSync() { - a.state.Pause() + a.sync.Pause() } // ResumeSync is used to unpause anti-entropy after bulk changes are make func (a *Agent) ResumeSync() { - a.state.Resume() + a.sync.Resume() } // GetLANCoordinate returns the coordinates of this node in the local pools diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 271f53da62..13dcfb062f 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -304,7 +304,7 @@ func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request // services and checks to the server. 
If the operation fails, we only // only warn because the write did succeed and anti-entropy will sync later. func (s *HTTPServer) syncChanges() { - if err := s.agent.state.syncChanges(); err != nil { + if err := s.agent.state.SyncChanges(); err != nil { s.agent.logger.Printf("[ERR] agent: failed to sync changes: %v", err) } } diff --git a/agent/local.go b/agent/local.go index 2a29a9c6c0..58da5f7f38 100644 --- a/agent/local.go +++ b/agent/local.go @@ -18,11 +18,6 @@ import ( "github.com/hashicorp/consul/types" ) -const ( - syncStaggerIntv = 3 * time.Second - syncRetryIntv = 15 * time.Second -) - // syncStatus is used to represent the difference between // the local and remote state, and if action needs to be taken type syncStatus struct { @@ -33,7 +28,6 @@ type syncStatus struct { // populated during NewLocalAgent from the agent configuration to avoid // race conditions with the agent configuration. type localStateConfig struct { - AEInterval time.Duration AdvertiseAddr string CheckUpdateInterval time.Duration Datacenter string @@ -47,10 +41,6 @@ type localStateConfig struct { // and checks. We used it to perform anti-entropy with the // catalog representation type localState struct { - // paused is used to check if we are paused. Must be the first - // element due to a go bug. - paused int32 - sync.RWMutex logger *log.Logger @@ -81,10 +71,6 @@ type localState struct { // metadata tracks the local metadata fields metadata map[string]string - // consulCh is used to inform of a change to the known - // consul nodes. This may be used to retry a sync run - consulCh chan struct{} - // triggerCh is used to inform of a change to local state // that requires anti-entropy with the server triggerCh chan struct{} @@ -95,9 +81,8 @@ type localState struct { } // NewLocalState creates a is used to initialize the local state -func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) *localState { +func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store, triggerCh chan struct{}) *localState { lc := localStateConfig{ - AEInterval: c.AEInterval, AdvertiseAddr: c.AdvertiseAddrLAN.String(), CheckUpdateInterval: c.CheckUpdateInterval, Datacenter: c.Datacenter, @@ -122,8 +107,7 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) checkCriticalTime: make(map[types.CheckID]time.Time), deferCheck: make(map[types.CheckID]*time.Timer), metadata: make(map[string]string), - consulCh: make(chan struct{}, 1), - triggerCh: make(chan struct{}, 1), + triggerCh: triggerCh, } l.discardCheckOutput.Store(c.DiscardCheckOutput) return l @@ -131,42 +115,13 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) // changeMade is used to trigger an anti-entropy run func (l *localState) changeMade() { + // todo(fs): IMO, the non-blocking nature of this call should be hidden in the syncer select { case l.triggerCh <- struct{}{}: default: } } -// ConsulServerUp is used to inform that a new consul server is now -// up. 
This can be used to speed up the sync process if we are blocking -// waiting to discover a consul server -func (l *localState) ConsulServerUp() { - select { - case l.consulCh <- struct{}{}: - default: - } -} - -// Pause is used to pause state synchronization, this can be -// used to make batch changes -func (l *localState) Pause() { - atomic.AddInt32(&l.paused, 1) -} - -// Resume is used to resume state synchronization -func (l *localState) Resume() { - paused := atomic.AddInt32(&l.paused, -1) - if paused < 0 { - panic("unbalanced localState.Resume() detected") - } - l.changeMade() -} - -// isPaused is used to check if we are paused -func (l *localState) isPaused() bool { - return atomic.LoadInt32(&l.paused) > 0 -} - func (l *localState) SetDiscardCheckOutput(b bool) { l.discardCheckOutput.Store(b) } @@ -412,61 +367,12 @@ func (l *localState) Metadata() map[string]string { return metadata } -// antiEntropy is a long running method used to perform anti-entropy -// between local and remote state. -func (l *localState) antiEntropy(shutdownCh chan struct{}) { -SYNC: - // Sync our state with the servers - for { - err := l.setSyncState() - if err == nil { - break - } - l.logger.Printf("[ERR] agent: failed to sync remote state: %v", err) - select { - case <-l.consulCh: - // Stagger the retry on leader election, avoid a thundering heard - select { - case <-time.After(lib.RandomStagger(aeScale(syncStaggerIntv, len(l.delegate.LANMembers())))): - case <-shutdownCh: - return - } - case <-time.After(syncRetryIntv + lib.RandomStagger(aeScale(syncRetryIntv, len(l.delegate.LANMembers())))): - case <-shutdownCh: - return - } +// UpdateSyncState does a read of the server state, and updates +// the local sync status as appropriate +func (l *localState) UpdateSyncState() error { + if l == nil { + panic("config == nil") } - - // Force-trigger AE to pickup any changes - l.changeMade() - - // Schedule the next full sync, with a random stagger - aeIntv := aeScale(l.config.AEInterval, len(l.delegate.LANMembers())) - aeIntv = aeIntv + lib.RandomStagger(aeIntv) - aeTimer := time.After(aeIntv) - - // Wait for sync events - for { - select { - case <-aeTimer: - goto SYNC - case <-l.triggerCh: - // Skip the sync if we are paused - if l.isPaused() { - continue - } - if err := l.syncChanges(); err != nil { - l.logger.Printf("[ERR] agent: failed to sync changes: %v", err) - } - case <-shutdownCh: - return - } - } -} - -// setSyncState does a read of the server state, and updates -// the local syncStatus as appropriate -func (l *localState) setSyncState() error { req := structs.NodeSpecificRequest{ Datacenter: l.config.Datacenter, Node: l.config.NodeName, @@ -590,9 +496,9 @@ func (l *localState) setSyncState() error { return nil } -// syncChanges is used to scan the status our local services and checks +// SyncChanges is used to scan the status our local services and checks // and update any that are out of sync with the server -func (l *localState) syncChanges() error { +func (l *localState) SyncChanges() error { l.Lock() defer l.Unlock() diff --git a/agent/local_test.go b/agent/local_test.go index 2d0d2fb022..baad74a1e5 100644 --- a/agent/local_test.go +++ b/agent/local_test.go @@ -1482,7 +1482,7 @@ func TestAgent_serviceTokens(t *testing.T) { tokens := new(token.Store) tokens.UpdateUserToken("default") - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens) + l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, 
make(chan struct{}, 1)) l.AddService(&structs.NodeService{ ID: "redis", @@ -1511,7 +1511,7 @@ func TestAgent_checkTokens(t *testing.T) { tokens := new(token.Store) tokens.UpdateUserToken("default") - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens) + l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) // Returns default when no token is set if token := l.CheckToken("mem"); token != "default" { @@ -1533,7 +1533,7 @@ func TestAgent_checkTokens(t *testing.T) { func TestAgent_checkCriticalTime(t *testing.T) { t.Parallel() - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store)) + l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000} l.AddService(svc, "") @@ -1595,7 +1595,7 @@ func TestAgent_checkCriticalTime(t *testing.T) { func TestAgent_AddCheckFailure(t *testing.T) { t.Parallel() - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store)) + l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) // Add a check for a service that does not exist and verify that it fails checkID := types.CheckID("redis:1") @@ -1613,38 +1613,6 @@ func TestAgent_AddCheckFailure(t *testing.T) { } -func TestAgent_nestedPauseResume(t *testing.T) { - t.Parallel() - l := new(localState) - if l.isPaused() != false { - t.Fatal("localState should be unPaused after init") - } - l.Pause() - if l.isPaused() != true { - t.Fatal("localState should be Paused after first call to Pause()") - } - l.Pause() - if l.isPaused() != true { - t.Fatal("localState should STILL be Paused after second call to Pause()") - } - l.Resume() - if l.isPaused() != true { - t.Fatal("localState should STILL be Paused after FIRST call to Resume()") - } - l.Resume() - if l.isPaused() != false { - t.Fatal("localState should NOT be Paused after SECOND call to Resume()") - } - - defer func() { - err := recover() - if err == nil { - t.Fatal("unbalanced Resume() should cause a panic()") - } - }() - l.Resume() -} - func TestAgent_sendCoordinate(t *testing.T) { t.Parallel() a := NewTestAgent(t.Name(), ` diff --git a/agent/util.go b/agent/util.go index 3a13c2172a..ae7c3c154f 100644 --- a/agent/util.go +++ b/agent/util.go @@ -4,28 +4,16 @@ import ( "bytes" "crypto/md5" "fmt" - "math" "os" "os/exec" "os/signal" osuser "os/user" "strconv" - "time" "github.com/hashicorp/consul/types" "github.com/hashicorp/go-msgpack/codec" ) -const ( - // This scale factor means we will add a minute after we cross 128 nodes, - // another at 256, another at 512, etc. By 8192 nodes, we will scale up - // by a factor of 8. - // - // If you update this, you may need to adjust the tuning of - // CoordinateUpdatePeriod and CoordinateUpdateMaxBatchSize. - aeScaleThreshold = 128 -) - // msgpackHandle is a shared handle for encoding/decoding of // messages var msgpackHandle = &codec.MsgpackHandle{ @@ -33,18 +21,6 @@ var msgpackHandle = &codec.MsgpackHandle{ WriteExt: true, } -// aeScale is used to scale the time interval at which anti-entropy updates take -// place. It is used to prevent saturation as the cluster size grows. 
-func aeScale(interval time.Duration, n int) time.Duration { - // Don't scale until we cross the threshold - if n <= aeScaleThreshold { - return interval - } - - multiplier := math.Ceil(math.Log2(float64(n))-math.Log2(aeScaleThreshold)) + 1.0 - return time.Duration(multiplier) * interval -} - // decodeMsgPack is used to decode a MsgPack encoded object func decodeMsgPack(buf []byte, out interface{}) error { return codec.NewDecoder(bytes.NewReader(buf), msgpackHandle).Decode(out) diff --git a/agent/util_test.go b/agent/util_test.go index ea12c708db..b10f274c02 100644 --- a/agent/util_test.go +++ b/agent/util_test.go @@ -4,28 +4,10 @@ import ( "os" "runtime" "testing" - "time" "github.com/hashicorp/consul/testutil" ) -func TestAEScale(t *testing.T) { - t.Parallel() - intv := time.Minute - if v := aeScale(intv, 100); v != intv { - t.Fatalf("Bad: %v", v) - } - if v := aeScale(intv, 200); v != 2*intv { - t.Fatalf("Bad: %v", v) - } - if v := aeScale(intv, 1000); v != 4*intv { - t.Fatalf("Bad: %v", v) - } - if v := aeScale(intv, 10000); v != 8*intv { - t.Fatalf("Bad: %v", v) - } -} - func TestStringHash(t *testing.T) { t.Parallel() in := "hello world" From 94ef1041a12d0e36b483d8179f9a1579009f3142 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:10 +0200 Subject: [PATCH 02/36] agent: cleanup StateSyncer This patch cleans up the state syncer code by renaming fields, adding helpers and documentation. --- agent/ae/ae.go | 203 ++++++++++++++++++++++++++------------------ agent/ae/ae_test.go | 29 ++++--- agent/agent.go | 6 +- 3 files changed, 140 insertions(+), 98 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index d5f56c37a9..78bf93e251 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -1,4 +1,4 @@ -// Package ae provides an anti-entropy mechanism for the local state. +// Package ae provides tools to synchronize state between local and remote consul servers. package ae import ( @@ -10,35 +10,43 @@ import ( "github.com/hashicorp/consul/lib" ) -const ( - // This scale factor means we will add a minute after we cross 128 nodes, - // another at 256, another at 512, etc. By 8192 nodes, we will scale up - // by a factor of 8. - // - // If you update this, you may need to adjust the tuning of - // CoordinateUpdatePeriod and CoordinateUpdateMaxBatchSize. - aeScaleThreshold = 128 +// scaleThreshold is the number of nodes after which regular sync runs are +// spread out farther apart. The value should be a power of 2 since the +// scale function uses log2. +// +// When set to 128 nodes the delay between regular runs is doubled when the +// cluster is larger than 128 nodes. It doubles again when it passes 256 +// nodes, and again at 512 nodes and so forth. At 8192 nodes, the delay +// factor is 8. +// +// If you update this, you may need to adjust the tuning of +// CoordinateUpdatePeriod and CoordinateUpdateMaxBatchSize. +const scaleThreshold = 128 - syncStaggerIntv = 3 * time.Second - syncRetryIntv = 15 * time.Second -) - -// aeScale is used to scale the time interval at which anti-entropy updates take -// place. It is used to prevent saturation as the cluster size grows. -func aeScale(d time.Duration, n int) time.Duration { - // Don't scale until we cross the threshold - if n <= aeScaleThreshold { - return d +// scaleFactor returns a factor by which the next sync run should be delayed to +// avoid saturation of the cluster. The larger the cluster grows the farther +// the sync runs should be spread apart. 
+// +// The current implementation uses a log2 scale which doubles the delay between +// runs every time the cluster doubles in size. +func scaleFactor(nodes int) int { + if nodes <= scaleThreshold { + return 1.0 } - - mult := math.Ceil(math.Log2(float64(n))-math.Log2(aeScaleThreshold)) + 1.0 - return time.Duration(mult) * d + return int(math.Ceil(math.Log2(float64(nodes))-math.Log2(float64(scaleThreshold))) + 1.0) } +// StateSyncer manages background synchronization of the given state. +// +// The state is synchronized on a regular basis or on demand when either +// the state has changed or a new Consul server has joined the cluster. +// +// The regular state sychronization provides a self-healing mechanism +// for the cluster which is also called anti-entropy. type StateSyncer struct { - // paused is used to check if we are paused. Must be the first - // element due to a go bug. - // todo(fs): which bug? still relevant? + // paused flags whether sync runs are temporarily disabled. + // Must be the first element due to a go bug. + // todo(fs): which bug? Is this still relevant? paused int32 // State contains the data that needs to be synchronized. @@ -47,18 +55,18 @@ type StateSyncer struct { SyncChanges() error } - // Interval is the time between two sync runs. + // Interval is the time between two regular sync runs. Interval time.Duration - // ClusterSize returns the number of members in the cluster. - // todo(fs): we use this for staggering but what about a random number? + // ClusterSize returns the number of members in the cluster to + // allow staggering the sync runs based on cluster size. ClusterSize func() int // ShutdownCh is closed when the application is shutting down. ShutdownCh chan struct{} - // ConsulCh contains data when a new consul server has been added to the cluster. - ConsulCh chan struct{} + // ServerUpCh contains data when a new consul server has been added to the cluster. + ServerUpCh chan struct{} // TriggerCh contains data when a sync should run immediately. TriggerCh chan struct{} @@ -66,81 +74,112 @@ type StateSyncer struct { Logger *log.Logger } -// Pause is used to pause state synchronization, this can be -// used to make batch changes -func (ae *StateSyncer) Pause() { - atomic.AddInt32(&ae.paused, 1) -} +const ( + // serverUpIntv is the max time to wait before a sync is triggered + // when a consul server has been added to the cluster. + serverUpIntv = 3 * time.Second -// Resume is used to resume state synchronization -func (ae *StateSyncer) Resume() { - paused := atomic.AddInt32(&ae.paused, -1) - if paused < 0 { - panic("unbalanced State.Resume() detected") + // retryFailIntv is the min time to wait before a failed sync is retried. + retryFailIntv = 15 * time.Second +) + +// Run is the long running method to perform state synchronization +// between local and remote servers. +func (s *StateSyncer) Run() { + stagger := func(d time.Duration) time.Duration { + f := scaleFactor(s.ClusterSize()) + return lib.RandomStagger(time.Duration(f) * d) } - ae.changeMade() -} -// Paused is used to check if we are paused -func (ae *StateSyncer) Paused() bool { - return atomic.LoadInt32(&ae.paused) > 0 -} - -func (ae *StateSyncer) changeMade() { - select { - case ae.TriggerCh <- struct{}{}: - default: - } -} - -// antiEntropy is a long running method used to perform anti-entropy -// between local and remote state. 
-func (ae *StateSyncer) Run() { -SYNC: - // Sync our state with the servers +Sync: for { - err := ae.State.UpdateSyncState() + // update the sync status + err := s.State.UpdateSyncState() if err == nil { break } - ae.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + + s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + + // retry updating sync status after some time or when a consul + // server was added. select { - case <-ae.ConsulCh: - // Stagger the retry on leader election, avoid a thundering heard + + // consul server added to cluster. + // retry sooner than retryFailIntv to converge cluster quicker + // but stagger delay to avoid thundering herd + case <-s.ServerUpCh: select { - case <-time.After(lib.RandomStagger(aeScale(syncStaggerIntv, ae.ClusterSize()))): - case <-ae.ShutdownCh: + case <-time.After(stagger(serverUpIntv)): + case <-s.ShutdownCh: return } - case <-time.After(syncRetryIntv + lib.RandomStagger(aeScale(syncRetryIntv, ae.ClusterSize()))): - case <-ae.ShutdownCh: + + // retry full sync after some time + // todo(fs): why don't we use s.Interval here? + case <-time.After(retryFailIntv + stagger(retryFailIntv)): + + case <-s.ShutdownCh: return } } - // Force-trigger AE to pickup any changes - ae.changeMade() - - // Schedule the next full sync, with a random stagger - aeIntv := aeScale(ae.Interval, ae.ClusterSize()) - aeIntv = aeIntv + lib.RandomStagger(aeIntv) - aeTimer := time.After(aeIntv) + // Force-trigger sync to pickup any changes + s.triggerSync() // Wait for sync events for { select { - case <-aeTimer: - goto SYNC - case <-ae.TriggerCh: - // Skip the sync if we are paused - if ae.Paused() { + // todo(fs): why don't we honor the ServerUpCh here as well? + // todo(fs): by default, s.Interval is 60s which is >> 3s (serverUpIntv) + // case <-s.ServerUpCh: + // select { + // case <-time.After(stagger(serverUpIntv)): + // continue Sync + // case <-s.ShutdownCh: + // return + // } + + case <-time.After(s.Interval + stagger(s.Interval)): + goto Sync + + case <-s.TriggerCh: + if s.Paused() { continue } - if err := ae.State.SyncChanges(); err != nil { - ae.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) + if err := s.State.SyncChanges(); err != nil { + s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) } - case <-ae.ShutdownCh: + + case <-s.ShutdownCh: return } } } + +// Pause temporarily disables sync runs. +func (s *StateSyncer) Pause() { + atomic.AddInt32(&s.paused, 1) +} + +// Paused returns whether sync runs are temporarily disabled. +func (s *StateSyncer) Paused() bool { + return atomic.LoadInt32(&s.paused) > 0 +} + +// Resume re-enables sync runs. +func (s *StateSyncer) Resume() { + paused := atomic.AddInt32(&s.paused, -1) + if paused < 0 { + panic("unbalanced StateSyncer.Resume() detected") + } + s.triggerSync() +} + +// triggerSync queues a sync run if one has not been triggered already. 
+func (s *StateSyncer) triggerSync() { + select { + case s.TriggerCh <- struct{}{}: + default: + } +} diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index 5d91f2f9ba..7246e7e24a 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -1,24 +1,27 @@ package ae import ( + "fmt" "testing" - "time" ) -func TestAE_scale(t *testing.T) { +func TestAE_scaleFactor(t *testing.T) { t.Parallel() - intv := time.Minute - if v := aeScale(intv, 100); v != intv { - t.Fatalf("Bad: %v", v) + tests := []struct { + nodes int + scale int + }{ + {100, 1}, + {200, 2}, + {1000, 4}, + {10000, 8}, } - if v := aeScale(intv, 200); v != 2*intv { - t.Fatalf("Bad: %v", v) - } - if v := aeScale(intv, 1000); v != 4*intv { - t.Fatalf("Bad: %v", v) - } - if v := aeScale(intv, 10000); v != 8*intv { - t.Fatalf("Bad: %v", v) + for _, tt := range tests { + t.Run(fmt.Sprintf("%d nodes", tt.nodes), func(t *testing.T) { + if got, want := scaleFactor(tt.nodes), tt.scale; got != want { + t.Fatalf("got scale factor %d want %d", got, want) + } + }) } } diff --git a/agent/agent.go b/agent/agent.go index 1238359f3a..ddd438a6b3 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -248,7 +248,7 @@ func (a *Agent) Start() error { // create a notif channel to trigger state sychronizations // when a consul server was added to the cluster. - consulCh := make(chan struct{}, 1) + serverUpCh := make(chan struct{}, 1) // create a notif channel to trigger state synchronizations // when the state has changed. @@ -263,7 +263,7 @@ func (a *Agent) Start() error { State: a.state, Interval: c.AEInterval, ShutdownCh: a.shutdownCh, - ConsulCh: consulCh, + ServerUpCh: serverUpCh, TriggerCh: triggerCh, Logger: a.logger, } @@ -280,7 +280,7 @@ func (a *Agent) Start() error { // todo(fs): IMO, the non-blocking nature of this call should be hidden in the syncer consulCfg.ServerUp = func() { select { - case consulCh <- struct{}{}: + case serverUpCh <- struct{}{}: default: } } From cbaf97bced8e791dce9792bfbadb46194d88f2b3 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:10 +0200 Subject: [PATCH 03/36] agent: refactor sync loop to linear flow of control --- agent/ae/ae.go | 97 +++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 78bf93e251..f3b6745036 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -93,66 +93,67 @@ func (s *StateSyncer) Run() { Sync: for { - // update the sync status - err := s.State.UpdateSyncState() - if err == nil { - break - } + switch err := s.State.UpdateSyncState(); { - s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + // update sync status failed + case err != nil: + s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) - // retry updating sync status after some time or when a consul - // server was added. - select { - - // consul server added to cluster. - // retry sooner than retryFailIntv to converge cluster quicker - // but stagger delay to avoid thundering herd - case <-s.ServerUpCh: + // retry updating sync status after some time or when a consul + // server was added. select { - case <-time.After(stagger(serverUpIntv)): + + // consul server added to cluster. 
+ // retry sooner than retryFailIntv to converge cluster sooner + // but stagger delay to avoid thundering herd + case <-s.ServerUpCh: + select { + case <-time.After(stagger(serverUpIntv)): + case <-s.ShutdownCh: + return + } + + // retry full sync after some time + // todo(fs): why don't we use s.Interval here? + case <-time.After(retryFailIntv + stagger(retryFailIntv)): + case <-s.ShutdownCh: return } - // retry full sync after some time - // todo(fs): why don't we use s.Interval here? - case <-time.After(retryFailIntv + stagger(retryFailIntv)): + // update sync status OK + default: + // force-trigger sync to pickup any changes + s.triggerSync() - case <-s.ShutdownCh: - return - } - } + // do partial syncs until it is time for a full sync again + for { + select { + // todo(fs): why don't we honor the ServerUpCh here as well? + // todo(fs): by default, s.Interval is 60s which is >> 3s (serverUpIntv) + // case <-s.ServerUpCh: + // select { + // case <-time.After(stagger(serverUpIntv)): + // continue Sync + // case <-s.ShutdownCh: + // return + // } - // Force-trigger sync to pickup any changes - s.triggerSync() + case <-time.After(s.Interval + stagger(s.Interval)): + continue Sync - // Wait for sync events - for { - select { - // todo(fs): why don't we honor the ServerUpCh here as well? - // todo(fs): by default, s.Interval is 60s which is >> 3s (serverUpIntv) - // case <-s.ServerUpCh: - // select { - // case <-time.After(stagger(serverUpIntv)): - // continue Sync - // case <-s.ShutdownCh: - // return - // } + case <-s.TriggerCh: + if s.Paused() { + continue + } + if err := s.State.SyncChanges(); err != nil { + s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) + } - case <-time.After(s.Interval + stagger(s.Interval)): - goto Sync - - case <-s.TriggerCh: - if s.Paused() { - continue + case <-s.ShutdownCh: + return + } } - if err := s.State.SyncChanges(); err != nil { - s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) - } - - case <-s.ShutdownCh: - return } } } From c00bbdb5e485967f37ee643ddcd795904394f309 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:11 +0200 Subject: [PATCH 04/36] agent: simplify some loops --- agent/agent.go | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index ddd438a6b3..40f98c585c 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1620,8 +1620,8 @@ func (a *Agent) RemoveService(serviceID string, persist bool) error { } // Deregister any associated health checks - for checkID, health := range a.state.Checks() { - if health.ServiceID != serviceID { + for checkID, check := range a.state.Checks() { + if check.ServiceID != serviceID { continue } if err := a.RemoveCheck(checkID, persist); err != nil { @@ -1653,11 +1653,11 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } if check.ServiceID != "" { - svc, ok := a.state.Services()[check.ServiceID] - if !ok { + s := a.state.Services()[check.ServiceID] + if s == nil { return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) } - check.ServiceName = svc.Service + check.ServiceName = s.Service } a.checkLock.Lock() @@ -2159,12 +2159,11 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error { // unloadServices will deregister all services other than the 'consul' service // known to the local agent. 
func (a *Agent) unloadServices() error { - for _, service := range a.state.Services() { - if err := a.RemoveService(service.ID, false); err != nil { - return fmt.Errorf("Failed deregistering service '%s': %v", service.ID, err) + for id := range a.state.Services() { + if err := a.RemoveService(id, false); err != nil { + return fmt.Errorf("Failed deregistering service '%s': %v", id, err) } } - return nil } @@ -2247,12 +2246,11 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig) error { // unloadChecks will deregister all checks known to the local agent. func (a *Agent) unloadChecks() error { - for _, check := range a.state.Checks() { - if err := a.RemoveCheck(check.CheckID, false); err != nil { - return fmt.Errorf("Failed deregistering check '%s': %s", check.CheckID, err) + for id := range a.state.Checks() { + if err := a.RemoveCheck(id, false); err != nil { + return fmt.Errorf("Failed deregistering check '%s': %s", id, err) } } - return nil } From 6027a9e2a51860f7a90f99a539215faa17b8b768 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:12 +0200 Subject: [PATCH 05/36] local state: move to separate package This patch moves the local state to a separate package to further decouple it from the agent code. The code compiles but the tests do not yet. --- agent/agent.go | 58 +++---- agent/{local.go => local/state.go} | 150 ++++++++++++------- agent/{local_test.go => local/state_test.go} | 2 +- 3 files changed, 128 insertions(+), 82 deletions(-) rename agent/{local.go => local/state.go} (85%) rename agent/{local_test.go => local/state_test.go} (99%) diff --git a/agent/agent.go b/agent/agent.go index 40f98c585c..600432e8b3 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -23,6 +23,7 @@ import ( "github.com/hashicorp/consul/agent/ae" "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/consul" + "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/systemd" "github.com/hashicorp/consul/agent/token" @@ -108,7 +109,7 @@ type Agent struct { // state stores a local representation of the node, // services and checks. Used for anti-entropy. - state *localState + state *local.State // sync manages the synchronization of the local // and the remote state. @@ -255,7 +256,19 @@ func (a *Agent) Start() error { triggerCh := make(chan struct{}, 1) // create the local state - a.state = NewLocalState(c, a.logger, a.tokens, triggerCh) + lc := local.Config{ + AdvertiseAddr: c.AdvertiseAddrLAN.String(), + CheckUpdateInterval: c.CheckUpdateInterval, + Datacenter: c.Datacenter, + DiscardCheckOutput: c.DiscardCheckOutput, + NodeID: c.NodeID, + NodeName: c.NodeName, + TaggedAddresses: map[string]string{}, + } + for k, v := range c.TaggedAddresses { + lc.TaggedAddresses[k] = v + } + a.state = local.NewState(lc, a.logger, a.tokens, triggerCh) // create the state synchronization manager which performs // regular and on-demand state synchronizations (anti-entropy). 
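
As an aside from the patch itself: the wiring above boils down to two buffered notification channels plus the exported fields on ae.StateSyncer. Below is a minimal, standalone sketch of that wiring (not part of the patch series); the fakeState type and the one-minute interval are illustrative stand-ins for the agent's *local.State and c.AEInterval.

package main

import (
	"log"
	"os"
	"time"

	"github.com/hashicorp/consul/agent/ae"
)

// fakeState is a hypothetical stand-in for the agent's local state. It
// satisfies the anonymous State interface on ae.StateSyncer.
type fakeState struct{}

func (fakeState) UpdateSyncState() error { return nil }
func (fakeState) SyncChanges() error     { return nil }

func main() {
	shutdownCh := make(chan struct{})

	// Buffered with capacity 1 so notifications never block the sender.
	serverUpCh := make(chan struct{}, 1)
	triggerCh := make(chan struct{}, 1)

	s := &ae.StateSyncer{
		State:       fakeState{},
		Interval:    time.Minute, // the agent uses c.AEInterval here
		ShutdownCh:  shutdownCh,
		ServerUpCh:  serverUpCh,
		TriggerCh:   triggerCh,
		Logger:      log.New(os.Stderr, "", log.LstdFlags),
		ClusterSize: func() int { return 1 }, // the agent returns len(LANMembers())
	}
	go s.Run()

	// A local state change queues a partial sync without blocking,
	// mirroring changeMade()/triggerSync() in the patch.
	select {
	case triggerCh <- struct{}{}:
	default:
	}

	time.Sleep(100 * time.Millisecond)
	close(shutdownCh) // Run returns once the shutdown channel is closed
}

The buffered channels are what make ServerUp notifications and local changes safe to fire from any code path: both senders use a select with a default case, so at most one pending sync is queued and the caller never blocks.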
@@ -293,7 +306,7 @@ func (a *Agent) Start() error { } a.delegate = server - a.state.delegate = server + a.state.SetDelegate(server) a.sync.ClusterSize = func() int { return len(server.LANMembers()) } } else { client, err := consul.NewClientLogger(consulCfg, a.logger) @@ -302,7 +315,7 @@ func (a *Agent) Start() error { } a.delegate = client - a.state.delegate = client + a.state.SetDelegate(client) a.sync.ClusterSize = func() int { return len(client.LANMembers()) } } @@ -2005,15 +2018,13 @@ func (a *Agent) GossipEncrypted() bool { // Stats is used to get various debugging state from the sub-systems func (a *Agent) Stats() map[string]map[string]string { - toString := func(v uint64) string { - return strconv.FormatUint(v, 10) - } stats := a.delegate.Stats() stats["agent"] = map[string]string{ - "check_monitors": toString(uint64(len(a.checkMonitors))), - "check_ttls": toString(uint64(len(a.checkTTLs))), - "checks": toString(uint64(len(a.state.checks))), - "services": toString(uint64(len(a.state.services))), + "check_monitors": strconv.Itoa(len(a.checkMonitors)), + "check_ttls": strconv.Itoa(len(a.checkTTLs)), + } + for k, v := range a.state.Stats() { + stats["agent"][k] = v } revision := a.config.Revision @@ -2136,7 +2147,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error { } serviceID := p.Service.ID - if _, ok := a.state.services[serviceID]; ok { + if a.state.Service(serviceID) != nil { // Purge previously persisted service. This allows config to be // preferred over services persisted from the API. a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q", @@ -2215,7 +2226,7 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig) error { } checkID := p.Check.CheckID - if _, ok := a.state.checks[checkID]; ok { + if a.state.Check(checkID) != nil { // Purge previously persisted check. This allows config to be // preferred over persisted checks from the API. a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q", @@ -2273,26 +2284,17 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) { // loadMetadata loads node metadata fields from the agent config and // updates them on the local agent. 
func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error { - a.state.Lock() - defer a.state.Unlock() - - for key, value := range conf.NodeMeta { - a.state.metadata[key] = value + meta := map[string]string{} + for k, v := range conf.NodeMeta { + meta[k] = v } - - a.state.metadata[structs.MetaSegmentKey] = conf.SegmentName - - a.state.changeMade() - - return nil + meta[structs.MetaSegmentKey] = conf.SegmentName + return a.state.LoadMetadata(meta) } // unloadMetadata resets the local metadata state func (a *Agent) unloadMetadata() { - a.state.Lock() - defer a.state.Unlock() - - a.state.metadata = make(map[string]string) + a.state.UnloadMetadata() } // serviceMaintCheckID returns the ID of a given service's maintenance check diff --git a/agent/local.go b/agent/local/state.go similarity index 85% rename from agent/local.go rename to agent/local/state.go index 58da5f7f38..3e8379696b 100644 --- a/agent/local.go +++ b/agent/local/state.go @@ -1,16 +1,16 @@ -package agent +package local import ( "fmt" "log" "reflect" + "strconv" "strings" "sync" "sync/atomic" "time" "github.com/hashicorp/consul/acl" - "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/api" @@ -18,34 +18,41 @@ import ( "github.com/hashicorp/consul/types" ) +// permissionDenied is returned when an ACL based rejection happens. +const permissionDenied = "Permission denied" + // syncStatus is used to represent the difference between // the local and remote state, and if action needs to be taken type syncStatus struct { inSync bool // Is this in sync with the server } -// localStateConfig is the configuration for the localState. It is +// Config is the configuration for the State. It is // populated during NewLocalAgent from the agent configuration to avoid // race conditions with the agent configuration. -type localStateConfig struct { +type Config struct { AdvertiseAddr string CheckUpdateInterval time.Duration Datacenter string + DiscardCheckOutput bool NodeID types.NodeID NodeName string TaggedAddresses map[string]string - Tokens *token.Store } -// localState is used to represent the node's services, +type delegate interface { + RPC(method string, args interface{}, reply interface{}) error +} + +// State is used to represent the node's services, // and checks. We used it to perform anti-entropy with the // catalog representation -type localState struct { +type State struct { sync.RWMutex logger *log.Logger // Config is the agent config - config localStateConfig + config Config // delegate is the consul interface to use for keeping in sync delegate delegate @@ -78,25 +85,14 @@ type localState struct { // discardCheckOutput stores whether the output of health checks // is stored in the raft log. 
discardCheckOutput atomic.Value // bool + + tokens *token.Store } // NewLocalState creates a is used to initialize the local state -func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store, triggerCh chan struct{}) *localState { - lc := localStateConfig{ - AdvertiseAddr: c.AdvertiseAddrLAN.String(), - CheckUpdateInterval: c.CheckUpdateInterval, - Datacenter: c.Datacenter, - NodeID: c.NodeID, - NodeName: c.NodeName, - TaggedAddresses: map[string]string{}, - Tokens: tokens, - } - for k, v := range c.TaggedAddresses { - lc.TaggedAddresses[k] = v - } - - l := &localState{ - config: lc, +func NewState(c Config, lg *log.Logger, tokens *token.Store, triggerCh chan struct{}) *State { + l := &State{ + config: c, logger: lg, services: make(map[string]*structs.NodeService), serviceStatus: make(map[string]syncStatus), @@ -108,13 +104,18 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store, deferCheck: make(map[types.CheckID]*time.Timer), metadata: make(map[string]string), triggerCh: triggerCh, + tokens: tokens, } l.discardCheckOutput.Store(c.DiscardCheckOutput) return l } +func (l *State) SetDelegate(d delegate) { + l.delegate = d +} + // changeMade is used to trigger an anti-entropy run -func (l *localState) changeMade() { +func (l *State) changeMade() { // todo(fs): IMO, the non-blocking nature of this call should be hidden in the syncer select { case l.triggerCh <- struct{}{}: @@ -122,23 +123,23 @@ func (l *localState) changeMade() { } } -func (l *localState) SetDiscardCheckOutput(b bool) { +func (l *State) SetDiscardCheckOutput(b bool) { l.discardCheckOutput.Store(b) } // ServiceToken returns the configured ACL token for the given // service ID. If none is present, the agent's token is returned. -func (l *localState) ServiceToken(id string) string { +func (l *State) ServiceToken(id string) string { l.RLock() defer l.RUnlock() return l.serviceToken(id) } // serviceToken returns an ACL token associated with a service. -func (l *localState) serviceToken(id string) string { +func (l *State) serviceToken(id string) string { token := l.serviceTokens[id] if token == "" { - token = l.config.Tokens.UserToken() + token = l.tokens.UserToken() } return token } @@ -146,7 +147,7 @@ func (l *localState) serviceToken(id string) string { // AddService is used to add a service entry to the local state. // This entry is persistent and the agent will make a best effort to // ensure it is registered -func (l *localState) AddService(service *structs.NodeService, token string) { +func (l *State) AddService(service *structs.NodeService, token string) { // Assign the ID if none given if service.ID == "" && service.Service != "" { service.ID = service.Service @@ -163,7 +164,7 @@ func (l *localState) AddService(service *structs.NodeService, token string) { // RemoveService is used to remove a service entry from the local state. 
// The agent will make a best effort to ensure it is deregistered -func (l *localState) RemoveService(serviceID string) error { +func (l *State) RemoveService(serviceID string) error { l.Lock() defer l.Unlock() @@ -180,9 +181,17 @@ func (l *localState) RemoveService(serviceID string) error { return nil } +// Service returns the locally registered service that the +// agent is aware of and are being kept in sync with the server +func (l *State) Service(id string) *structs.NodeService { + l.RLock() + defer l.RUnlock() + return l.services[id] +} + // Services returns the locally registered services that the // agent is aware of and are being kept in sync with the server -func (l *localState) Services() map[string]*structs.NodeService { +func (l *State) Services() map[string]*structs.NodeService { services := make(map[string]*structs.NodeService) l.RLock() defer l.RUnlock() @@ -195,17 +204,17 @@ func (l *localState) Services() map[string]*structs.NodeService { // CheckToken is used to return the configured health check token for a // Check, or if none is configured, the default agent ACL token. -func (l *localState) CheckToken(checkID types.CheckID) string { +func (l *State) CheckToken(checkID types.CheckID) string { l.RLock() defer l.RUnlock() return l.checkToken(checkID) } // checkToken returns an ACL token associated with a check. -func (l *localState) checkToken(checkID types.CheckID) string { +func (l *State) checkToken(checkID types.CheckID) string { token := l.checkTokens[checkID] if token == "" { - token = l.config.Tokens.UserToken() + token = l.tokens.UserToken() } return token } @@ -213,7 +222,7 @@ func (l *localState) checkToken(checkID types.CheckID) string { // AddCheck is used to add a health check to the local state. // This entry is persistent and the agent will make a best effort to // ensure it is registered -func (l *localState) AddCheck(check *structs.HealthCheck, token string) error { +func (l *State) AddCheck(check *structs.HealthCheck, token string) error { l.Lock() defer l.Unlock() @@ -240,7 +249,7 @@ func (l *localState) AddCheck(check *structs.HealthCheck, token string) error { // RemoveCheck is used to remove a health check from the local state. // The agent will make a best effort to ensure it is deregistered -func (l *localState) RemoveCheck(checkID types.CheckID) { +func (l *State) RemoveCheck(checkID types.CheckID) { l.Lock() defer l.Unlock() @@ -253,7 +262,7 @@ func (l *localState) RemoveCheck(checkID types.CheckID) { } // UpdateCheck is used to update the status of a check -func (l *localState) UpdateCheck(checkID types.CheckID, status, output string) { +func (l *State) UpdateCheck(checkID types.CheckID, status, output string) { l.Lock() defer l.Unlock() @@ -311,9 +320,17 @@ func (l *localState) UpdateCheck(checkID types.CheckID, status, output string) { l.changeMade() } +// Check returns the locally registered check that the +// agent is aware of and are being kept in sync with the server +func (l *State) Check(id types.CheckID) *structs.HealthCheck { + l.RLock() + defer l.RUnlock() + return l.checks[id] +} + // Checks returns the locally registered checks that the // agent is aware of and are being kept in sync with the server -func (l *localState) Checks() map[types.CheckID]*structs.HealthCheck { +func (l *State) Checks() map[types.CheckID]*structs.HealthCheck { l.RLock() defer l.RUnlock() @@ -337,7 +354,7 @@ type CriticalCheck struct { // aware of and are being kept in sync with the server, and that are in a // critical state. 
This also returns information about how long each check has // been critical. -func (l *localState) CriticalChecks() map[types.CheckID]CriticalCheck { +func (l *State) CriticalChecks() map[types.CheckID]CriticalCheck { checks := make(map[types.CheckID]CriticalCheck) l.RLock() @@ -356,7 +373,7 @@ func (l *localState) CriticalChecks() map[types.CheckID]CriticalCheck { // Metadata returns the local node metadata fields that the // agent is aware of and are being kept in sync with the server -func (l *localState) Metadata() map[string]string { +func (l *State) Metadata() map[string]string { metadata := make(map[string]string) l.RLock() defer l.RUnlock() @@ -369,14 +386,11 @@ func (l *localState) Metadata() map[string]string { // UpdateSyncState does a read of the server state, and updates // the local sync status as appropriate -func (l *localState) UpdateSyncState() error { - if l == nil { - panic("config == nil") - } +func (l *State) UpdateSyncState() error { req := structs.NodeSpecificRequest{ Datacenter: l.config.Datacenter, Node: l.config.NodeName, - QueryOptions: structs.QueryOptions{Token: l.config.Tokens.AgentToken()}, + QueryOptions: structs.QueryOptions{Token: l.tokens.AgentToken()}, } var out1 structs.IndexedNodeServices var out2 structs.IndexedHealthChecks @@ -498,7 +512,7 @@ func (l *localState) UpdateSyncState() error { // SyncChanges is used to scan the status our local services and checks // and update any that are out of sync with the server -func (l *localState) SyncChanges() error { +func (l *State) SyncChanges() error { l.Lock() defer l.Unlock() @@ -555,8 +569,38 @@ func (l *localState) SyncChanges() error { return nil } +// LoadMetadata loads node metadata fields from the agent config and +// updates them on the local agent. +func (l *State) LoadMetadata(data map[string]string) error { + l.Lock() + defer l.Unlock() + + for k, v := range data { + l.metadata[k] = v + } + l.changeMade() + return nil +} + +// UnloadMetadata resets the local metadata state +func (l *State) UnloadMetadata() { + l.Lock() + defer l.Unlock() + l.metadata = make(map[string]string) +} + +// Stats is used to get various debugging state from the sub-systems +func (l *State) Stats() map[string]string { + l.RLock() + defer l.RUnlock() + return map[string]string{ + "services": strconv.Itoa(len(l.services)), + "checks": strconv.Itoa(len(l.checks)), + } +} + // deleteService is used to delete a service from the server -func (l *localState) deleteService(id string) error { +func (l *State) deleteService(id string) error { if id == "" { return fmt.Errorf("ServiceID missing") } @@ -583,7 +627,7 @@ func (l *localState) deleteService(id string) error { } // deleteCheck is used to delete a check from the server -func (l *localState) deleteCheck(id types.CheckID) error { +func (l *State) deleteCheck(id types.CheckID) error { if id == "" { return fmt.Errorf("CheckID missing") } @@ -610,7 +654,7 @@ func (l *localState) deleteCheck(id types.CheckID) error { } // syncService is used to sync a service to the server -func (l *localState) syncService(id string) error { +func (l *State) syncService(id string) error { req := structs.RegisterRequest{ Datacenter: l.config.Datacenter, ID: l.config.NodeID, @@ -667,7 +711,7 @@ func (l *localState) syncService(id string) error { } // syncCheck is used to sync a check to the server -func (l *localState) syncCheck(id types.CheckID) error { +func (l *State) syncCheck(id types.CheckID) error { // Pull in the associated service if any check := l.checks[id] var service 
*structs.NodeService @@ -704,7 +748,7 @@ func (l *localState) syncCheck(id types.CheckID) error { return err } -func (l *localState) syncNodeInfo() error { +func (l *State) syncNodeInfo() error { req := structs.RegisterRequest{ Datacenter: l.config.Datacenter, ID: l.config.NodeID, @@ -712,7 +756,7 @@ func (l *localState) syncNodeInfo() error { Address: l.config.AdvertiseAddr, TaggedAddresses: l.config.TaggedAddresses, NodeMeta: l.metadata, - WriteRequest: structs.WriteRequest{Token: l.config.Tokens.AgentToken()}, + WriteRequest: structs.WriteRequest{Token: l.tokens.AgentToken()}, } var out struct{} err := l.delegate.RPC("Catalog.Register", &req, &out) diff --git a/agent/local_test.go b/agent/local/state_test.go similarity index 99% rename from agent/local_test.go rename to agent/local/state_test.go index baad74a1e5..2e27c4475a 100644 --- a/agent/local_test.go +++ b/agent/local/state_test.go @@ -1,4 +1,4 @@ -package agent +package local import ( "reflect" From 0a9ac9749e22b4b3b27909a1ffdf20c07ca236c2 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:12 +0200 Subject: [PATCH 06/36] local state: replace multi-map state with structs The state of the service and health check records was spread out over multiple maps guarded by a single lock. Access to the maps has to happen in a coordinated effort and the tests often violated this which made them brittle and racy. This patch replaces the multiple maps with a single one for both checks and services to make the code less fragile. This is also necessary since moving the local state into its own package creates circular dependencies for the tests. To avoid this the tests can no longer access internal data structures which they should not be doing in the first place. The tests still don't compile but this is a ncessary step in that direction. --- agent/agent.go | 18 +- agent/local/state.go | 647 ++++++++++++++++++++++++++----------------- 2 files changed, 398 insertions(+), 267 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 600432e8b3..b7f51d8f5c 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1386,29 +1386,31 @@ OUTER: // reapServicesInternal does a single pass, looking for services to reap. func (a *Agent) reapServicesInternal() { - reaped := make(map[string]struct{}) - for checkID, check := range a.state.CriticalChecks() { + reaped := make(map[string]bool) + for checkID, cs := range a.state.CriticalCheckStates() { + serviceID := cs.Check.ServiceID + // There's nothing to do if there's no service. - if check.Check.ServiceID == "" { + if serviceID == "" { continue } // There might be multiple checks for one service, so // we don't need to reap multiple times. - serviceID := check.Check.ServiceID - if _, ok := reaped[serviceID]; ok { + if reaped[serviceID] { continue } // See if there's a timeout. + // todo(fs): this looks fishy... why is there anoter data structure in the agent with its own lock? a.checkLock.Lock() - timeout, ok := a.checkReapAfter[checkID] + timeout := a.checkReapAfter[checkID] a.checkLock.Unlock() // Reap, if necessary. We keep track of which service // this is so that we won't try to remove it again. 
- if ok && check.CriticalFor > timeout { - reaped[serviceID] = struct{}{} + if timeout > 0 && cs.CriticalFor() > timeout { + reaped[serviceID] = true a.RemoveService(serviceID, true) a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service", checkID, serviceID) diff --git a/agent/local/state.go b/agent/local/state.go index 3e8379696b..b2fdb833f4 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -21,12 +21,6 @@ import ( // permissionDenied is returned when an ACL based rejection happens. const permissionDenied = "Permission denied" -// syncStatus is used to represent the difference between -// the local and remote state, and if action needs to be taken -type syncStatus struct { - inSync bool // Is this in sync with the server -} - // Config is the configuration for the State. It is // populated during NewLocalAgent from the agent configuration to avoid // race conditions with the agent configuration. @@ -40,6 +34,62 @@ type Config struct { TaggedAddresses map[string]string } +// ServiceState describes the state of a service record. +type ServiceState struct { + // Service is the local copy of the service record. + Service *structs.NodeService + + // Token is the ACL to update the service record on the server. + Token string + + // InSync contains whether the local state of the service record + // is in sync with the remote state on the server. + InSync bool + + // Deleted is true when the service record has been marked as deleted + // but has not been removed on the server yet. + Deleted bool +} + +// CheckState describes the state of a health check record. +type CheckState struct { + // Check is the local copy of the health check record. + Check *structs.HealthCheck + + // Token is the ACL record to update the health check record + // on the server. + Token string + + // CriticalTime is the last time the health check status went + // from non-critical to critical. When the health check is not + // in critical state the value is the zero value. + CriticalTime time.Time + + // DeferCheck is used to delay the sync of a health check when + // only the status has changed. + // todo(fs): ^^ this needs double checking... + DeferCheck *time.Timer + + // InSync contains whether the local state of the health check + // record is in sync with the remote state on the server. + InSync bool + + // Deleted is true when the health check record has been marked as + // deleted but has not been removed on the server yet. + Deleted bool +} + +// Critical returns true when the health check is in critical state. +func (c *CheckState) Critical() bool { + return !c.CriticalTime.IsZero() +} + +// CriticalFor returns the amount of time the service has been in critical +// state. Its value is undefined when the service is not in critical state. 
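// A minimal usage sketch (illustrative only, not lines from this patch): since
// the value is undefined for a healthy check, callers are expected to guard
// with Critical(), or to iterate CriticalCheckStates(), which only returns
// checks that are already critical:
//
//	if cs.Critical() && cs.CriticalFor() > timeout {
//		// the check has been failing long enough to act on it
//	}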
+func (c *CheckState) CriticalFor() time.Duration { + return time.Since(c.CriticalTime) +} + type delegate interface { RPC(method string, args interface{}, reply interface{}) error } @@ -62,18 +112,10 @@ type State struct { nodeInfoInSync bool // Services tracks the local services - services map[string]*structs.NodeService - serviceStatus map[string]syncStatus - serviceTokens map[string]string + services map[string]*ServiceState // Checks tracks the local checks - checks map[types.CheckID]*structs.HealthCheck - checkStatus map[types.CheckID]syncStatus - checkTokens map[types.CheckID]string - checkCriticalTime map[types.CheckID]time.Time - - // Used to track checks that are being deferred - deferCheck map[types.CheckID]*time.Timer + checks map[types.CheckID]*CheckState // metadata tracks the local metadata fields metadata map[string]string @@ -86,25 +128,20 @@ type State struct { // is stored in the raft log. discardCheckOutput atomic.Value // bool + // tokens contains the ACL tokens tokens *token.Store } // NewLocalState creates a is used to initialize the local state func NewState(c Config, lg *log.Logger, tokens *token.Store, triggerCh chan struct{}) *State { l := &State{ - config: c, - logger: lg, - services: make(map[string]*structs.NodeService), - serviceStatus: make(map[string]syncStatus), - serviceTokens: make(map[string]string), - checks: make(map[types.CheckID]*structs.HealthCheck), - checkStatus: make(map[types.CheckID]syncStatus), - checkTokens: make(map[types.CheckID]string), - checkCriticalTime: make(map[types.CheckID]time.Time), - deferCheck: make(map[types.CheckID]*time.Timer), - metadata: make(map[string]string), - triggerCh: triggerCh, - tokens: tokens, + config: c, + logger: lg, + services: make(map[string]*ServiceState), + checks: make(map[types.CheckID]*CheckState), + metadata: make(map[string]string), + triggerCh: triggerCh, + tokens: tokens, } l.discardCheckOutput.Store(c.DiscardCheckOutput) return l @@ -137,7 +174,10 @@ func (l *State) ServiceToken(id string) string { // serviceToken returns an ACL token associated with a service. func (l *State) serviceToken(id string) string { - token := l.serviceTokens[id] + var token string + if s := l.services[id]; s != nil { + token = s.Token + } if token == "" { token = l.tokens.UserToken() } @@ -147,37 +187,48 @@ func (l *State) serviceToken(id string) string { // AddService is used to add a service entry to the local state. // This entry is persistent and the agent will make a best effort to // ensure it is registered -func (l *State) AddService(service *structs.NodeService, token string) { - // Assign the ID if none given - if service.ID == "" && service.Service != "" { +// todo(fs): where is the persistence happening? +func (l *State) AddService(service *structs.NodeService, token string) error { + l.Lock() + defer l.Unlock() + + if service == nil { + return fmt.Errorf("no service") + } + + // use the service name as id if the id was omitted + // todo(fs): is this for backwards compatibility? + if service.ID == "" { service.ID = service.Service } - l.Lock() - defer l.Unlock() - - l.services[service.ID] = service - l.serviceStatus[service.ID] = syncStatus{} - l.serviceTokens[service.ID] = token + l.services[service.ID] = &ServiceState{ + Service: service, + Token: token, + } l.changeMade() + + return nil } // RemoveService is used to remove a service entry from the local state. 
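// The removal below follows a mark-and-sweep pattern rather than deleting the
// entry immediately. A condensed sketch of the flow as it appears in this patch:
//
//	RemoveService(id)  // s.Deleted = true, s.InSync = false; entry kept for its ACL token
//	SyncChanges()      // sees s.Deleted and calls deleteService(id)
//	deleteService(id)  // RPCs Catalog.Deregister, then delete(l.services, id)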
-// The agent will make a best effort to ensure it is deregistered -func (l *State) RemoveService(serviceID string) error { +// The agent will make a best effort to ensure it is deregistered. +func (l *State) RemoveService(id string) error { l.Lock() defer l.Unlock() - if _, ok := l.services[serviceID]; ok { - delete(l.services, serviceID) - // Leave the service token around, if any, until we successfully - // delete the service. - l.serviceStatus[serviceID] = syncStatus{inSync: false} - l.changeMade() - } else { - return fmt.Errorf("Service does not exist") + s := l.services[id] + if s == nil || s.Deleted { + return fmt.Errorf("Service %q does not exist", id) } + // To remove the service on the server we need the token. + // Therefore, we mark the service as deleted and keep the + // entry around until it is actually removed. + s.InSync = false + s.Deleted = true + l.changeMade() + return nil } @@ -186,20 +237,28 @@ func (l *State) RemoveService(serviceID string) error { func (l *State) Service(id string) *structs.NodeService { l.RLock() defer l.RUnlock() - return l.services[id] + + s := l.services[id] + if s == nil || s.Deleted { + return nil + } + return s.Service } // Services returns the locally registered services that the // agent is aware of and are being kept in sync with the server func (l *State) Services() map[string]*structs.NodeService { - services := make(map[string]*structs.NodeService) l.RLock() defer l.RUnlock() - for name, serv := range l.services { - services[name] = serv + m := make(map[string]*structs.NodeService) + for id, s := range l.services { + if s.Deleted { + continue + } + m[id] = s.Service } - return services + return m } // CheckToken is used to return the configured health check token for a @@ -211,8 +270,12 @@ func (l *State) CheckToken(checkID types.CheckID) string { } // checkToken returns an ACL token associated with a check. -func (l *State) checkToken(checkID types.CheckID) string { - token := l.checkTokens[checkID] +func (l *State) checkToken(id types.CheckID) string { + var token string + c := l.checks[id] + if c != nil { + token = c.Token + } if token == "" { token = l.tokens.UserToken() } @@ -226,8 +289,9 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { l.Lock() defer l.Unlock() - // Set the node name - check.Node = l.config.NodeName + if check == nil { + return fmt.Errorf("no check") + } if l.discardCheckOutput.Load().(bool) { check.Output = "" @@ -236,38 +300,51 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { // if there is a serviceID associated with the check, make sure it exists before adding it // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor if check.ServiceID != "" && l.services[check.ServiceID] == nil { - return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) + return fmt.Errorf("Check %q refers to non-existent service %q does not exist", check.CheckID, check.ServiceID) } - l.checks[check.CheckID] = check - l.checkStatus[check.CheckID] = syncStatus{} - l.checkTokens[check.CheckID] = token - delete(l.checkCriticalTime, check.CheckID) + // hard-set the node name + check.Node = l.config.NodeName + + l.checks[check.CheckID] = &CheckState{ + Check: check, + Token: token, + } l.changeMade() + return nil } // RemoveCheck is used to remove a health check from the local state. 
// The agent will make a best effort to ensure it is deregistered -func (l *State) RemoveCheck(checkID types.CheckID) { +// todo(fs): RemoveService returns an error for a non-existant service. RemoveCheck should as well. +// todo(fs): Check code that calls this to handle the error. +func (l *State) RemoveCheck(id types.CheckID) error { l.Lock() defer l.Unlock() - delete(l.checks, checkID) - // Leave the check token around, if any, until we successfully delete - // the check. - delete(l.checkCriticalTime, checkID) - l.checkStatus[checkID] = syncStatus{inSync: false} + c := l.checks[id] + if c == nil || c.Deleted { + return fmt.Errorf("Check %q does not exist", id) + } + + // To remove the check on the server we need the token. + // Therefore, we mark the service as deleted and keep the + // entry around until it is actually removed. + c.InSync = false + c.Deleted = true l.changeMade() + + return nil } // UpdateCheck is used to update the status of a check -func (l *State) UpdateCheck(checkID types.CheckID, status, output string) { +func (l *State) UpdateCheck(id types.CheckID, status, output string) { l.Lock() defer l.Unlock() - check, ok := l.checks[checkID] - if !ok { + c := l.checks[id] + if c == nil || c.Deleted { return } @@ -278,16 +355,15 @@ func (l *State) UpdateCheck(checkID types.CheckID, status, output string) { // Update the critical time tracking (this doesn't cause a server updates // so we can always keep this up to date). if status == api.HealthCritical { - _, wasCritical := l.checkCriticalTime[checkID] - if !wasCritical { - l.checkCriticalTime[checkID] = time.Now() + if !c.Critical() { + c.CriticalTime = time.Now() } } else { - delete(l.checkCriticalTime, checkID) + c.CriticalTime = time.Time{} } // Do nothing if update is idempotent - if check.Status == status && check.Output == output { + if c.Check.Status == status && c.Check.Output == output { return } @@ -295,28 +371,34 @@ func (l *State) UpdateCheck(checkID types.CheckID, status, output string) { // frequent updates of output. Instead, we update the output internally, // and periodically do a write-back to the servers. If there is a status // change we do the write immediately. 
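// As a concrete illustration (assuming lib.RandomStagger(d) returns a random
// duration in [0, d)): with CheckUpdateInterval = 5m, an output-only change is
// written back after
//
//	intv := d/2 + lib.RandomStagger(d)   // somewhere in [2m30s, 7m30s)
//
// whereas a status change still triggers an immediate sync below.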
- if l.config.CheckUpdateInterval > 0 && check.Status == status { - check.Output = output - if _, ok := l.deferCheck[checkID]; !ok { - intv := time.Duration(uint64(l.config.CheckUpdateInterval)/2) + lib.RandomStagger(l.config.CheckUpdateInterval) - deferSync := time.AfterFunc(intv, func() { + if l.config.CheckUpdateInterval > 0 && c.Check.Status == status { + c.Check.Output = output + if c.DeferCheck == nil { + d := l.config.CheckUpdateInterval + intv := time.Duration(uint64(d)/2) + lib.RandomStagger(d) + c.DeferCheck = time.AfterFunc(intv, func() { l.Lock() - if _, ok := l.checkStatus[checkID]; ok { - l.checkStatus[checkID] = syncStatus{inSync: false} - l.changeMade() + defer l.Unlock() + + c := l.checks[id] + if c == nil { + return } - delete(l.deferCheck, checkID) - l.Unlock() + c.DeferCheck = nil + if c.Deleted { + return + } + c.InSync = false + l.changeMade() }) - l.deferCheck[checkID] = deferSync } return } // Update status and mark out of sync - check.Status = status - check.Output = output - l.checkStatus[checkID] = syncStatus{inSync: false} + c.Check.Status = status + c.Check.Output = output + c.InSync = false l.changeMade() } @@ -325,7 +407,12 @@ func (l *State) UpdateCheck(checkID types.CheckID, status, output string) { func (l *State) Check(id types.CheckID) *structs.HealthCheck { l.RLock() defer l.RUnlock() - return l.checks[id] + + c := l.checks[id] + if c == nil || c.Deleted { + return nil + } + return c.Check } // Checks returns the locally registered checks that the @@ -334,78 +421,83 @@ func (l *State) Checks() map[types.CheckID]*structs.HealthCheck { l.RLock() defer l.RUnlock() - checks := make(map[types.CheckID]*structs.HealthCheck) + m := make(map[types.CheckID]*structs.HealthCheck) for id, c := range l.checks { + if c.Deleted { + continue + } c2 := new(structs.HealthCheck) - *c2 = *c - checks[id] = c2 + *c2 = *c.Check + m[id] = c2 } - return checks + return m } -// CriticalCheck is used to return the duration a check has been critical along -// with its associated health check. -type CriticalCheck struct { - CriticalFor time.Duration - Check *structs.HealthCheck -} - -// CriticalChecks returns locally registered health checks that the agent is -// aware of and are being kept in sync with the server, and that are in a -// critical state. This also returns information about how long each check has -// been critical. 
-func (l *State) CriticalChecks() map[types.CheckID]CriticalCheck { - checks := make(map[types.CheckID]CriticalCheck) - +// CriticalCheckStates returns the locally registered checks that the +// agent is aware of and are being kept in sync with the server +func (l *State) CriticalCheckStates() map[types.CheckID]*CheckState { l.RLock() defer l.RUnlock() - now := time.Now() - for checkID, criticalTime := range l.checkCriticalTime { - checks[checkID] = CriticalCheck{ - CriticalFor: now.Sub(criticalTime), - Check: l.checks[checkID], + m := make(map[types.CheckID]*CheckState) + for id, c := range l.checks { + if c.Deleted || !c.Critical() { + continue } + m[id] = c } - - return checks + return m } // Metadata returns the local node metadata fields that the // agent is aware of and are being kept in sync with the server func (l *State) Metadata() map[string]string { - metadata := make(map[string]string) l.RLock() defer l.RUnlock() - - for key, value := range l.metadata { - metadata[key] = value + m := make(map[string]string) + for k, v := range l.metadata { + m[k] = v } - return metadata + return m } // UpdateSyncState does a read of the server state, and updates // the local sync status as appropriate func (l *State) UpdateSyncState() error { + // 1. get all checks and services from the master req := structs.NodeSpecificRequest{ Datacenter: l.config.Datacenter, Node: l.config.NodeName, QueryOptions: structs.QueryOptions{Token: l.tokens.AgentToken()}, } + var out1 structs.IndexedNodeServices - var out2 structs.IndexedHealthChecks - if e := l.delegate.RPC("Catalog.NodeServices", &req, &out1); e != nil { - return e + if err := l.delegate.RPC("Catalog.NodeServices", &req, &out1); err != nil { + return err } + + var out2 structs.IndexedHealthChecks if err := l.delegate.RPC("Health.NodeChecks", &req, &out2); err != nil { return err } - checks := out2.HealthChecks + + // 2. create useful data structures for traversal + remoteServices := make(map[string]*structs.NodeService) + if out1.NodeServices != nil { + remoteServices = out1.NodeServices.Services + } + + remoteChecks := make(map[types.CheckID]*structs.HealthCheck, len(out2.HealthChecks)) + for _, rc := range out2.HealthChecks { + remoteChecks[rc.CheckID] = rc + } + + // 3. perform sync l.Lock() defer l.Unlock() - // Check the node info + // sync node info if out1.NodeServices == nil || out1.NodeServices.Node == nil || out1.NodeServices.Node.ID != l.config.NodeID || !reflect.DeepEqual(out1.NodeServices.Node.TaggedAddresses, l.config.TaggedAddresses) || @@ -413,99 +505,103 @@ func (l *State) UpdateSyncState() error { l.nodeInfoInSync = false } - // Check all our services - services := make(map[string]*structs.NodeService) - if out1.NodeServices != nil { - services = out1.NodeServices.Services - } + // sync services - for id := range l.services { - // If the local service doesn't exist remotely, then sync it - if _, ok := services[id]; !ok { - l.serviceStatus[id] = syncStatus{inSync: false} + // sync local services that do not exist remotely + for id, s := range l.services { + if remoteServices[id] == nil { + s.InSync = false } } - for id, service := range services { + for id, rs := range remoteServices { // If we don't have the service locally, deregister it - existing, ok := l.services[id] - if !ok { - // The consul service is created automatically, and does + ls := l.services[id] + if ls == nil { + // The consul service is created automatically and does // not need to be deregistered. 
if id == structs.ConsulServiceID { continue } - l.serviceStatus[id] = syncStatus{inSync: false} + + l.services[id] = &ServiceState{Deleted: true} + continue + } + + // If the service is scheduled for removal skip it. + // todo(fs): is this correct? + if ls.Deleted { continue } // If our definition is different, we need to update it. Make a // copy so that we don't retain a pointer to any actual state // store info for in-memory RPCs. - if existing.EnableTagOverride { - existing.Tags = make([]string, len(service.Tags)) - copy(existing.Tags, service.Tags) + if ls.Service.EnableTagOverride { + ls.Service.Tags = make([]string, len(rs.Tags)) + copy(ls.Service.Tags, rs.Tags) } - equal := existing.IsSame(service) - l.serviceStatus[id] = syncStatus{inSync: equal} + ls.InSync = ls.Service.IsSame(rs) } - // Index the remote health checks to improve efficiency - checkIndex := make(map[types.CheckID]*structs.HealthCheck, len(checks)) - for _, check := range checks { - checkIndex[check.CheckID] = check - } + // sync checks - // Sync any check which doesn't exist on the remote side - for id := range l.checks { - if _, ok := checkIndex[id]; !ok { - l.checkStatus[id] = syncStatus{inSync: false} + // sync local checks which do not exist remotely + for id, c := range l.checks { + if remoteChecks[id] == nil { + c.InSync = false } } - for _, check := range checks { + for id, rc := range remoteChecks { + + lc := l.checks[id] + // If we don't have the check locally, deregister it - id := check.CheckID - existing, ok := l.checks[id] - if !ok { - // The Serf check is created automatically, and does not + if lc == nil { + // The Serf check is created automatically and does not // need to be deregistered. if id == structs.SerfCheckID { + l.logger.Printf("Skipping remote check %q since it is managed automatically", id) continue } - l.checkStatus[id] = syncStatus{inSync: false} + + l.checks[id] = &CheckState{Deleted: true} + continue + } + + // If the check is scheduled for removal skip it. + // todo(fs): is this correct? + if lc.Deleted { continue } // If our definition is different, we need to update it - var equal bool if l.config.CheckUpdateInterval == 0 { - equal = existing.IsSame(check) - } else { - // Copy the existing check before potentially modifying - // it before the compare operation. - eCopy := existing.Clone() - - // Copy the server's check before modifying, otherwise - // in-memory RPCs will have side effects. - cCopy := check.Clone() - - // If there's a defer timer active then we've got a - // potentially spammy check so we don't sync the output - // during this sweep since the timer will mark the check - // out of sync for us. Otherwise, it is safe to sync the - // output now. This is especially important for checks - // that don't change state after they are created, in - // which case we'd never see their output synced back ever. - if _, ok := l.deferCheck[id]; ok { - eCopy.Output = "" - cCopy.Output = "" - } - equal = eCopy.IsSame(cCopy) + lc.InSync = lc.Check.IsSame(rc) + continue } - // Update the status - l.checkStatus[id] = syncStatus{inSync: equal} + // Copy the existing check before potentially modifying + // it before the compare operation. + lcCopy := lc.Check.Clone() + + // Copy the server's check before modifying, otherwise + // in-memory RPCs will have side effects. + rcCopy := rc.Clone() + + // If there's a defer timer active then we've got a + // potentially spammy check so we don't sync the output + // during this sweep since the timer will mark the check + // out of sync for us. 
Otherwise, it is safe to sync the + // output now. This is especially important for checks + // that don't change state after they are created, in + // which case we'd never see their output synced back ever. + if lc.DeferCheck != nil { + lcCopy.Output = "" + rcCopy.Output = "" + } + lc.InSync = lcCopy.IsSame(rcCopy) } return nil } @@ -521,39 +617,38 @@ func (l *State) SyncChanges() error { // API works. // Sync the services - for id, status := range l.serviceStatus { - if _, ok := l.services[id]; !ok { - if err := l.deleteService(id); err != nil { - return err - } - } else if !status.inSync { - if err := l.syncService(id); err != nil { - return err - } - } else { + for id, s := range l.services { + var err error + switch { + case s.Deleted: + err = l.deleteService(id) + case !s.InSync: + err = l.syncService(id) + default: l.logger.Printf("[DEBUG] agent: Service '%s' in sync", id) } + if err != nil { + return err + } } - // Sync the checks - for id, status := range l.checkStatus { - if _, ok := l.checks[id]; !ok { - if err := l.deleteCheck(id); err != nil { - return err + for id, c := range l.checks { + var err error + switch { + case c.Deleted: + err = l.deleteCheck(id) + case !c.InSync: + if c.DeferCheck != nil { + c.DeferCheck.Stop() + c.DeferCheck = nil } - } else if !status.inSync { - // Cancel a deferred sync - if timer := l.deferCheck[id]; timer != nil { - timer.Stop() - delete(l.deferCheck, id) - } - - if err := l.syncCheck(id); err != nil { - return err - } - } else { + err = l.syncCheck(id) + default: l.logger.Printf("[DEBUG] agent: Check '%s' in sync", id) } + if err != nil { + return err + } } // Now sync the node level info if we need to, and didn't do any of @@ -593,9 +688,26 @@ func (l *State) UnloadMetadata() { func (l *State) Stats() map[string]string { l.RLock() defer l.RUnlock() + + services := 0 + for _, s := range l.services { + if s.Deleted { + continue + } + services++ + } + + checks := 0 + for _, c := range l.checks { + if c.Deleted { + continue + } + checks++ + } + return map[string]string{ - "services": strconv.Itoa(len(l.services)), - "checks": strconv.Itoa(len(l.checks)), + "services": strconv.Itoa(services), + "checks": strconv.Itoa(checks), } } @@ -614,12 +726,13 @@ func (l *State) deleteService(id string) error { var out struct{} err := l.delegate.RPC("Catalog.Deregister", &req, &out) if err == nil || strings.Contains(err.Error(), "Unknown service") { - delete(l.serviceStatus, id) - delete(l.serviceTokens, id) + delete(l.services, id) l.logger.Printf("[INFO] agent: Deregistered service '%s'", id) return nil - } else if acl.IsErrPermissionDenied(err) { - l.serviceStatus[id] = syncStatus{inSync: true} + } + if acl.IsErrPermissionDenied(err) { + // todo(fs): why is the service in sync here? + l.services[id].InSync = true l.logger.Printf("[WARN] agent: Service '%s' deregistration blocked by ACLs", id) return nil } @@ -641,12 +754,14 @@ func (l *State) deleteCheck(id types.CheckID) error { var out struct{} err := l.delegate.RPC("Catalog.Deregister", &req, &out) if err == nil || strings.Contains(err.Error(), "Unknown check") { - delete(l.checkStatus, id) - delete(l.checkTokens, id) + // todo(fs): do we need to stop the deferCheck timer here? + delete(l.checks, id) l.logger.Printf("[INFO] agent: Deregistered check '%s'", id) return nil - } else if acl.IsErrPermissionDenied(err) { - l.checkStatus[id] = syncStatus{inSync: true} + } + if acl.IsErrPermissionDenied(err) { + // todo(fs): why is the check in sync here? 
+ l.checks[id].InSync = true l.logger.Printf("[WARN] agent: Check '%s' deregistration blocked by ACLs", id) return nil } @@ -655,17 +770,6 @@ func (l *State) deleteCheck(id types.CheckID) error { // syncService is used to sync a service to the server func (l *State) syncService(id string) error { - req := structs.RegisterRequest{ - Datacenter: l.config.Datacenter, - ID: l.config.NodeID, - Node: l.config.NodeName, - Address: l.config.AdvertiseAddr, - TaggedAddresses: l.config.TaggedAddresses, - NodeMeta: l.metadata, - Service: l.services[id], - WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)}, - } - // If the service has associated checks that are out of sync, // piggyback them on the service sync so they are part of the // same transaction and are registered atomically. We only let @@ -673,12 +777,28 @@ func (l *State) syncService(id string) error { // otherwise we need to register them separately so they don't // pick up privileges from the service token. var checks structs.HealthChecks - for _, check := range l.checks { - if check.ServiceID == id && (l.serviceToken(id) == l.checkToken(check.CheckID)) { - if stat, ok := l.checkStatus[check.CheckID]; !ok || !stat.inSync { - checks = append(checks, check) - } + for checkID, c := range l.checks { + if c.Deleted || c.InSync { + continue } + if c.Check.ServiceID != id { + continue + } + if l.serviceToken(id) != l.checkToken(checkID) { + continue + } + checks = append(checks, c.Check) + } + + req := structs.RegisterRequest{ + Datacenter: l.config.Datacenter, + ID: l.config.NodeID, + Node: l.config.NodeName, + Address: l.config.AdvertiseAddr, + TaggedAddresses: l.config.TaggedAddresses, + NodeMeta: l.metadata, + Service: l.services[id].Service, + WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)}, } // Backwards-compatibility for Consul < 0.5 @@ -691,20 +811,24 @@ func (l *State) syncService(id string) error { var out struct{} err := l.delegate.RPC("Catalog.Register", &req, &out) if err == nil { - l.serviceStatus[id] = syncStatus{inSync: true} + l.services[id].InSync = true // Given how the register API works, this info is also updated // every time we sync a service. l.nodeInfoInSync = true + for _, check := range checks { + l.checks[check.CheckID].InSync = true + } l.logger.Printf("[INFO] agent: Synced service '%s'", id) + return nil + } + if acl.IsErrPermissionDenied(err) { + // todo(fs): why are the service and the checks in sync here? + // todo(fs): why is the node info not in sync here? 
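// One plausible answer to the todos above (an inference, not confirmed by the
// original authors): treating an ACL-rejected write as "in sync" keeps the
// anti-entropy loop from retrying, and re-logging, a registration that the
// servers will keep refusing until the token is fixed. The same reasoning
// would apply to the deleteService/deleteCheck branches earlier in this file.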
+ l.services[id].InSync = true for _, check := range checks { - l.checkStatus[check.CheckID] = syncStatus{inSync: true} + l.checks[check.CheckID].InSync = true } - } else if acl.IsErrPermissionDenied(err) { - l.serviceStatus[id] = syncStatus{inSync: true} l.logger.Printf("[WARN] agent: Service '%s' registration blocked by ACLs", id) - for _, check := range checks { - l.checkStatus[check.CheckID] = syncStatus{inSync: true} - } return nil } return err @@ -712,14 +836,7 @@ func (l *State) syncService(id string) error { // syncCheck is used to sync a check to the server func (l *State) syncCheck(id types.CheckID) error { - // Pull in the associated service if any - check := l.checks[id] - var service *structs.NodeService - if check.ServiceID != "" { - if serv, ok := l.services[check.ServiceID]; ok { - service = serv - } - } + c := l.checks[id] req := structs.RegisterRequest{ Datacenter: l.config.Datacenter, @@ -728,20 +845,29 @@ func (l *State) syncCheck(id types.CheckID) error { Address: l.config.AdvertiseAddr, TaggedAddresses: l.config.TaggedAddresses, NodeMeta: l.metadata, - Service: service, - Check: l.checks[id], + Check: c.Check, WriteRequest: structs.WriteRequest{Token: l.checkToken(id)}, } + + // Pull in the associated service if any + s := l.services[c.Check.ServiceID] + if s != nil && !s.Deleted { + req.Service = s.Service + } + var out struct{} err := l.delegate.RPC("Catalog.Register", &req, &out) if err == nil { - l.checkStatus[id] = syncStatus{inSync: true} + l.checks[id].InSync = true // Given how the register API works, this info is also updated // every time we sync a check. l.nodeInfoInSync = true l.logger.Printf("[INFO] agent: Synced check '%s'", id) - } else if acl.IsErrPermissionDenied(err) { - l.checkStatus[id] = syncStatus{inSync: true} + return nil + } + if acl.IsErrPermissionDenied(err) { + // todo(fs): why is the check in sync here? + l.checks[id].InSync = true l.logger.Printf("[WARN] agent: Check '%s' registration blocked by ACLs", id) return nil } @@ -763,7 +889,10 @@ func (l *State) syncNodeInfo() error { if err == nil { l.nodeInfoInSync = true l.logger.Printf("[INFO] agent: Synced node info") - } else if acl.IsErrPermissionDenied(err) { + return nil + } + if acl.IsErrPermissionDenied(err) { + // todo(fs): why is the node info in sync here? l.nodeInfoInSync = true l.logger.Printf("[WARN] agent: Node info update blocked by ACLs") return nil From b803bf3091c0cb4ea37fcf67f5c4f9c53ac618b5 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:13 +0200 Subject: [PATCH 07/36] local state: tests compile --- agent/acl.go | 8 +- agent/acl_test.go | 16 +- agent/agent.go | 98 ++++---- agent/agent_endpoint.go | 8 +- agent/agent_endpoint_test.go | 62 ++--- agent/agent_test.go | 140 +++++------ agent/catalog_endpoint_test.go | 40 +-- agent/local/state.go | 115 +++++++-- agent/local/state_test.go | 431 ++++++++++++--------------------- agent/user_event.go | 2 +- agent/user_event_test.go | 4 +- 11 files changed, 444 insertions(+), 480 deletions(-) diff --git a/agent/acl.go b/agent/acl.go index 2aee3faadc..4245c53a05 100644 --- a/agent/acl.go +++ b/agent/acl.go @@ -259,7 +259,7 @@ func (a *Agent) vetServiceRegister(token string, service *structs.NodeService) e } // Vet any service that might be getting overwritten. 
- services := a.state.Services() + services := a.State.Services() if existing, ok := services[service.ID]; ok { if !rule.ServiceWrite(existing.Service, nil) { return acl.ErrPermissionDenied @@ -282,7 +282,7 @@ func (a *Agent) vetServiceUpdate(token string, serviceID string) error { } // Vet any changes based on the existing services's info. - services := a.state.Services() + services := a.State.Services() if existing, ok := services[serviceID]; ok { if !rule.ServiceWrite(existing.Service, nil) { return acl.ErrPermissionDenied @@ -318,7 +318,7 @@ func (a *Agent) vetCheckRegister(token string, check *structs.HealthCheck) error } // Vet any check that might be getting overwritten. - checks := a.state.Checks() + checks := a.State.Checks() if existing, ok := checks[check.CheckID]; ok { if len(existing.ServiceName) > 0 { if !rule.ServiceWrite(existing.ServiceName, nil) { @@ -346,7 +346,7 @@ func (a *Agent) vetCheckUpdate(token string, checkID types.CheckID) error { } // Vet any changes based on the existing check's info. - checks := a.state.Checks() + checks := a.State.Checks() if existing, ok := checks[checkID]; ok { if len(existing.ServiceName) > 0 { if !rule.ServiceWrite(existing.ServiceName, nil) { diff --git a/agent/acl_test.go b/agent/acl_test.go index 42ddbcaa25..8b16269981 100644 --- a/agent/acl_test.go +++ b/agent/acl_test.go @@ -564,7 +564,7 @@ func TestACL_vetServiceRegister(t *testing.T) { // Try to register over a service without write privs to the existing // service. - a.state.AddService(&structs.NodeService{ + a.State.AddService(&structs.NodeService{ ID: "my-service", Service: "other", }, "") @@ -596,7 +596,7 @@ func TestACL_vetServiceUpdate(t *testing.T) { } // Update with write privs. - a.state.AddService(&structs.NodeService{ + a.State.AddService(&structs.NodeService{ ID: "my-service", Service: "service", }, "") @@ -662,11 +662,11 @@ func TestACL_vetCheckRegister(t *testing.T) { // Try to register over a service check without write privs to the // existing service. - a.state.AddService(&structs.NodeService{ + a.State.AddService(&structs.NodeService{ ID: "my-service", Service: "service", }, "") - a.state.AddCheck(&structs.HealthCheck{ + a.State.AddCheck(&structs.HealthCheck{ CheckID: types.CheckID("my-check"), ServiceID: "my-service", ServiceName: "other", @@ -681,7 +681,7 @@ func TestACL_vetCheckRegister(t *testing.T) { } // Try to register over a node check without write privs to the node. - a.state.AddCheck(&structs.HealthCheck{ + a.State.AddCheck(&structs.HealthCheck{ CheckID: types.CheckID("my-node-check"), }, "") err = a.vetCheckRegister("service-rw", &structs.HealthCheck{ @@ -713,11 +713,11 @@ func TestACL_vetCheckUpdate(t *testing.T) { } // Update service check with write privs. - a.state.AddService(&structs.NodeService{ + a.State.AddService(&structs.NodeService{ ID: "my-service", Service: "service", }, "") - a.state.AddCheck(&structs.HealthCheck{ + a.State.AddCheck(&structs.HealthCheck{ CheckID: types.CheckID("my-service-check"), ServiceID: "my-service", ServiceName: "service", @@ -734,7 +734,7 @@ func TestACL_vetCheckUpdate(t *testing.T) { } // Update node check with write privs. 
- a.state.AddCheck(&structs.HealthCheck{ + a.State.AddCheck(&structs.HealthCheck{ CheckID: types.CheckID("my-node-check"), }, "") err = a.vetCheckUpdate("node-rw", "my-node-check") diff --git a/agent/agent.go b/agent/agent.go index b7f51d8f5c..d800002e9f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -109,7 +109,7 @@ type Agent struct { // state stores a local representation of the node, // services and checks. Used for anti-entropy. - state *local.State + State *local.State // sync manages the synchronization of the local // and the remote state. @@ -230,6 +230,22 @@ func New(c *config.RuntimeConfig) (*Agent, error) { return a, nil } +func LocalConfig(cfg *config.RuntimeConfig) local.Config { + lc := local.Config{ + AdvertiseAddr: cfg.AdvertiseAddrLAN.String(), + CheckUpdateInterval: cfg.CheckUpdateInterval, + Datacenter: cfg.Datacenter, + DiscardCheckOutput: cfg.DiscardCheckOutput, + NodeID: cfg.NodeID, + NodeName: cfg.NodeName, + TaggedAddresses: map[string]string{}, + } + for k, v := range cfg.TaggedAddresses { + lc.TaggedAddresses[k] = v + } + return lc +} + func (a *Agent) Start() error { c := a.config @@ -256,24 +272,12 @@ func (a *Agent) Start() error { triggerCh := make(chan struct{}, 1) // create the local state - lc := local.Config{ - AdvertiseAddr: c.AdvertiseAddrLAN.String(), - CheckUpdateInterval: c.CheckUpdateInterval, - Datacenter: c.Datacenter, - DiscardCheckOutput: c.DiscardCheckOutput, - NodeID: c.NodeID, - NodeName: c.NodeName, - TaggedAddresses: map[string]string{}, - } - for k, v := range c.TaggedAddresses { - lc.TaggedAddresses[k] = v - } - a.state = local.NewState(lc, a.logger, a.tokens, triggerCh) + a.State = local.NewState(LocalConfig(c), a.logger, a.tokens, triggerCh) // create the state synchronization manager which performs // regular and on-demand state synchronizations (anti-entropy). a.sync = &ae.StateSyncer{ - State: a.state, + State: a.State, Interval: c.AEInterval, ShutdownCh: a.shutdownCh, ServerUpCh: serverUpCh, @@ -306,7 +310,7 @@ func (a *Agent) Start() error { } a.delegate = server - a.state.SetDelegate(server) + a.State.SetDelegate(server) a.sync.ClusterSize = func() int { return len(server.LANMembers()) } } else { client, err := consul.NewClientLogger(consulCfg, a.logger) @@ -315,7 +319,7 @@ func (a *Agent) Start() error { } a.delegate = client - a.state.SetDelegate(client) + a.State.SetDelegate(client) a.sync.ClusterSize = func() int { return len(client.LANMembers()) } } @@ -1387,7 +1391,7 @@ OUTER: // reapServicesInternal does a single pass, looking for services to reap. func (a *Agent) reapServicesInternal() { reaped := make(map[string]bool) - for checkID, cs := range a.state.CriticalCheckStates() { + for checkID, cs := range a.State.CriticalCheckStates() { serviceID := cs.Check.ServiceID // There's nothing to do if there's no service. 
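// Taken together, the two patches above change the critical-check API from a
// snapshot type with a precomputed duration to live CheckState values. A rough
// before/after sketch of a consumer such as reapServicesInternal:
//
//	// before: CriticalChecks() returned map[types.CheckID]CriticalCheck
//	for _, cc := range a.state.CriticalChecks() {
//		if cc.CriticalFor > timeout { /* reap */ }
//	}
//
//	// after: CriticalCheckStates() returns map[types.CheckID]*CheckState
//	for _, cs := range a.State.CriticalCheckStates() {
//		if cs.CriticalFor() > timeout { /* reap */ }
//	}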
@@ -1445,7 +1449,7 @@ func (a *Agent) persistService(service *structs.NodeService) error { svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID)) wrapped := persistedService{ - Token: a.state.ServiceToken(service.ID), + Token: a.State.ServiceToken(service.ID), Service: service, } encoded, err := json.Marshal(wrapped) @@ -1473,7 +1477,7 @@ func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *structs.CheckT wrapped := persistedCheck{ Check: check, ChkType: chkType, - Token: a.state.CheckToken(check.CheckID), + Token: a.State.CheckToken(check.CheckID), } encoded, err := json.Marshal(wrapped) @@ -1572,7 +1576,7 @@ func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.Che defer a.restoreCheckState(snap) // Add the service - a.state.AddService(service, token) + a.State.AddService(service, token) // Persist the service to a file if persist && !a.config.DevMode { @@ -1622,7 +1626,7 @@ func (a *Agent) RemoveService(serviceID string, persist bool) error { } // Remove service immediately - if err := a.state.RemoveService(serviceID); err != nil { + if err := a.State.RemoveService(serviceID); err != nil { a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err) return nil } @@ -1635,7 +1639,7 @@ func (a *Agent) RemoveService(serviceID string, persist bool) error { } // Deregister any associated health checks - for checkID, check := range a.state.Checks() { + for checkID, check := range a.State.Checks() { if check.ServiceID != serviceID { continue } @@ -1668,7 +1672,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } if check.ServiceID != "" { - s := a.state.Services()[check.ServiceID] + s := a.State.Service(check.ServiceID) if s == nil { return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) } @@ -1689,7 +1693,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } ttl := &CheckTTL{ - Notify: a.state, + Notify: a.State, CheckID: check.CheckID, TTL: chkType.TTL, Logger: a.logger, @@ -1716,7 +1720,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } http := &CheckHTTP{ - Notify: a.state, + Notify: a.State, CheckID: check.CheckID, HTTP: chkType.HTTP, Header: chkType.Header, @@ -1741,7 +1745,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } tcp := &CheckTCP{ - Notify: a.state, + Notify: a.State, CheckID: check.CheckID, TCP: chkType.TCP, Interval: chkType.Interval, @@ -1778,7 +1782,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } dockerCheck := &CheckDocker{ - Notify: a.state, + Notify: a.State, CheckID: check.CheckID, DockerContainerID: chkType.DockerContainerID, Shell: chkType.Shell, @@ -1808,7 +1812,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } monitor := &CheckMonitor{ - Notify: a.state, + Notify: a.State, CheckID: check.CheckID, Script: chkType.Script, ScriptArgs: chkType.ScriptArgs, @@ -1837,7 +1841,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, } // Add to the local state for anti-entropy - err := a.state.AddCheck(check, token) + err := a.State.AddCheck(check, token) if err != nil { a.cancelCheckMonitors(check.CheckID) return err @@ -1860,7 +1864,7 @@ func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error { } // Add to the local state for anti-entropy - a.state.RemoveCheck(checkID) + a.State.RemoveCheck(checkID) a.checkLock.Lock() 
defer a.checkLock.Unlock() @@ -2025,7 +2029,7 @@ func (a *Agent) Stats() map[string]map[string]string { "check_monitors": strconv.Itoa(len(a.checkMonitors)), "check_ttls": strconv.Itoa(len(a.checkTTLs)), } - for k, v := range a.state.Stats() { + for k, v := range a.State.Stats() { stats["agent"][k] = v } @@ -2149,7 +2153,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error { } serviceID := p.Service.ID - if a.state.Service(serviceID) != nil { + if a.State.Service(serviceID) != nil { // Purge previously persisted service. This allows config to be // preferred over services persisted from the API. a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q", @@ -2172,7 +2176,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error { // unloadServices will deregister all services other than the 'consul' service // known to the local agent. func (a *Agent) unloadServices() error { - for id := range a.state.Services() { + for id := range a.State.Services() { if err := a.RemoveService(id, false); err != nil { return fmt.Errorf("Failed deregistering service '%s': %v", id, err) } @@ -2228,7 +2232,7 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig) error { } checkID := p.Check.CheckID - if a.state.Check(checkID) != nil { + if a.State.Check(checkID) != nil { // Purge previously persisted check. This allows config to be // preferred over persisted checks from the API. a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q", @@ -2259,7 +2263,7 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig) error { // unloadChecks will deregister all checks known to the local agent. func (a *Agent) unloadChecks() error { - for id := range a.state.Checks() { + for id := range a.State.Checks() { if err := a.RemoveCheck(id, false); err != nil { return fmt.Errorf("Failed deregistering check '%s': %s", id, err) } @@ -2271,7 +2275,7 @@ func (a *Agent) unloadChecks() error { // checks. This is done before we reload our checks, so that we can properly // restore into the same state. func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck { - return a.state.Checks() + return a.State.Checks() } // restoreCheckState is used to reset the health state based on a snapshot. @@ -2279,7 +2283,7 @@ func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck { // in health state and potential session invalidations. func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) { for id, check := range snap { - a.state.UpdateCheck(id, check.Status, check.Output) + a.State.UpdateCheck(id, check.Status, check.Output) } } @@ -2291,12 +2295,12 @@ func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error { meta[k] = v } meta[structs.MetaSegmentKey] = conf.SegmentName - return a.state.LoadMetadata(meta) + return a.State.LoadMetadata(meta) } // unloadMetadata resets the local metadata state func (a *Agent) unloadMetadata() { - a.state.UnloadMetadata() + a.State.UnloadMetadata() } // serviceMaintCheckID returns the ID of a given service's maintenance check @@ -2307,14 +2311,14 @@ func serviceMaintCheckID(serviceID string) types.CheckID { // EnableServiceMaintenance will register a false health check against the given // service ID with critical status. This will exclude the service from queries. 
func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error { - service, ok := a.state.Services()[serviceID] + service, ok := a.State.Services()[serviceID] if !ok { return fmt.Errorf("No service registered with ID %q", serviceID) } // Check if maintenance mode is not already enabled checkID := serviceMaintCheckID(serviceID) - if _, ok := a.state.Checks()[checkID]; ok { + if _, ok := a.State.Checks()[checkID]; ok { return nil } @@ -2342,13 +2346,13 @@ func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error // DisableServiceMaintenance will deregister the fake maintenance mode check // if the service has been marked as in maintenance. func (a *Agent) DisableServiceMaintenance(serviceID string) error { - if _, ok := a.state.Services()[serviceID]; !ok { + if _, ok := a.State.Services()[serviceID]; !ok { return fmt.Errorf("No service registered with ID %q", serviceID) } // Check if maintenance mode is enabled checkID := serviceMaintCheckID(serviceID) - if _, ok := a.state.Checks()[checkID]; !ok { + if _, ok := a.State.Checks()[checkID]; !ok { return nil } @@ -2362,7 +2366,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error { // EnableNodeMaintenance places a node into maintenance mode. func (a *Agent) EnableNodeMaintenance(reason, token string) { // Ensure node maintenance is not already enabled - if _, ok := a.state.Checks()[structs.NodeMaint]; ok { + if _, ok := a.State.Checks()[structs.NodeMaint]; ok { return } @@ -2385,7 +2389,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) { // DisableNodeMaintenance removes a node from maintenance mode func (a *Agent) DisableNodeMaintenance() { - if _, ok := a.state.Checks()[structs.NodeMaint]; !ok { + if _, ok := a.State.Checks()[structs.NodeMaint]; !ok { return } a.RemoveCheck(structs.NodeMaint, true) @@ -2429,7 +2433,7 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error { // Update filtered metrics metrics.UpdateFilter(newCfg.TelemetryAllowedPrefixes, newCfg.TelemetryBlockedPrefixes) - a.state.SetDiscardCheckOutput(newCfg.DiscardCheckOutput) + a.State.SetDiscardCheckOutput(newCfg.DiscardCheckOutput) return nil } diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 13dcfb062f..6f105b8b0e 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -72,7 +72,7 @@ func (s *HTTPServer) AgentSelf(resp http.ResponseWriter, req *http.Request) (int Coord: cs[s.agent.config.SegmentName], Member: s.agent.LocalMember(), Stats: s.agent.Stats(), - Meta: s.agent.state.Metadata(), + Meta: s.agent.State.Metadata(), }, nil } @@ -137,7 +137,7 @@ func (s *HTTPServer) AgentServices(resp http.ResponseWriter, req *http.Request) var token string s.parseToken(req, &token) - services := s.agent.state.Services() + services := s.agent.State.Services() if err := s.agent.filterServices(token, &services); err != nil { return nil, err } @@ -161,7 +161,7 @@ func (s *HTTPServer) AgentChecks(resp http.ResponseWriter, req *http.Request) (i var token string s.parseToken(req, &token) - checks := s.agent.state.Checks() + checks := s.agent.State.Checks() if err := s.agent.filterChecks(token, &checks); err != nil { return nil, err } @@ -304,7 +304,7 @@ func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request // services and checks to the server. If the operation fails, we only // only warn because the write did succeed and anti-entropy will sync later. 
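// A hypothetical caller shape (the concrete handlers live elsewhere in
// agent_endpoint.go and may differ in detail): an endpoint registers through
// the agent and then nudges the sync so the change reaches the servers without
// waiting for the next anti-entropy round.
//
//	func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
//		// ... decode the request, call s.agent.AddService(...) ...
//		s.syncChanges()
//		return nil, nil
//	}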
func (s *HTTPServer) syncChanges() { - if err := s.agent.state.SyncChanges(); err != nil { + if err := s.agent.State.SyncChanges(); err != nil { s.agent.logger.Printf("[ERR] agent: failed to sync changes: %v", err) } } diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 7068eab2df..d735d3b51a 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -51,7 +51,7 @@ func TestAgent_Services(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv1, "") + a.State.AddService(srv1, "") req, _ := http.NewRequest("GET", "/v1/agent/services", nil) obj, err := a.srv.AgentServices(nil, req) @@ -78,7 +78,7 @@ func TestAgent_Services_ACLFilter(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv1, "") + a.State.AddService(srv1, "") t.Run("no token", func(t *testing.T) { req, _ := http.NewRequest("GET", "/v1/agent/services", nil) @@ -116,7 +116,7 @@ func TestAgent_Checks(t *testing.T) { Name: "mysql", Status: api.HealthPassing, } - a.state.AddCheck(chk1, "") + a.State.AddCheck(chk1, "") req, _ := http.NewRequest("GET", "/v1/agent/checks", nil) obj, err := a.srv.AgentChecks(nil, req) @@ -143,7 +143,7 @@ func TestAgent_Checks_ACLFilter(t *testing.T) { Name: "mysql", Status: api.HealthPassing, } - a.state.AddCheck(chk1, "") + a.State.AddCheck(chk1, "") t.Run("no token", func(t *testing.T) { req, _ := http.NewRequest("GET", "/v1/agent/checks", nil) @@ -283,8 +283,8 @@ func TestAgent_Reload(t *testing.T) { `) defer a.Shutdown() - if _, ok := a.state.services["redis"]; !ok { - t.Fatalf("missing redis service") + if a.State.Service("redis") == nil { + t.Fatal("missing redis service") } cfg2 := TestConfig(config.Source{ @@ -307,8 +307,8 @@ func TestAgent_Reload(t *testing.T) { if err := a.ReloadConfig(cfg2); err != nil { t.Fatalf("got error %v want nil", err) } - if _, ok := a.state.services["redis-reloaded"]; !ok { - t.Fatalf("missing redis-reloaded service") + if a.State.Service("redis-reloaded") == nil { + t.Fatal("missing redis-reloaded service") } for _, wp := range a.watchPlans { @@ -682,7 +682,7 @@ func TestAgent_RegisterCheck(t *testing.T) { // Ensure we have a check mapping checkID := types.CheckID("test") - if _, ok := a.state.Checks()[checkID]; !ok { + if _, ok := a.State.Checks()[checkID]; !ok { t.Fatalf("missing test check") } @@ -691,12 +691,12 @@ func TestAgent_RegisterCheck(t *testing.T) { } // Ensure the token was configured - if token := a.state.CheckToken(checkID); token == "" { + if token := a.State.CheckToken(checkID); token == "" { t.Fatalf("missing token") } // By default, checks start in critical state. 
- state := a.state.Checks()[checkID] + state := a.State.Checks()[checkID] if state.Status != api.HealthCritical { t.Fatalf("bad: %v", state) } @@ -817,7 +817,7 @@ func TestAgent_RegisterCheck_Passing(t *testing.T) { // Ensure we have a check mapping checkID := types.CheckID("test") - if _, ok := a.state.Checks()[checkID]; !ok { + if _, ok := a.State.Checks()[checkID]; !ok { t.Fatalf("missing test check") } @@ -825,7 +825,7 @@ func TestAgent_RegisterCheck_Passing(t *testing.T) { t.Fatalf("missing test check ttl") } - state := a.state.Checks()[checkID] + state := a.State.Checks()[checkID] if state.Status != api.HealthPassing { t.Fatalf("bad: %v", state) } @@ -896,7 +896,7 @@ func TestAgent_DeregisterCheck(t *testing.T) { } // Ensure we have a check mapping - if _, ok := a.state.Checks()["test"]; ok { + if _, ok := a.State.Checks()["test"]; ok { t.Fatalf("have test check") } } @@ -947,7 +947,7 @@ func TestAgent_PassCheck(t *testing.T) { } // Ensure we have a check mapping - state := a.state.Checks()["test"] + state := a.State.Checks()["test"] if state.Status != api.HealthPassing { t.Fatalf("bad: %v", state) } @@ -1000,7 +1000,7 @@ func TestAgent_WarnCheck(t *testing.T) { } // Ensure we have a check mapping - state := a.state.Checks()["test"] + state := a.State.Checks()["test"] if state.Status != api.HealthWarning { t.Fatalf("bad: %v", state) } @@ -1053,7 +1053,7 @@ func TestAgent_FailCheck(t *testing.T) { } // Ensure we have a check mapping - state := a.state.Checks()["test"] + state := a.State.Checks()["test"] if state.Status != api.HealthCritical { t.Fatalf("bad: %v", state) } @@ -1117,7 +1117,7 @@ func TestAgent_UpdateCheck(t *testing.T) { t.Fatalf("expected 200, got %d", resp.Code) } - state := a.state.Checks()["test"] + state := a.State.Checks()["test"] if state.Status != c.Status || state.Output != c.Output { t.Fatalf("bad: %v", state) } @@ -1145,7 +1145,7 @@ func TestAgent_UpdateCheck(t *testing.T) { // Since we append some notes about truncating, we just do a // rough check that the output buffer was cut down so this test // isn't super brittle. 
- state := a.state.Checks()["test"] + state := a.State.Checks()["test"] if state.Status != api.HealthPassing || len(state.Output) > 2*CheckBufSize { t.Fatalf("bad: %v", state) } @@ -1228,12 +1228,12 @@ func TestAgent_RegisterService(t *testing.T) { } // Ensure the servie - if _, ok := a.state.Services()["test"]; !ok { + if _, ok := a.State.Services()["test"]; !ok { t.Fatalf("missing test service") } // Ensure we have a check mapping - checks := a.state.Checks() + checks := a.State.Checks() if len(checks) != 3 { t.Fatalf("bad: %v", checks) } @@ -1243,7 +1243,7 @@ func TestAgent_RegisterService(t *testing.T) { } // Ensure the token was configured - if token := a.state.ServiceToken("test"); token == "" { + if token := a.State.ServiceToken("test"); token == "" { t.Fatalf("missing token") } } @@ -1364,11 +1364,11 @@ func TestAgent_DeregisterService(t *testing.T) { } // Ensure we have a check mapping - if _, ok := a.state.Services()["test"]; ok { + if _, ok := a.State.Services()["test"]; ok { t.Fatalf("have test service") } - if _, ok := a.state.Checks()["test"]; ok { + if _, ok := a.State.Checks()["test"]; ok { t.Fatalf("have test check") } } @@ -1466,13 +1466,13 @@ func TestAgent_ServiceMaintenance_Enable(t *testing.T) { // Ensure the maintenance check was registered checkID := serviceMaintCheckID("test") - check, ok := a.state.Checks()[checkID] + check, ok := a.State.Checks()[checkID] if !ok { t.Fatalf("should have registered maintenance check") } // Ensure the token was added - if token := a.state.CheckToken(checkID); token != "mytoken" { + if token := a.State.CheckToken(checkID); token != "mytoken" { t.Fatalf("expected 'mytoken', got '%s'", token) } @@ -1513,7 +1513,7 @@ func TestAgent_ServiceMaintenance_Disable(t *testing.T) { // Ensure the maintenance check was removed checkID := serviceMaintCheckID("test") - if _, ok := a.state.Checks()[checkID]; ok { + if _, ok := a.State.Checks()[checkID]; ok { t.Fatalf("should have removed maintenance check") } } @@ -1579,13 +1579,13 @@ func TestAgent_NodeMaintenance_Enable(t *testing.T) { } // Ensure the maintenance check was registered - check, ok := a.state.Checks()[structs.NodeMaint] + check, ok := a.State.Checks()[structs.NodeMaint] if !ok { t.Fatalf("should have registered maintenance check") } // Check that the token was used - if token := a.state.CheckToken(structs.NodeMaint); token != "mytoken" { + if token := a.State.CheckToken(structs.NodeMaint); token != "mytoken" { t.Fatalf("expected 'mytoken', got '%s'", token) } @@ -1614,7 +1614,7 @@ func TestAgent_NodeMaintenance_Disable(t *testing.T) { } // Ensure the maintenance check was removed - if _, ok := a.state.Checks()[structs.NodeMaint]; ok { + if _, ok := a.State.Checks()[structs.NodeMaint]; ok { t.Fatalf("should have removed maintenance check") } } @@ -1670,7 +1670,7 @@ func TestAgent_RegisterCheck_Service(t *testing.T) { } // Ensure we have a check mapping - result := a.state.Checks() + result := a.State.Checks() if _, ok := result["service:memcache"]; !ok { t.Fatalf("missing memcached check") } diff --git a/agent/agent_test.go b/agent/agent_test.go index 19acf63d6d..ca6665a8bf 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -363,14 +363,14 @@ func TestAgent_AddService(t *testing.T) { t.Fatalf("err: %v", err) } - got, want := a.state.Services()[tt.srv.ID], tt.srv + got, want := a.State.Services()[tt.srv.ID], tt.srv verify.Values(t, "", got, want) }) // check the health checks for k, v := range tt.healthChks { t.Run(k, func(t *testing.T) { - got, want := 
a.state.Checks()[types.CheckID(k)], v + got, want := a.State.Checks()[types.CheckID(k)], v verify.Values(t, k, got, want) }) } @@ -437,10 +437,10 @@ func TestAgent_RemoveService(t *testing.T) { if err := a.RemoveService("memcache", false); err != nil { t.Fatalf("err: %s", err) } - if _, ok := a.state.Checks()["service:memcache"]; ok { + if _, ok := a.State.Checks()["service:memcache"]; ok { t.Fatalf("have memcache check") } - if _, ok := a.state.Checks()["check2"]; ok { + if _, ok := a.State.Checks()["check2"]; ok { t.Fatalf("have check2 check") } } @@ -466,15 +466,15 @@ func TestAgent_RemoveService(t *testing.T) { } // Ensure we have a state mapping - if _, ok := a.state.Services()["redis"]; ok { + if _, ok := a.State.Services()["redis"]; ok { t.Fatalf("have redis service") } // Ensure checks were removed - if _, ok := a.state.Checks()["service:redis:1"]; ok { + if _, ok := a.State.Checks()["service:redis:1"]; ok { t.Fatalf("check redis:1 should be removed") } - if _, ok := a.state.Checks()["service:redis:2"]; ok { + if _, ok := a.State.Checks()["service:redis:2"]; ok { t.Fatalf("check redis:2 should be removed") } @@ -507,7 +507,7 @@ func TestAgent_RemoveServiceRemovesAllChecks(t *testing.T) { } // verify chk1 exists - if a.state.Checks()["chk1"] == nil { + if a.State.Checks()["chk1"] == nil { t.Fatal("Could not find health check chk1") } @@ -517,10 +517,10 @@ func TestAgent_RemoveServiceRemovesAllChecks(t *testing.T) { } // check that both checks are there - if got, want := a.state.Checks()["chk1"], hchk1; !verify.Values(t, "", got, want) { + if got, want := a.State.Checks()["chk1"], hchk1; !verify.Values(t, "", got, want) { t.FailNow() } - if got, want := a.state.Checks()["chk2"], hchk2; !verify.Values(t, "", got, want) { + if got, want := a.State.Checks()["chk2"], hchk2; !verify.Values(t, "", got, want) { t.FailNow() } @@ -530,10 +530,10 @@ func TestAgent_RemoveServiceRemovesAllChecks(t *testing.T) { } // Check that both checks are gone - if a.state.Checks()["chk1"] != nil { + if a.State.Checks()["chk1"] != nil { t.Fatal("Found health check chk1 want nil") } - if a.state.Checks()["chk2"] != nil { + if a.State.Checks()["chk2"] != nil { t.Fatal("Found health check chk2 want nil") } } @@ -561,7 +561,7 @@ func TestAgent_AddCheck(t *testing.T) { } // Ensure we have a check mapping - sChk, ok := a.state.Checks()["mem"] + sChk, ok := a.State.Checks()["mem"] if !ok { t.Fatalf("missing mem check") } @@ -600,7 +600,7 @@ func TestAgent_AddCheck_StartPassing(t *testing.T) { } // Ensure we have a check mapping - sChk, ok := a.state.Checks()["mem"] + sChk, ok := a.State.Checks()["mem"] if !ok { t.Fatalf("missing mem check") } @@ -639,7 +639,7 @@ func TestAgent_AddCheck_MinInterval(t *testing.T) { } // Ensure we have a check mapping - if _, ok := a.state.Checks()["mem"]; !ok { + if _, ok := a.State.Checks()["mem"]; !ok { t.Fatalf("missing mem check") } @@ -704,7 +704,7 @@ func TestAgent_AddCheck_RestoreState(t *testing.T) { } // Ensure the check status was restored during registration - checks := a.state.Checks() + checks := a.State.Checks() check, ok := checks["baz"] if !ok { t.Fatalf("missing check") @@ -739,7 +739,7 @@ func TestAgent_AddCheck_ExecDisable(t *testing.T) { } // Ensure we don't have a check mapping - if memChk := a.state.Checks()["mem"]; memChk != nil { + if memChk := a.State.Checks()["mem"]; memChk != nil { t.Fatalf("should be missing mem check") } } @@ -782,7 +782,7 @@ func TestAgent_RemoveCheck(t *testing.T) { } // Ensure we have a check mapping - if _, ok := 
a.state.Checks()["mem"]; ok { + if _, ok := a.State.Checks()["mem"]; ok { t.Fatalf("have mem check") } @@ -817,7 +817,7 @@ func TestAgent_updateTTLCheck(t *testing.T) { } // Ensure we have a check mapping. - status := a.state.Checks()["mem"] + status := a.State.Checks()["mem"] if status.Status != api.HealthPassing { t.Fatalf("bad: %v", status) } @@ -904,15 +904,15 @@ func TestAgent_PersistService(t *testing.T) { a2.Start() defer a2.Shutdown() - restored, ok := a2.state.services[svc.ID] - if !ok { - t.Fatalf("bad: %#v", a2.state.services) + restored := a2.State.ServiceState(svc.ID) + if restored == nil { + t.Fatalf("service %q missing", svc.ID) } - if a2.state.serviceTokens[svc.ID] != "mytoken" { - t.Fatalf("bad: %#v", a2.state.services[svc.ID]) + if got, want := restored.Token, "mytoken"; got != want { + t.Fatalf("got token %q want %q", got, want) } - if restored.Port != 8001 { - t.Fatalf("bad: %#v", restored) + if got, want := restored.Service.Port, 8081; got != want { + t.Fatalf("got port %d want %d", got, want) } } @@ -951,7 +951,7 @@ func TestAgent_persistedService_compat(t *testing.T) { } // Ensure the service was restored - services := a.state.Services() + services := a.State.Services() result, ok := services["redis"] if !ok { t.Fatalf("missing service") @@ -1043,8 +1043,8 @@ func TestAgent_PurgeServiceOnDuplicate(t *testing.T) { if _, err := os.Stat(file); err == nil { t.Fatalf("should have removed persisted service") } - result, ok := a2.state.services["redis"] - if !ok { + result := a2.State.Service("redis") + if result == nil { t.Fatalf("missing service registration") } if !reflect.DeepEqual(result.Tags, []string{"bar"}) || result.Port != 9000 { @@ -1137,9 +1137,9 @@ func TestAgent_PersistCheck(t *testing.T) { a2.Start() defer a2.Shutdown() - result, ok := a2.state.checks[check.CheckID] - if !ok { - t.Fatalf("bad: %#v", a2.state.checks) + result := a2.State.Check(check.CheckID) + if result == nil { + t.Fatalf("bad: %#v", a2.State.Checks()) } if result.Status != api.HealthCritical { t.Fatalf("bad: %#v", result) @@ -1152,8 +1152,8 @@ func TestAgent_PersistCheck(t *testing.T) { if _, ok := a2.checkMonitors[check.CheckID]; !ok { t.Fatalf("bad: %#v", a2.checkMonitors) } - if a2.state.checkTokens[check.CheckID] != "mytoken" { - t.Fatalf("bad: %s", a2.state.checkTokens[check.CheckID]) + if a2.State.CheckState(check.CheckID).Token != "mytoken" { + t.Fatalf("bad: %s", a2.State.CheckState(check.CheckID).Token) } } @@ -1241,8 +1241,8 @@ func TestAgent_PurgeCheckOnDuplicate(t *testing.T) { if _, err := os.Stat(file); err == nil { t.Fatalf("should have removed persisted check") } - result, ok := a2.state.checks["mem"] - if !ok { + result := a2.State.Check("mem") + if result == nil { t.Fatalf("missing check registration") } expected := &structs.HealthCheck{ @@ -1269,11 +1269,11 @@ func TestAgent_loadChecks_token(t *testing.T) { `) defer a.Shutdown() - checks := a.state.Checks() + checks := a.State.Checks() if _, ok := checks["rabbitmq"]; !ok { t.Fatalf("missing check") } - if token := a.state.CheckToken("rabbitmq"); token != "abc123" { + if token := a.State.CheckToken("rabbitmq"); token != "abc123" { t.Fatalf("bad: %s", token) } } @@ -1307,7 +1307,7 @@ func TestAgent_unloadChecks(t *testing.T) { t.Fatalf("err: %s", err) } found := false - for check := range a.state.Checks() { + for check := range a.State.Checks() { if check == check1.CheckID { found = true break @@ -1323,7 +1323,7 @@ func TestAgent_unloadChecks(t *testing.T) { } // Make sure it was unloaded - for check := range 
a.state.Checks() { + for check := range a.State.Checks() { if check == check1.CheckID { t.Fatalf("should have unloaded checks") } @@ -1342,11 +1342,11 @@ func TestAgent_loadServices_token(t *testing.T) { `) defer a.Shutdown() - services := a.state.Services() + services := a.State.Services() if _, ok := services["rabbitmq"]; !ok { t.Fatalf("missing service") } - if token := a.state.ServiceToken("rabbitmq"); token != "abc123" { + if token := a.State.ServiceToken("rabbitmq"); token != "abc123" { t.Fatalf("bad: %s", token) } } @@ -1368,7 +1368,7 @@ func TestAgent_unloadServices(t *testing.T) { t.Fatalf("err: %v", err) } found := false - for id := range a.state.Services() { + for id := range a.State.Services() { if id == svc.ID { found = true break @@ -1382,7 +1382,7 @@ func TestAgent_unloadServices(t *testing.T) { if err := a.unloadServices(); err != nil { t.Fatalf("err: %s", err) } - if len(a.state.Services()) != 0 { + if len(a.State.Services()) != 0 { t.Fatalf("should have unloaded services") } } @@ -1411,13 +1411,13 @@ func TestAgent_Service_MaintenanceMode(t *testing.T) { // Make sure the critical health check was added checkID := serviceMaintCheckID("redis") - check, ok := a.state.Checks()[checkID] + check, ok := a.State.Checks()[checkID] if !ok { t.Fatalf("should have registered critical maintenance check") } // Check that the token was used to register the check - if token := a.state.CheckToken(checkID); token != "mytoken" { + if token := a.State.CheckToken(checkID); token != "mytoken" { t.Fatalf("expected 'mytoken', got: '%s'", token) } @@ -1432,7 +1432,7 @@ func TestAgent_Service_MaintenanceMode(t *testing.T) { } // Ensure the check was deregistered - if _, ok := a.state.Checks()[checkID]; ok { + if _, ok := a.State.Checks()[checkID]; ok { t.Fatalf("should have deregistered maintenance check") } @@ -1442,7 +1442,7 @@ func TestAgent_Service_MaintenanceMode(t *testing.T) { } // Ensure the check was registered with the default notes - check, ok = a.state.Checks()[checkID] + check, ok = a.State.Checks()[checkID] if !ok { t.Fatalf("should have registered critical check") } @@ -1479,19 +1479,19 @@ func TestAgent_Service_Reap(t *testing.T) { } // Make sure it's there and there's no critical check yet. - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) > 0 { + if checks := a.State.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have critical checks") } // Wait for the check TTL to fail but before the check is reaped. time.Sleep(100 * time.Millisecond) - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) != 1 { + if checks := a.State.CriticalCheckStates(); len(checks) != 1 { t.Fatalf("should have a critical check") } @@ -1499,28 +1499,28 @@ func TestAgent_Service_Reap(t *testing.T) { if err := a.updateTTLCheck("service:redis", api.HealthPassing, "foo"); err != nil { t.Fatalf("err: %v", err) } - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) > 0 { + if checks := a.State.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have critical checks") } // Wait for the check TTL to fail again. 
time.Sleep(100 * time.Millisecond) - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) != 1 { + if checks := a.State.CriticalCheckStates(); len(checks) != 1 { t.Fatalf("should have a critical check") } // Wait for the reap. time.Sleep(400 * time.Millisecond) - if _, ok := a.state.Services()["redis"]; ok { + if _, ok := a.State.Services()["redis"]; ok { t.Fatalf("redis service should have been reaped") } - if checks := a.state.CriticalChecks(); len(checks) > 0 { + if checks := a.State.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have critical checks") } } @@ -1552,28 +1552,28 @@ func TestAgent_Service_NoReap(t *testing.T) { } // Make sure it's there and there's no critical check yet. - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) > 0 { + if checks := a.State.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have critical checks") } // Wait for the check TTL to fail. time.Sleep(200 * time.Millisecond) - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) != 1 { + if checks := a.State.CriticalCheckStates(); len(checks) != 1 { t.Fatalf("should have a critical check") } // Wait a while and make sure it doesn't reap. time.Sleep(200 * time.Millisecond) - if _, ok := a.state.Services()["redis"]; !ok { + if _, ok := a.State.Services()["redis"]; !ok { t.Fatalf("should have redis service") } - if checks := a.state.CriticalChecks(); len(checks) != 1 { + if checks := a.State.CriticalCheckStates(); len(checks) != 1 { t.Fatalf("should have a critical check") } } @@ -1612,7 +1612,7 @@ func TestAgent_addCheck_restoresSnapshot(t *testing.T) { if err := a.AddService(svc, chkTypes, false, ""); err != nil { t.Fatalf("err: %s", err) } - check, ok := a.state.Checks()["service:redis"] + check, ok := a.State.Checks()["service:redis"] if !ok { t.Fatalf("missing check") } @@ -1630,13 +1630,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) { a.EnableNodeMaintenance("broken", "mytoken") // Make sure the critical health check was added - check, ok := a.state.Checks()[structs.NodeMaint] + check, ok := a.State.Checks()[structs.NodeMaint] if !ok { t.Fatalf("should have registered critical node check") } // Check that the token was used to register the check - if token := a.state.CheckToken(structs.NodeMaint); token != "mytoken" { + if token := a.State.CheckToken(structs.NodeMaint); token != "mytoken" { t.Fatalf("expected 'mytoken', got: '%s'", token) } @@ -1649,7 +1649,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) { a.DisableNodeMaintenance() // Ensure the check was deregistered - if _, ok := a.state.Checks()[structs.NodeMaint]; ok { + if _, ok := a.State.Checks()[structs.NodeMaint]; ok { t.Fatalf("should have deregistered critical node check") } @@ -1657,7 +1657,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) { a.EnableNodeMaintenance("", "") // Make sure the check was registered with the default note - check, ok = a.state.Checks()[structs.NodeMaint] + check, ok = a.State.Checks()[structs.NodeMaint] if !ok { t.Fatalf("should have registered critical node check") } @@ -1712,7 +1712,7 @@ func TestAgent_checkStateSnapshot(t *testing.T) { a.restoreCheckState(snap) 
// Search for the check - out, ok := a.state.Checks()[check1.CheckID] + out, ok := a.State.Checks()[check1.CheckID] if !ok { t.Fatalf("check should have been registered") } diff --git a/agent/catalog_endpoint_test.go b/agent/catalog_endpoint_test.go index 048800d44f..566ba62013 100644 --- a/agent/catalog_endpoint_test.go +++ b/agent/catalog_endpoint_test.go @@ -33,22 +33,32 @@ func TestCatalogRegister(t *testing.T) { t.Fatalf("bad: %v", res) } - // data race - func() { - a.state.Lock() - defer a.state.Unlock() + // todo(fs): data race + // func() { + // a.State.Lock() + // defer a.State.Unlock() - // Service should be in sync - if err := a.state.syncService("foo"); err != nil { - t.Fatalf("err: %s", err) - } - if _, ok := a.state.serviceStatus["foo"]; !ok { - t.Fatalf("bad: %#v", a.state.serviceStatus) - } - if !a.state.serviceStatus["foo"].inSync { - t.Fatalf("should be in sync") - } - }() + // // Service should be in sync + // if err := a.State.syncService("foo"); err != nil { + // t.Fatalf("err: %s", err) + // } + // if _, ok := a.State.serviceStatus["foo"]; !ok { + // t.Fatalf("bad: %#v", a.State.serviceStatus) + // } + // if !a.State.serviceStatus["foo"].inSync { + // t.Fatalf("should be in sync") + // } + // }() + if err := a.State.SyncChanges(); err != nil { + t.Fatal("sync failed: ", err) + } + s := a.State.ServiceState("foo") + if s == nil { + t.Fatal("service 'foo' missing") + } + if !s.InSync { + t.Fatalf("service 'foo' should be in sync") + } } func TestCatalogRegister_Service_InvalidAddress(t *testing.T) { diff --git a/agent/local/state.go b/agent/local/state.go index b2fdb833f4..409e3fc175 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -51,6 +51,14 @@ type ServiceState struct { Deleted bool } +// Clone returns a shallow copy of the object. The service record still +// points to the original service record and must not be modified. +func (s *ServiceState) Clone() *ServiceState { + s2 := new(ServiceState) + *s2 = *s + return s2 +} + // CheckState describes the state of a health check record. type CheckState struct { // Check is the local copy of the health check record. @@ -79,6 +87,15 @@ type CheckState struct { Deleted bool } +// Clone returns a shallow copy of the object. The check record and the +// defer timer still point to the original values and must not be +// modified. +func (c *CheckState) Clone() *CheckState { + c2 := new(CheckState) + *c2 = *c + return c2 +} + // Critical returns true when the health check is in critical state. func (c *CheckState) Critical() bool { return !c.CriticalTime.IsZero() @@ -189,9 +206,6 @@ func (l *State) serviceToken(id string) string { // ensure it is registered // todo(fs): where is the persistence happening? func (l *State) AddService(service *structs.NodeService, token string) error { - l.Lock() - defer l.Unlock() - if service == nil { return fmt.Errorf("no service") } @@ -202,15 +216,21 @@ func (l *State) AddService(service *structs.NodeService, token string) error { service.ID = service.Service } - l.services[service.ID] = &ServiceState{ + l.AddServiceState(&ServiceState{ Service: service, Token: token, - } - l.changeMade() - + }) return nil } +func (l *State) AddServiceState(s *ServiceState) { + l.Lock() + defer l.Unlock() + + l.services[s.Service.ID] = s + l.changeMade() +} + // RemoveService is used to remove a service entry from the local state. // The agent will make a best effort to ensure it is deregistered. 
func (l *State) RemoveService(id string) error { @@ -261,6 +281,37 @@ func (l *State) Services() map[string]*structs.NodeService { return m } +// ServiceState returns a shallow copy of the current service state +// record. The service record still points to the original service +// record and must not be modified. +func (l *State) ServiceState(id string) *ServiceState { + l.RLock() + defer l.RUnlock() + + s := l.services[id] + if s == nil || s.Deleted { + return nil + } + return s.Clone() +} + +// ServiceStates returns a shallow copy of all service state records. +// The service record still points to the original service record and +// must not be modified. +func (l *State) ServiceStates() map[string]*ServiceState { + l.RLock() + defer l.RUnlock() + + m := make(map[string]*ServiceState) + for id, s := range l.services { + if s.Deleted { + continue + } + m[id] = s.Clone() + } + return m +} + // CheckToken is used to return the configured health check token for a // Check, or if none is configured, the default agent ACL token. func (l *State) CheckToken(checkID types.CheckID) string { @@ -286,9 +337,6 @@ func (l *State) checkToken(id types.CheckID) string { // This entry is persistent and the agent will make a best effort to // ensure it is registered func (l *State) AddCheck(check *structs.HealthCheck, token string) error { - l.Lock() - defer l.Unlock() - if check == nil { return fmt.Errorf("no check") } @@ -306,15 +354,21 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { // hard-set the node name check.Node = l.config.NodeName - l.checks[check.CheckID] = &CheckState{ + l.AddCheckState(&CheckState{ Check: check, Token: token, - } - l.changeMade() - + }) return nil } +func (l *State) AddCheckState(c *CheckState) { + l.Lock() + defer l.Unlock() + + l.checks[c.Check.CheckID] = c + l.changeMade() +} + // RemoveCheck is used to remove a health check from the local state. // The agent will make a best effort to ensure it is deregistered // todo(fs): RemoveService returns an error for a non-existant service. RemoveCheck should as well. @@ -418,17 +472,40 @@ func (l *State) Check(id types.CheckID) *structs.HealthCheck { // Checks returns the locally registered checks that the // agent is aware of and are being kept in sync with the server func (l *State) Checks() map[types.CheckID]*structs.HealthCheck { + m := make(map[types.CheckID]*structs.HealthCheck) + for id, c := range l.CheckStates() { + m[id] = c.Check + } + return m +} + +// CheckState returns a shallow copy of the current health check state +// record. The health check record and the deferred check still point to +// the original values and must not be modified. +func (l *State) CheckState(id types.CheckID) *CheckState { l.RLock() defer l.RUnlock() - m := make(map[types.CheckID]*structs.HealthCheck) + c := l.checks[id] + if c == nil || c.Deleted { + return nil + } + return c.Clone() +} + +// CheckStates returns a shallow copy of all health check state records. +// The health check records and the deferred checks still point to +// the original values and must not be modified. 
+func (l *State) CheckStates() map[types.CheckID]*CheckState { + l.RLock() + defer l.RUnlock() + + m := make(map[types.CheckID]*CheckState) for id, c := range l.checks { if c.Deleted { continue } - c2 := new(structs.HealthCheck) - *c2 = *c.Check - m[id] = c2 + m[id] = c.Clone() } return m } @@ -444,7 +521,7 @@ func (l *State) CriticalCheckStates() map[types.CheckID]*CheckState { if c.Deleted || !c.Critical() { continue } - m[id] = c + m[id] = c.Clone() } return m } diff --git a/agent/local/state_test.go b/agent/local/state_test.go index 2e27c4475a..844006fde3 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1,11 +1,14 @@ -package local +package local_test import ( + "fmt" "reflect" "testing" "time" "github.com/hashicorp/consul/agent/config" + "github.com/hashicorp/consul/agent" + "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/api" @@ -16,7 +19,7 @@ import ( func TestAgentAntiEntropy_Services(t *testing.T) { t.Parallel() - a := &TestAgent{Name: t.Name(), NoInitialSync: true} + a := &agent.TestAgent{Name: t.Name(), NoInitialSync: true} a.Start() defer a.Shutdown() @@ -35,7 +38,7 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv1, "") + a.State.AddService(srv1, "") args.Service = srv1 if err := a.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) @@ -48,7 +51,7 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Tags: []string{}, Port: 8000, } - a.state.AddService(srv2, "") + a.State.AddService(srv2, "") srv2_mod := new(structs.NodeService) *srv2_mod = *srv2 @@ -65,7 +68,7 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Tags: []string{}, Port: 80, } - a.state.AddService(srv3, "") + a.State.AddService(srv3, "") // Exists remote (delete) srv4 := &structs.NodeService{ @@ -87,7 +90,7 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Address: "127.0.0.10", Port: 8000, } - a.state.AddService(srv5, "") + a.State.AddService(srv5, "") srv5_mod := new(structs.NodeService) *srv5_mod = *srv5 @@ -104,12 +107,10 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Tags: []string{}, Port: 11211, } - a.state.AddService(srv6, "") - - // todo(fs): data race - a.state.Lock() - a.state.serviceStatus["cache"] = syncStatus{inSync: true} - a.state.Unlock() + a.State.AddServiceState(&local.ServiceState{ + Service: srv6, + InSync: true, + }) // Trigger anti-entropy run and wait a.StartSync() @@ -170,26 +171,13 @@ func TestAgentAntiEntropy_Services(t *testing.T) { } } - // todo(fs): data race - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.services) != 5 { - r.Fatalf("bad: %v", a.state.services) - } - if len(a.state.serviceStatus) != 5 { - r.Fatalf("bad: %v", a.state.serviceStatus) - } - for name, status := range a.state.serviceStatus { - if !status.inSync { - r.Fatalf("should be in sync: %v %v", name, status) - } + if err := servicesInSync(a.State, 5); err != nil { + r.Fatal(err) } }) // Remove one of the services - a.state.RemoveService("api") + a.State.RemoveService("api") // Trigger anti-entropy run and wait a.StartSync() @@ -231,28 +219,15 @@ func TestAgentAntiEntropy_Services(t *testing.T) { } } - // todo(fs): data race - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.services) != 4 { - r.Fatalf("bad: %v", a.state.services) - } - if len(a.state.serviceStatus) != 4 { - r.Fatalf("bad: %v", 
a.state.serviceStatus) - } - for name, status := range a.state.serviceStatus { - if !status.inSync { - r.Fatalf("should be in sync: %v %v", name, status) - } + if err := servicesInSync(a.State, 4); err != nil { + r.Fatal(err) } }) } func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { t.Parallel() - a := &TestAgent{Name: t.Name(), NoInitialSync: true} + a := &agent.TestAgent{Name: t.Name(), NoInitialSync: true} a.Start() defer a.Shutdown() @@ -271,7 +246,7 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { Port: 6100, EnableTagOverride: true, } - a.state.AddService(srv1, "") + a.State.AddService(srv1, "") srv1_mod := new(structs.NodeService) *srv1_mod = *srv1 srv1_mod.Port = 7100 @@ -289,7 +264,7 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { Port: 6200, EnableTagOverride: false, } - a.state.AddService(srv2, "") + a.State.AddService(srv2, "") srv2_mod := new(structs.NodeService) *srv2_mod = *srv2 srv2_mod.Port = 7200 @@ -314,8 +289,8 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { r.Fatalf("err: %v", err) } - a.state.RLock() - defer a.state.RUnlock() + a.State.RLock() + defer a.State.RUnlock() // All the services should match for id, serv := range services.NodeServices.Services { @@ -342,21 +317,15 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { } } - // todo(fs): data race - a.state.RLock() - defer a.state.RUnlock() - - for name, status := range a.state.serviceStatus { - if !status.inSync { - r.Fatalf("should be in sync: %v %v", name, status) - } + if err := servicesInSync(a.State, 2); err != nil { + r.Fatal(err) } }) } func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { t.Parallel() - a := NewTestAgent(t.Name(), "") + a := agent.NewTestAgent(t.Name(), "") defer a.Shutdown() { @@ -367,7 +336,7 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv, "") + a.State.AddService(srv, "") chk := &structs.HealthCheck{ Node: a.Config.NodeName, @@ -376,18 +345,22 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { ServiceID: "mysql", Status: api.HealthPassing, } - a.state.AddCheck(chk, "") + a.State.AddCheck(chk, "") // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() + // func() { + // a.State.RLock() + // defer a.State.RUnlock() - // Sync the service once - if err := a.state.syncService("mysql"); err != nil { - t.Fatalf("err: %s", err) - } - }() + // // Sync the service once + // if err := a.State.syncService("mysql"); err != nil { + // t.Fatalf("err: %s", err) + // } + // }() + // todo(fs): is this correct? 
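+	// SyncChanges pushes every local service and check whose sync status is
+	// out of date; the mysql service and its check registered above are both
+	// new, so this call registers them with the servers.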
+ if err := a.State.SyncChanges(); err != nil { + t.Fatal("sync failed: ", err) + } // We should have 2 services (consul included) svcReq := structs.NodeSpecificRequest{ @@ -424,7 +397,7 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv, "") + a.State.AddService(srv, "") chk1 := &structs.HealthCheck{ Node: a.Config.NodeName, @@ -433,7 +406,7 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { ServiceID: "redis", Status: api.HealthPassing, } - a.state.AddCheck(chk1, "") + a.State.AddCheck(chk1, "") chk2 := &structs.HealthCheck{ Node: a.Config.NodeName, @@ -442,18 +415,22 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { ServiceID: "redis", Status: api.HealthPassing, } - a.state.AddCheck(chk2, "") + a.State.AddCheck(chk2, "") // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() + // func() { + // a.State.RLock() + // defer a.State.RUnlock() - // Sync the service once - if err := a.state.syncService("redis"); err != nil { - t.Fatalf("err: %s", err) - } - }() + // // Sync the service once + // if err := a.State.syncService("redis"); err != nil { + // t.Fatalf("err: %s", err) + // } + // }() + // todo(fs): is this correct? + if err := a.State.SyncChanges(); err != nil { + t.Fatal("sync failed: ", err) + } // We should have 3 services (consul included) svcReq := structs.NodeSpecificRequest{ @@ -499,7 +476,7 @@ var testRegisterRules = ` func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { t.Parallel() - a := &TestAgent{Name: t.Name(), HCL: ` + a := &agent.TestAgent{Name: t.Name(), HCL: ` acl_datacenter = "dc1" acl_master_token = "root" acl_default_policy = "deny" @@ -533,7 +510,7 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv1, token) + a.State.AddService(srv1, token) // Create service (allowed) srv2 := &structs.NodeService{ @@ -542,7 +519,7 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { Tags: []string{"foo"}, Port: 5001, } - a.state.AddService(srv2, token) + a.State.AddService(srv2, token) // Trigger anti-entropy run and wait a.StartSync() @@ -584,28 +561,13 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { } } - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.services) != 2 { - t.Fatalf("bad: %v", a.state.services) - } - if len(a.state.serviceStatus) != 2 { - t.Fatalf("bad: %v", a.state.serviceStatus) - } - for name, status := range a.state.serviceStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := servicesInSync(a.State, 2); err != nil { + t.Fatal(err) + } } // Now remove the service and re-sync - a.state.RemoveService("api") + a.State.RemoveService("api") a.StartSync() time.Sleep(200 * time.Millisecond) @@ -643,35 +605,20 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { } } - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.services) != 1 { - t.Fatalf("bad: %v", a.state.services) - } - if len(a.state.serviceStatus) != 1 { - t.Fatalf("bad: %v", a.state.serviceStatus) - } - for name, status := range a.state.serviceStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := servicesInSync(a.State, 1); err != nil { + t.Fatal(err) + } } // Make sure the token got cleaned up. 
- if token := a.state.ServiceToken("api"); token != "" { + if token := a.State.ServiceToken("api"); token != "" { t.Fatalf("bad: %s", token) } } func TestAgentAntiEntropy_Checks(t *testing.T) { t.Parallel() - a := &TestAgent{Name: t.Name(), NoInitialSync: true} + a := &agent.TestAgent{Name: t.Name(), NoInitialSync: true} a.Start() defer a.Shutdown() @@ -690,7 +637,7 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { Name: "mysql", Status: api.HealthPassing, } - a.state.AddCheck(chk1, "") + a.State.AddCheck(chk1, "") args.Check = chk1 if err := a.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) @@ -703,7 +650,7 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { Name: "redis", Status: api.HealthPassing, } - a.state.AddCheck(chk2, "") + a.State.AddCheck(chk2, "") chk2_mod := new(structs.HealthCheck) *chk2_mod = *chk2 @@ -720,7 +667,7 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { Name: "web", Status: api.HealthPassing, } - a.state.AddCheck(chk3, "") + a.State.AddCheck(chk3, "") // Exists remote (delete) chk4 := &structs.HealthCheck{ @@ -741,12 +688,10 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { Name: "cache", Status: api.HealthPassing, } - a.state.AddCheck(chk5, "") - - // todo(fs): data race - a.state.Lock() - a.state.checkStatus["cache"] = syncStatus{inSync: true} - a.state.Unlock() + a.State.AddCheckState(&local.CheckState{ + Check: chk5, + InSync: true, + }) // Trigger anti-entropy run and wait a.StartSync() @@ -796,24 +741,9 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { } }) - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.checks) != 4 { - t.Fatalf("bad: %v", a.state.checks) - } - if len(a.state.checkStatus) != 4 { - t.Fatalf("bad: %v", a.state.checkStatus) - } - for name, status := range a.state.checkStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := checksInSync(a.State, 4); err != nil { + t.Fatal(err) + } // Make sure we sent along our node info addresses when we synced. 
{ @@ -836,7 +766,7 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { } // Remove one of the checks - a.state.RemoveCheck("redis") + a.State.RemoveCheck("redis") // Trigger anti-entropy run and wait a.StartSync() @@ -876,29 +806,14 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { } }) - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.checks) != 3 { - t.Fatalf("bad: %v", a.state.checks) - } - if len(a.state.checkStatus) != 3 { - t.Fatalf("bad: %v", a.state.checkStatus) - } - for name, status := range a.state.checkStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := checksInSync(a.State, 3); err != nil { + t.Fatal(err) + } } func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { t.Parallel() - a := &TestAgent{Name: t.Name(), HCL: ` + a := &agent.TestAgent{Name: t.Name(), HCL: ` acl_datacenter = "dc1" acl_master_token = "root" acl_default_policy = "deny" @@ -932,14 +847,14 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { Tags: []string{"master"}, Port: 5000, } - a.state.AddService(srv1, "root") + a.State.AddService(srv1, "root") srv2 := &structs.NodeService{ ID: "api", Service: "api", Tags: []string{"foo"}, Port: 5001, } - a.state.AddService(srv2, "root") + a.State.AddService(srv2, "root") // Trigger anti-entropy run and wait a.StartSync() @@ -983,24 +898,9 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } } - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state - if len(a.state.services) != 2 { - t.Fatalf("bad: %v", a.state.services) - } - if len(a.state.serviceStatus) != 2 { - t.Fatalf("bad: %v", a.state.serviceStatus) - } - for name, status := range a.state.serviceStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := servicesInSync(a.State, 2); err != nil { + t.Fatal(err) + } } // This check won't be allowed. @@ -1013,7 +913,7 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { Name: "mysql", Status: api.HealthPassing, } - a.state.AddCheck(chk1, token) + a.State.AddCheck(chk1, token) // This one will be allowed. chk2 := &structs.HealthCheck{ @@ -1025,7 +925,7 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { Name: "api", Status: api.HealthPassing, } - a.state.AddCheck(chk2, token) + a.State.AddCheck(chk2, token) // Trigger anti-entropy run and wait. a.StartSync() @@ -1068,27 +968,12 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } }) - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state. - if len(a.state.checks) != 2 { - t.Fatalf("bad: %v", a.state.checks) - } - if len(a.state.checkStatus) != 2 { - t.Fatalf("bad: %v", a.state.checkStatus) - } - for name, status := range a.state.checkStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := checksInSync(a.State, 2); err != nil { + t.Fatal(err) + } // Now delete the check and wait for sync. - a.state.RemoveCheck("api-check") + a.State.RemoveCheck("api-check") a.StartSync() time.Sleep(200 * time.Millisecond) // Verify that we are in sync @@ -1126,27 +1011,12 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } }) - // todo(fs): data race - func() { - a.state.RLock() - defer a.state.RUnlock() - - // Check the local state. 
- if len(a.state.checks) != 1 { - t.Fatalf("bad: %v", a.state.checks) - } - if len(a.state.checkStatus) != 1 { - t.Fatalf("bad: %v", a.state.checkStatus) - } - for name, status := range a.state.checkStatus { - if !status.inSync { - t.Fatalf("should be in sync: %v %v", name, status) - } - } - }() + if err := checksInSync(a.State, 1); err != nil { + t.Fatal(err) + } // Make sure the token got cleaned up. - if token := a.state.CheckToken("api-check"); token != "" { + if token := a.State.CheckToken("api-check"); token != "" { t.Fatalf("bad: %s", token) } } @@ -1203,7 +1073,7 @@ func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { t.Parallel() - a := &TestAgent{Name: t.Name(), HCL: ` + a := &agent.TestAgent{Name: t.Name(), HCL: ` check_update_interval = "500ms" `, NoInitialSync: true} a.Start() @@ -1217,7 +1087,7 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { Status: api.HealthPassing, Output: "", } - a.state.AddCheck(check, "") + a.State.AddCheck(check, "") // Trigger anti-entropy run and wait a.StartSync() @@ -1238,7 +1108,7 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { }) // Update the check output! Should be deferred - a.state.UpdateCheck("web", api.HealthPassing, "output") + a.State.UpdateCheck("web", api.HealthPassing, "output") // Should not update for 500 milliseconds time.Sleep(250 * time.Millisecond) @@ -1337,7 +1207,7 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { } // Now make an update that should be deferred. - a.state.UpdateCheck("web", api.HealthPassing, "deferred") + a.State.UpdateCheck("web", api.HealthPassing, "deferred") // Trigger anti-entropy run and wait. a.StartSync() @@ -1381,7 +1251,7 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { nodeMeta := map[string]string{ "somekey": "somevalue", } - a := &TestAgent{Name: t.Name(), HCL: ` + a := &agent.TestAgent{Name: t.Name(), HCL: ` node_id = "40e4a748-2192-161a-0510-9bf59fe950b5" node_meta { somekey = "somevalue" @@ -1453,40 +1323,15 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { }) } -func TestAgentAntiEntropy_deleteService_fails(t *testing.T) { - t.Parallel() - l := new(localState) - - // todo(fs): data race - l.Lock() - defer l.Unlock() - if err := l.deleteService(""); err == nil { - t.Fatalf("should have failed") - } -} - -func TestAgentAntiEntropy_deleteCheck_fails(t *testing.T) { - t.Parallel() - l := new(localState) - - // todo(fs): data race - l.Lock() - defer l.Unlock() - if err := l.deleteCheck(""); err == nil { - t.Fatalf("should have errored") - } -} - func TestAgent_serviceTokens(t *testing.T) { t.Parallel() + cfg := agent.TestConfig() tokens := new(token.Store) tokens.UpdateUserToken("default") - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) + l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) - l.AddService(&structs.NodeService{ - ID: "redis", - }, "") + l.AddService(&structs.NodeService{ID: "redis"}, "") // Returns default when no token is set if token := l.ServiceToken("redis"); token != "default" { @@ -1494,7 +1339,7 @@ func TestAgent_serviceTokens(t *testing.T) { } // Returns configured token - l.serviceTokens["redis"] = "abc123" + l.AddService(&structs.NodeService{ID: "redis"}, "abc123") if token := l.ServiceToken("redis"); token != "abc123" { t.Fatalf("bad: %s", token) } @@ -1509,17 +1354,19 @@ func TestAgent_serviceTokens(t 
*testing.T) { func TestAgent_checkTokens(t *testing.T) { t.Parallel() + cfg := agent.TestConfig() tokens := new(token.Store) tokens.UpdateUserToken("default") - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) + l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) // Returns default when no token is set + l.AddCheck(&structs.HealthCheck{CheckID: types.CheckID("mem")}, "") if token := l.CheckToken("mem"); token != "default" { t.Fatalf("bad: %s", token) } // Returns configured token - l.checkTokens["mem"] = "abc123" + l.AddCheck(&structs.HealthCheck{CheckID: types.CheckID("mem")}, "abc123") if token := l.CheckToken("mem"); token != "abc123" { t.Fatalf("bad: %s", token) } @@ -1533,7 +1380,7 @@ func TestAgent_checkTokens(t *testing.T) { func TestAgent_checkCriticalTime(t *testing.T) { t.Parallel() - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) + l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000} l.AddService(svc, "") @@ -1548,54 +1395,54 @@ func TestAgent_checkCriticalTime(t *testing.T) { Status: api.HealthPassing, } l.AddCheck(chk, "") - if checks := l.CriticalChecks(); len(checks) > 0 { + if checks := l.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have any critical checks") } // Set it to warning and make sure that doesn't show up as critical. l.UpdateCheck(checkID, api.HealthWarning, "") - if checks := l.CriticalChecks(); len(checks) > 0 { + if checks := l.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have any critical checks") } // Fail the check and make sure the time looks reasonable. l.UpdateCheck(checkID, api.HealthCritical, "") - if crit, ok := l.CriticalChecks()[checkID]; !ok { + if c, ok := l.CriticalCheckStates()[checkID]; !ok { t.Fatalf("should have a critical check") - } else if crit.CriticalFor > time.Millisecond { - t.Fatalf("bad: %#v", crit) + } else if c.CriticalFor() > time.Millisecond { + t.Fatalf("bad: %#v", c) } // Wait a while, then fail it again and make sure the time keeps track // of the initial failure, and doesn't reset here. time.Sleep(50 * time.Millisecond) l.UpdateCheck(chk.CheckID, api.HealthCritical, "") - if crit, ok := l.CriticalChecks()[checkID]; !ok { + if c, ok := l.CriticalCheckStates()[checkID]; !ok { t.Fatalf("should have a critical check") - } else if crit.CriticalFor < 25*time.Millisecond || - crit.CriticalFor > 75*time.Millisecond { - t.Fatalf("bad: %#v", crit) + } else if c.CriticalFor() < 25*time.Millisecond || + c.CriticalFor() > 75*time.Millisecond { + t.Fatalf("bad: %#v", c) } // Set it passing again. l.UpdateCheck(checkID, api.HealthPassing, "") - if checks := l.CriticalChecks(); len(checks) > 0 { + if checks := l.CriticalCheckStates(); len(checks) > 0 { t.Fatalf("should not have any critical checks") } // Fail the check and make sure the time looks like it started again // from the latest failure, not the original one. 
l.UpdateCheck(checkID, api.HealthCritical, "") - if crit, ok := l.CriticalChecks()[checkID]; !ok { + if c, ok := l.CriticalCheckStates()[checkID]; !ok { t.Fatalf("should have a critical check") - } else if crit.CriticalFor > time.Millisecond { - t.Fatalf("bad: %#v", crit) + } else if c.CriticalFor() > time.Millisecond { + t.Fatalf("bad: %#v", c) } } func TestAgent_AddCheckFailure(t *testing.T) { t.Parallel() - l := NewLocalState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) + l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) // Add a check for a service that does not exist and verify that it fails checkID := types.CheckID("redis:1") @@ -1615,7 +1462,7 @@ func TestAgent_AddCheckFailure(t *testing.T) { func TestAgent_sendCoordinate(t *testing.T) { t.Parallel() - a := NewTestAgent(t.Name(), ` + a := agent.NewTestAgent(t.Name(), ` sync_coordinate_interval_min = "1ms" sync_coordinate_rate_target = 10.0 consul = { @@ -1649,3 +1496,29 @@ func TestAgent_sendCoordinate(t *testing.T) { } }) } + +func servicesInSync(state *local.State, wantServices int) error { + services := state.ServiceStates() + if got, want := len(services), wantServices; got != want { + return fmt.Errorf("got %d services want %d", got, want) + } + for id, s := range services { + if !s.InSync { + return fmt.Errorf("service %q should be in sync", id) + } + } + return nil +} + +func checksInSync(state *local.State, wantChecks int) error { + checks := state.CheckStates() + if got, want := len(checks), wantChecks; got != want { + return fmt.Errorf("got %d checks want %d", got, want) + } + for id, c := range checks { + if !c.InSync { + return fmt.Errorf("check %q should be in sync", id) + } + } + return nil +} diff --git a/agent/user_event.go b/agent/user_event.go index a83cb7fdce..24d4bb03d5 100644 --- a/agent/user_event.go +++ b/agent/user_event.go @@ -173,7 +173,7 @@ func (a *Agent) shouldProcessUserEvent(msg *UserEvent) bool { } // Scan for a match - services := a.state.Services() + services := a.State.Services() found := false OUTER: for name, info := range services { diff --git a/agent/user_event_test.go b/agent/user_event_test.go index 0f1e59d00b..1f79145798 100644 --- a/agent/user_event_test.go +++ b/agent/user_event_test.go @@ -57,7 +57,7 @@ func TestShouldProcessUserEvent(t *testing.T) { Tags: []string{"test", "foo", "bar", "master"}, Port: 5000, } - a.state.AddService(srv1, "") + a.State.AddService(srv1, "") p := &UserEvent{} if !a.shouldProcessUserEvent(p) { @@ -157,7 +157,7 @@ func TestFireReceiveEvent(t *testing.T) { Tags: []string{"test", "foo", "bar", "master"}, Port: 5000, } - a.state.AddService(srv1, "") + a.State.AddService(srv1, "") p1 := &UserEvent{Name: "deploy", ServiceFilter: "web"} err := a.UserEvent("dc1", "root", p1) From d2cc0e6e8acb0873ed27ac31a6b7570cfcd6a9d2 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:14 +0200 Subject: [PATCH 08/36] local state: fix failing tests --- agent/agent_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index ca6665a8bf..a883caf703 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -911,7 +911,7 @@ func TestAgent_PersistService(t *testing.T) { if got, want := restored.Token, "mytoken"; got != want { t.Fatalf("got token %q want %q", got, want) } - if got, want := restored.Service.Port, 8081; got != want { + 
if got, want := restored.Service.Port, 8001; got != want { t.Fatalf("got port %d want %d", got, want) } } From 37b95ef98e8e8cbb377c0de0f8033977b08669f1 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:15 +0200 Subject: [PATCH 09/36] local state: fix test with updated error message --- agent/local/state.go | 2 +- agent/local/state_test.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/agent/local/state.go b/agent/local/state.go index 409e3fc175..5058eed0bf 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -348,7 +348,7 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { // if there is a serviceID associated with the check, make sure it exists before adding it // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor if check.ServiceID != "" && l.services[check.ServiceID] == nil { - return fmt.Errorf("Check %q refers to non-existent service %q does not exist", check.CheckID, check.ServiceID) + return fmt.Errorf("Check %q refers to non-existent service %q", check.CheckID, check.ServiceID) } // hard-set the node name diff --git a/agent/local/state_test.go b/agent/local/state_test.go index 844006fde3..89136e88fe 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1,6 +1,7 @@ package local_test import ( + "errors" "fmt" "reflect" "testing" @@ -1453,11 +1454,10 @@ func TestAgent_AddCheckFailure(t *testing.T) { ServiceID: "redis", Status: api.HealthPassing, } - expectedErr := "ServiceID \"redis\" does not exist" - if err := l.AddCheck(chk, ""); err == nil || expectedErr != err.Error() { - t.Fatalf("Expected error when adding a check for a non-existent service but got %v", err) + wantErr := errors.New(`Check "redis:1" refers to non-existent service "redis"`) + if got, want := l.AddCheck(chk, ""), wantErr; !reflect.DeepEqual(got, want) { + t.Fatalf("got error %q want %q", got, want) } - } func TestAgent_sendCoordinate(t *testing.T) { From 71c74e62c77b52d66a91028f85d739205b4582a9 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:16 +0200 Subject: [PATCH 10/36] local state: fix anti-entropy state tests The anti-entropy tests relied on the side-effect of the StartSync() method to perform a full sync instead of a partial sync. This lead to multiple anti-entropy go routines being started unnecessary retry loops. This change changes the behavior to perform synchronous full syncs when necessary removing the need for all of the time.Sleep and most of the retry loops. --- agent/ae/ae.go | 16 +++--- agent/local/state.go | 22 ++++++-- agent/local/state_test.go | 116 +++++++++++++++++--------------------- agent/testagent.go | 12 ++-- 4 files changed, 82 insertions(+), 84 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index f3b6745036..b055f7aa17 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -51,8 +51,8 @@ type StateSyncer struct { // State contains the data that needs to be synchronized. State interface { - UpdateSyncState() error SyncChanges() error + SyncFull() error } // Interval is the time between two regular sync runs. 
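// Illustrative sketch (hypothetical, not from this patch): after the interface
// change above, the syncer only depends on a value that provides SyncFull and
// SyncChanges, which is what lets the tests below drive full syncs synchronously.
// The fakeState type and its counters are made-up names used only for this example.
type fakeState struct {
	full, partial int
}

func (f *fakeState) SyncFull() error    { f.full++; return nil }
func (f *fakeState) SyncChanges() error { f.partial++; return nil }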
@@ -91,15 +91,15 @@ func (s *StateSyncer) Run() { return lib.RandomStagger(time.Duration(f) * d) } -Sync: +FullSync: for { - switch err := s.State.UpdateSyncState(); { + switch err := s.State.SyncFull(); { - // update sync status failed + // full sync failed case err != nil: s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) - // retry updating sync status after some time or when a consul + // retry full sync after some time or when a consul // server was added. select { @@ -121,10 +121,8 @@ Sync: return } - // update sync status OK + // full sync OK default: - // force-trigger sync to pickup any changes - s.triggerSync() // do partial syncs until it is time for a full sync again for { @@ -140,7 +138,7 @@ Sync: // } case <-time.After(s.Interval + stagger(s.Interval)): - continue Sync + continue FullSync case <-s.TriggerCh: if s.Paused() { diff --git a/agent/local/state.go b/agent/local/state.go index 5058eed0bf..b980c569f5 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -538,9 +538,9 @@ func (l *State) Metadata() map[string]string { return m } -// UpdateSyncState does a read of the server state, and updates +// updateSyncState does a read of the server state, and updates // the local sync status as appropriate -func (l *State) UpdateSyncState() error { +func (l *State) updateSyncState() error { // 1. get all checks and services from the master req := structs.NodeSpecificRequest{ Datacenter: l.config.Datacenter, @@ -631,7 +631,6 @@ func (l *State) UpdateSyncState() error { } for id, rc := range remoteChecks { - lc := l.checks[id] // If we don't have the check locally, deregister it @@ -639,7 +638,7 @@ func (l *State) UpdateSyncState() error { // The Serf check is created automatically and does not // need to be deregistered. if id == structs.SerfCheckID { - l.logger.Printf("Skipping remote check %q since it is managed automatically", id) + l.logger.Printf("[DEBUG] Skipping remote check %q since it is managed automatically", id) continue } @@ -683,6 +682,21 @@ func (l *State) UpdateSyncState() error { return nil } +// SyncFull determines the delta between the local and remote state +// and synchronizes the changes. +func (l *State) SyncFull() error { + // note that we do not acquire the lock here since the methods + // we are calling will do that themself. + + // todo(fs): is it an issue that we do not hold the lock for the entire time? + // todo(fs): IMO, this doesn't matter since SyncChanges will sync whatever + // todo(fs): was determined in the update step. 
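+	// In short, a full sync runs in two phases: updateSyncState reads the
+	// remote catalog and marks which local services and checks are out of
+	// sync, and SyncChanges then pushes only those differences to the servers.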
+ if err := l.updateSyncState(); err != nil { + return err + } + return l.SyncChanges() +} + // SyncChanges is used to scan the status our local services and checks // and update any that are out of sync with the server func (l *State) SyncChanges() error { diff --git a/agent/local/state_test.go b/agent/local/state_test.go index 89136e88fe..5646462d66 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent" + "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -20,7 +20,7 @@ import ( func TestAgentAntiEntropy_Services(t *testing.T) { t.Parallel() - a := &agent.TestAgent{Name: t.Name(), NoInitialSync: true} + a := &agent.TestAgent{Name: t.Name()} a.Start() defer a.Shutdown() @@ -113,8 +113,9 @@ func TestAgentAntiEntropy_Services(t *testing.T) { InSync: true, }) - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } var services structs.IndexedNodeServices req := structs.NodeSpecificRequest{ @@ -180,8 +181,9 @@ func TestAgentAntiEntropy_Services(t *testing.T) { // Remove one of the services a.State.RemoveService("api") - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } retry.Run(t, func(r *retry.R) { if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { @@ -228,7 +230,7 @@ func TestAgentAntiEntropy_Services(t *testing.T) { func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { t.Parallel() - a := &agent.TestAgent{Name: t.Name(), NoInitialSync: true} + a := &agent.TestAgent{Name: t.Name()} a.Start() defer a.Shutdown() @@ -275,8 +277,9 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { t.Fatalf("err: %v", err) } - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } req := structs.NodeSpecificRequest{ Datacenter: "dc1", @@ -348,18 +351,7 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { } a.State.AddCheck(chk, "") - // todo(fs): data race - // func() { - // a.State.RLock() - // defer a.State.RUnlock() - - // // Sync the service once - // if err := a.State.syncService("mysql"); err != nil { - // t.Fatalf("err: %s", err) - // } - // }() - // todo(fs): is this correct? - if err := a.State.SyncChanges(); err != nil { + if err := a.State.SyncFull(); err != nil { t.Fatal("sync failed: ", err) } @@ -418,18 +410,7 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { } a.State.AddCheck(chk2, "") - // todo(fs): data race - // func() { - // a.State.RLock() - // defer a.State.RUnlock() - - // // Sync the service once - // if err := a.State.syncService("redis"); err != nil { - // t.Fatalf("err: %s", err) - // } - // }() - // todo(fs): is this correct? 
- if err := a.State.SyncChanges(); err != nil { + if err := a.State.SyncFull(); err != nil { t.Fatal("sync failed: ", err) } @@ -522,9 +503,9 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { } a.State.AddService(srv2, token) - // Trigger anti-entropy run and wait - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that we are in sync { @@ -569,8 +550,9 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { // Now remove the service and re-sync a.State.RemoveService("api") - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that we are in sync { @@ -619,7 +601,7 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { func TestAgentAntiEntropy_Checks(t *testing.T) { t.Parallel() - a := &agent.TestAgent{Name: t.Name(), NoInitialSync: true} + a := &agent.TestAgent{Name: t.Name()} a.Start() defer a.Shutdown() @@ -694,8 +676,9 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { InSync: true, }) - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } req := structs.NodeSpecificRequest{ Datacenter: "dc1", @@ -769,8 +752,9 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { // Remove one of the checks a.State.RemoveCheck("redis") - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that we are in sync retry.Run(t, func(r *retry.R) { @@ -857,9 +841,9 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } a.State.AddService(srv2, "root") - // Trigger anti-entropy run and wait - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that we are in sync { @@ -928,9 +912,9 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } a.State.AddCheck(chk2, token) - // Trigger anti-entropy run and wait. - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that we are in sync retry.Run(t, func(r *retry.R) { @@ -975,8 +959,10 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { // Now delete the check and wait for sync. a.State.RemoveCheck("api-check") - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } + // Verify that we are in sync retry.Run(t, func(r *retry.R) { req := structs.NodeSpecificRequest{ @@ -1090,8 +1076,9 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { } a.State.AddCheck(check, "") - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that we are in sync req := structs.NodeSpecificRequest{ @@ -1172,9 +1159,9 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { } } - // Trigger anti-entropy run and wait. - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that the output was synced back to the agent's value. if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { @@ -1210,9 +1197,9 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { // Now make an update that should be deferred. 
a.State.UpdateCheck("web", api.HealthPassing, "deferred") - // Trigger anti-entropy run and wait. - a.StartSync() - time.Sleep(200 * time.Millisecond) + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } // Verify that the output is still out of sync since there's a deferred // update pending. @@ -1272,8 +1259,9 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { t.Fatalf("err: %v", err) } - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } req := structs.NodeSpecificRequest{ Datacenter: "dc1", @@ -1304,8 +1292,10 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { t.Fatalf("err: %v", err) } - // Trigger anti-entropy run and wait - a.StartSync() + if err := a.State.SyncFull(); err != nil { + t.Fatalf("err: %v", err) + } + // Wait for the sync - this should have been a sync of just the node info retry.Run(t, func(r *retry.R) { if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { diff --git a/agent/testagent.go b/agent/testagent.go index 9b99cd6471..1c4806d36c 100644 --- a/agent/testagent.go +++ b/agent/testagent.go @@ -65,10 +65,6 @@ type TestAgent struct { // Key is the optional encryption key for the LAN and WAN keyring. Key string - // NoInitialSync determines whether an anti-entropy run - // will be scheduled after the agent started. - NoInitialSync bool - // dns is a reference to the first started DNS endpoint. // It is valid after Start(). dns *DNSServer @@ -175,9 +171,9 @@ func (a *TestAgent) Start() *TestAgent { } } } - if !a.NoInitialSync { - a.Agent.StartSync() - } + + // Start the anti-entropy syncer + a.Agent.StartSync() var out structs.IndexedNodes retry.Run(&panicFailer{}, func(r *retry.R) { @@ -200,7 +196,7 @@ func (a *TestAgent) Start() *TestAgent { r.Fatal(a.Name, "No leader") } if out.Index == 0 { - r.Fatal(a.Name, "Consul index is 0") + r.Fatal(a.Name, ": Consul index is 0") } } else { req, _ := http.NewRequest("GET", "/v1/agent/self", nil) From 7e3adc4549347ed1e230139a8e2d5f05e0d40b22 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:16 +0200 Subject: [PATCH 11/36] agent: skip non-sensical TestCatalogRegister It is not clear what this test is supposed to verify. 
--- agent/catalog_endpoint_test.go | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/agent/catalog_endpoint_test.go b/agent/catalog_endpoint_test.go index 566ba62013..fa73dbeee8 100644 --- a/agent/catalog_endpoint_test.go +++ b/agent/catalog_endpoint_test.go @@ -13,6 +13,8 @@ import ( ) func TestCatalogRegister(t *testing.T) { + t.Skip("skipping since it is not clear what this test is supposed to verify") + t.Parallel() a := NewTestAgent(t.Name(), "") defer a.Shutdown() @@ -27,28 +29,11 @@ func TestCatalogRegister(t *testing.T) { if err != nil { t.Fatalf("err: %v", err) } - res := obj.(bool) if res != true { t.Fatalf("bad: %v", res) } - // todo(fs): data race - // func() { - // a.State.Lock() - // defer a.State.Unlock() - - // // Service should be in sync - // if err := a.State.syncService("foo"); err != nil { - // t.Fatalf("err: %s", err) - // } - // if _, ok := a.State.serviceStatus["foo"]; !ok { - // t.Fatalf("bad: %#v", a.State.serviceStatus) - // } - // if !a.State.serviceStatus["foo"].inSync { - // t.Fatalf("should be in sync") - // } - // }() if err := a.State.SyncChanges(); err != nil { t.Fatal("sync failed: ", err) } From 1602ac56b5b58b986c93035fb0ebdfaffb33df97 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:17 +0200 Subject: [PATCH 12/36] local state: drop retry loops from tests Since the tests are now using synchronous calls for state syncing we no longer need to use retry loops to wait for the changes to propagate. --- agent/local/state_test.go | 440 ++++++++++++++++++-------------------- 1 file changed, 214 insertions(+), 226 deletions(-) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index 5646462d66..49a70669a7 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -123,60 +123,58 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Node: a.Config.NodeName, } - retry.Run(t, func(r *retry.R) { - if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { + t.Fatalf("err: %v", err) + } - // Make sure we sent along our node info when we synced. - id := services.NodeServices.Node.ID - addrs := services.NodeServices.Node.TaggedAddresses - meta := services.NodeServices.Node.Meta - delete(meta, structs.MetaSegmentKey) // Added later, not in config. - verify.Values(r, "node id", id, a.config.NodeID) - verify.Values(r, "tagged addrs", addrs, a.config.TaggedAddresses) - verify.Values(r, "node meta", meta, a.config.NodeMeta) + // Make sure we sent along our node info when we synced. + id := services.NodeServices.Node.ID + addrs := services.NodeServices.Node.TaggedAddresses + meta := services.NodeServices.Node.Meta + delete(meta, structs.MetaSegmentKey) // Added later, not in config. 
+ verify.Values(t, "node id", id, a.Config.NodeID) + verify.Values(t, "tagged addrs", addrs, a.Config.TaggedAddresses) + verify.Values(t, "node meta", meta, a.Config.NodeMeta) - // We should have 6 services (consul included) - if len(services.NodeServices.Services) != 6 { - r.Fatalf("bad: %v", services.NodeServices.Services) - } + // We should have 6 services (consul included) + if len(services.NodeServices.Services) != 6 { + t.Fatalf("bad: %v", services.NodeServices.Services) + } - // All the services should match - for id, serv := range services.NodeServices.Services { - serv.CreateIndex, serv.ModifyIndex = 0, 0 - switch id { - case "mysql": - if !reflect.DeepEqual(serv, srv1) { - r.Fatalf("bad: %v %v", serv, srv1) - } - case "redis": - if !reflect.DeepEqual(serv, srv2) { - r.Fatalf("bad: %#v %#v", serv, srv2) - } - case "web": - if !reflect.DeepEqual(serv, srv3) { - r.Fatalf("bad: %v %v", serv, srv3) - } - case "api": - if !reflect.DeepEqual(serv, srv5) { - r.Fatalf("bad: %v %v", serv, srv5) - } - case "cache": - if !reflect.DeepEqual(serv, srv6) { - r.Fatalf("bad: %v %v", serv, srv6) - } - case structs.ConsulServiceID: - // ignore - default: - r.Fatalf("unexpected service: %v", id) + // All the services should match + for id, serv := range services.NodeServices.Services { + serv.CreateIndex, serv.ModifyIndex = 0, 0 + switch id { + case "mysql": + if !reflect.DeepEqual(serv, srv1) { + t.Fatalf("bad: %v %v", serv, srv1) } + case "redis": + if !reflect.DeepEqual(serv, srv2) { + t.Fatalf("bad: %#v %#v", serv, srv2) + } + case "web": + if !reflect.DeepEqual(serv, srv3) { + t.Fatalf("bad: %v %v", serv, srv3) + } + case "api": + if !reflect.DeepEqual(serv, srv5) { + t.Fatalf("bad: %v %v", serv, srv5) + } + case "cache": + if !reflect.DeepEqual(serv, srv6) { + t.Fatalf("bad: %v %v", serv, srv6) + } + case structs.ConsulServiceID: + // ignore + default: + t.Fatalf("unexpected service: %v", id) } + } - if err := servicesInSync(a.State, 5); err != nil { - r.Fatal(err) - } - }) + if err := servicesInSync(a.State, 5); err != nil { + t.Fatal(err) + } // Remove one of the services a.State.RemoveService("api") @@ -185,47 +183,45 @@ func TestAgentAntiEntropy_Services(t *testing.T) { t.Fatalf("err: %v", err) } - retry.Run(t, func(r *retry.R) { - if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { + t.Fatalf("err: %v", err) + } - // We should have 5 services (consul included) - if len(services.NodeServices.Services) != 5 { - r.Fatalf("bad: %v", services.NodeServices.Services) - } + // We should have 5 services (consul included) + if len(services.NodeServices.Services) != 5 { + t.Fatalf("bad: %v", services.NodeServices.Services) + } - // All the services should match - for id, serv := range services.NodeServices.Services { - serv.CreateIndex, serv.ModifyIndex = 0, 0 - switch id { - case "mysql": - if !reflect.DeepEqual(serv, srv1) { - r.Fatalf("bad: %v %v", serv, srv1) - } - case "redis": - if !reflect.DeepEqual(serv, srv2) { - r.Fatalf("bad: %#v %#v", serv, srv2) - } - case "web": - if !reflect.DeepEqual(serv, srv3) { - r.Fatalf("bad: %v %v", serv, srv3) - } - case "cache": - if !reflect.DeepEqual(serv, srv6) { - r.Fatalf("bad: %v %v", serv, srv6) - } - case structs.ConsulServiceID: - // ignore - default: - r.Fatalf("unexpected service: %v", id) + // All the services should match + for id, serv := range services.NodeServices.Services { + serv.CreateIndex, serv.ModifyIndex = 0, 0 + 
switch id { + case "mysql": + if !reflect.DeepEqual(serv, srv1) { + t.Fatalf("bad: %v %v", serv, srv1) } + case "redis": + if !reflect.DeepEqual(serv, srv2) { + t.Fatalf("bad: %#v %#v", serv, srv2) + } + case "web": + if !reflect.DeepEqual(serv, srv3) { + t.Fatalf("bad: %v %v", serv, srv3) + } + case "cache": + if !reflect.DeepEqual(serv, srv6) { + t.Fatalf("bad: %v %v", serv, srv6) + } + case structs.ConsulServiceID: + // ignore + default: + t.Fatalf("unexpected service: %v", id) } + } - if err := servicesInSync(a.State, 4); err != nil { - r.Fatal(err) - } - }) + if err := servicesInSync(a.State, 4); err != nil { + t.Fatal(err) + } } func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { @@ -462,8 +458,7 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) { acl_datacenter = "dc1" acl_master_token = "root" acl_default_policy = "deny" - acl_enforce_version_8 = true - `, NoInitialSync: true} + acl_enforce_version_8 = true`} a.Start() defer a.Shutdown() @@ -687,43 +682,41 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { var checks structs.IndexedHealthChecks // Verify that we are in sync - retry.Run(t, func(r *retry.R) { - if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { + t.Fatalf("err: %v", err) + } - // We should have 5 checks (serf included) - if len(checks.HealthChecks) != 5 { - r.Fatalf("bad: %v", checks) - } + // We should have 5 checks (serf included) + if len(checks.HealthChecks) != 5 { + t.Fatalf("bad: %v", checks) + } - // All the checks should match - for _, chk := range checks.HealthChecks { - chk.CreateIndex, chk.ModifyIndex = 0, 0 - switch chk.CheckID { - case "mysql": - if !reflect.DeepEqual(chk, chk1) { - r.Fatalf("bad: %v %v", chk, chk1) - } - case "redis": - if !reflect.DeepEqual(chk, chk2) { - r.Fatalf("bad: %v %v", chk, chk2) - } - case "web": - if !reflect.DeepEqual(chk, chk3) { - r.Fatalf("bad: %v %v", chk, chk3) - } - case "cache": - if !reflect.DeepEqual(chk, chk5) { - r.Fatalf("bad: %v %v", chk, chk5) - } - case "serfHealth": - // ignore - default: - r.Fatalf("unexpected check: %v", chk) + // All the checks should match + for _, chk := range checks.HealthChecks { + chk.CreateIndex, chk.ModifyIndex = 0, 0 + switch chk.CheckID { + case "mysql": + if !reflect.DeepEqual(chk, chk1) { + t.Fatalf("bad: %v %v", chk, chk1) } + case "redis": + if !reflect.DeepEqual(chk, chk2) { + t.Fatalf("bad: %v %v", chk, chk2) + } + case "web": + if !reflect.DeepEqual(chk, chk3) { + t.Fatalf("bad: %v %v", chk, chk3) + } + case "cache": + if !reflect.DeepEqual(chk, chk5) { + t.Fatalf("bad: %v %v", chk, chk5) + } + case "serfHealth": + // ignore + default: + t.Fatalf("unexpected check: %v", chk) } - }) + } if err := checksInSync(a.State, 4); err != nil { t.Fatal(err) @@ -744,9 +737,9 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { addrs := services.NodeServices.Node.TaggedAddresses meta := services.NodeServices.Node.Meta delete(meta, structs.MetaSegmentKey) // Added later, not in config. 
- verify.Values(t, "node id", id, a.config.NodeID) - verify.Values(t, "tagged addrs", addrs, a.config.TaggedAddresses) - verify.Values(t, "node meta", meta, a.config.NodeMeta) + verify.Values(t, "node id", id, a.Config.NodeID) + verify.Values(t, "tagged addrs", addrs, a.Config.TaggedAddresses) + verify.Values(t, "node meta", meta, a.Config.NodeMeta) } // Remove one of the checks @@ -757,39 +750,37 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { } // Verify that we are in sync - retry.Run(t, func(r *retry.R) { - if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { + t.Fatalf("err: %v", err) + } - // We should have 5 checks (serf included) - if len(checks.HealthChecks) != 4 { - r.Fatalf("bad: %v", checks) - } + // We should have 5 checks (serf included) + if len(checks.HealthChecks) != 4 { + t.Fatalf("bad: %v", checks) + } - // All the checks should match - for _, chk := range checks.HealthChecks { - chk.CreateIndex, chk.ModifyIndex = 0, 0 - switch chk.CheckID { - case "mysql": - if !reflect.DeepEqual(chk, chk1) { - r.Fatalf("bad: %v %v", chk, chk1) - } - case "web": - if !reflect.DeepEqual(chk, chk3) { - r.Fatalf("bad: %v %v", chk, chk3) - } - case "cache": - if !reflect.DeepEqual(chk, chk5) { - r.Fatalf("bad: %v %v", chk, chk5) - } - case "serfHealth": - // ignore - default: - r.Fatalf("unexpected check: %v", chk) + // All the checks should match + for _, chk := range checks.HealthChecks { + chk.CreateIndex, chk.ModifyIndex = 0, 0 + switch chk.CheckID { + case "mysql": + if !reflect.DeepEqual(chk, chk1) { + t.Fatalf("bad: %v %v", chk, chk1) } + case "web": + if !reflect.DeepEqual(chk, chk3) { + t.Fatalf("bad: %v %v", chk, chk3) + } + case "cache": + if !reflect.DeepEqual(chk, chk5) { + t.Fatalf("bad: %v %v", chk, chk5) + } + case "serfHealth": + // ignore + default: + t.Fatalf("unexpected check: %v", chk) } - }) + } if err := checksInSync(a.State, 3); err != nil { t.Fatal(err) @@ -802,8 +793,7 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { acl_datacenter = "dc1" acl_master_token = "root" acl_default_policy = "deny" - acl_enforce_version_8 = true - `, NoInitialSync: true} + acl_enforce_version_8 = true`} a.Start() defer a.Shutdown() @@ -917,41 +907,39 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } // Verify that we are in sync - retry.Run(t, func(r *retry.R) { - req := structs.NodeSpecificRequest{ - Datacenter: "dc1", - Node: a.Config.NodeName, - QueryOptions: structs.QueryOptions{ - Token: "root", - }, - } - var checks structs.IndexedHealthChecks - if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { - r.Fatalf("err: %v", err) - } + req := structs.NodeSpecificRequest{ + Datacenter: "dc1", + Node: a.Config.NodeName, + QueryOptions: structs.QueryOptions{ + Token: "root", + }, + } + var checks structs.IndexedHealthChecks + if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { + t.Fatalf("err: %v", err) + } - // We should have 2 checks (serf included) - if len(checks.HealthChecks) != 2 { - r.Fatalf("bad: %v", checks) - } + // We should have 2 checks (serf included) + if len(checks.HealthChecks) != 2 { + t.Fatalf("bad: %v", checks) + } - // All the checks should match - for _, chk := range checks.HealthChecks { - chk.CreateIndex, chk.ModifyIndex = 0, 0 - switch chk.CheckID { - case "mysql-check": - t.Fatalf("should not be permitted") - case "api-check": - if !reflect.DeepEqual(chk, chk2) { - r.Fatalf("bad: %v %v", chk, chk2) - } 
- case "serfHealth": - // ignore - default: - r.Fatalf("unexpected check: %v", chk) + // All the checks should match + for _, chk := range checks.HealthChecks { + chk.CreateIndex, chk.ModifyIndex = 0, 0 + switch chk.CheckID { + case "mysql-check": + t.Fatalf("should not be permitted") + case "api-check": + if !reflect.DeepEqual(chk, chk2) { + t.Fatalf("bad: %v %v", chk, chk2) } + case "serfHealth": + // ignore + default: + t.Fatalf("unexpected check: %v", chk) } - }) + } if err := checksInSync(a.State, 2); err != nil { t.Fatal(err) @@ -964,7 +952,7 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } // Verify that we are in sync - retry.Run(t, func(r *retry.R) { + { req := structs.NodeSpecificRequest{ Datacenter: "dc1", Node: a.Config.NodeName, @@ -974,12 +962,12 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } var checks structs.IndexedHealthChecks if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { - r.Fatalf("err: %v", err) + t.Fatalf("err: %v", err) } // We should have 1 check (just serf) if len(checks.HealthChecks) != 1 { - r.Fatalf("bad: %v", checks) + t.Fatalf("bad: %v", checks) } // All the checks should match @@ -987,16 +975,16 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { chk.CreateIndex, chk.ModifyIndex = 0, 0 switch chk.CheckID { case "mysql-check": - r.Fatalf("should not be permitted") + t.Fatalf("should not be permitted") case "api-check": - r.Fatalf("should be deleted") + t.Fatalf("should be deleted") case "serfHealth": // ignore default: - r.Fatalf("unexpected check: %v", chk) + t.Fatalf("unexpected check: %v", chk) } } - }) + } if err := checksInSync(a.State, 1); err != nil { t.Fatal(err) @@ -1062,7 +1050,7 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { t.Parallel() a := &agent.TestAgent{Name: t.Name(), HCL: ` check_update_interval = "500ms" - `, NoInitialSync: true} + `} a.Start() defer a.Shutdown() @@ -1243,8 +1231,7 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { node_id = "40e4a748-2192-161a-0510-9bf59fe950b5" node_meta { somekey = "somevalue" - } - `, NoInitialSync: true} + }`} a.Start() defer a.Shutdown() @@ -1268,24 +1255,21 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { Node: a.Config.NodeName, } var services structs.IndexedNodeServices - // Wait for the sync - retry.Run(t, func(r *retry.R) { - if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { + t.Fatalf("err: %v", err) + } - // Make sure we synced our node info - this should have ridden on the - // "consul" service sync - id := services.NodeServices.Node.ID - addrs := services.NodeServices.Node.TaggedAddresses - meta := services.NodeServices.Node.Meta - delete(meta, structs.MetaSegmentKey) // Added later, not in config. - if id != nodeID || - !reflect.DeepEqual(addrs, a.config.TaggedAddresses) || - !reflect.DeepEqual(meta, nodeMeta) { - r.Fatalf("bad: %v", services.NodeServices.Node) - } - }) + // Make sure we synced our node info - this should have ridden on the + // "consul" service sync + id := services.NodeServices.Node.ID + addrs := services.NodeServices.Node.TaggedAddresses + meta := services.NodeServices.Node.Meta + delete(meta, structs.MetaSegmentKey) // Added later, not in config. 
+ if id != a.Config.NodeID || + !reflect.DeepEqual(addrs, a.Config.TaggedAddresses) || + !reflect.DeepEqual(meta, a.Config.NodeMeta) { + t.Fatalf("bad: %v", services.NodeServices.Node) + } // Blow away the catalog version of the node info if err := a.RPC("Catalog.Register", args, &out); err != nil { @@ -1297,30 +1281,30 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { } // Wait for the sync - this should have been a sync of just the node info - retry.Run(t, func(r *retry.R) { - if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { + t.Fatalf("err: %v", err) + } + { id := services.NodeServices.Node.ID addrs := services.NodeServices.Node.TaggedAddresses meta := services.NodeServices.Node.Meta delete(meta, structs.MetaSegmentKey) // Added later, not in config. if id != nodeID || - !reflect.DeepEqual(addrs, a.config.TaggedAddresses) || + !reflect.DeepEqual(addrs, a.Config.TaggedAddresses) || !reflect.DeepEqual(meta, nodeMeta) { - r.Fatalf("bad: %v", services.NodeServices.Node) + t.Fatalf("bad: %v", services.NodeServices.Node) } - }) + } } func TestAgent_serviceTokens(t *testing.T) { t.Parallel() - cfg := agent.TestConfig() tokens := new(token.Store) tokens.UpdateUserToken("default") - l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) + lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) + l := local.NewState(lcfg, nil, tokens, make(chan struct{}, 1)) l.AddService(&structs.NodeService{ID: "redis"}, "") @@ -1345,10 +1329,10 @@ func TestAgent_serviceTokens(t *testing.T) { func TestAgent_checkTokens(t *testing.T) { t.Parallel() - cfg := agent.TestConfig() tokens := new(token.Store) tokens.UpdateUserToken("default") - l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, tokens, make(chan struct{}, 1)) + lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) + l := local.NewState(lcfg, nil, tokens, make(chan struct{}, 1)) // Returns default when no token is set l.AddCheck(&structs.HealthCheck{CheckID: types.CheckID("mem")}, "") @@ -1371,7 +1355,8 @@ func TestAgent_checkTokens(t *testing.T) { func TestAgent_checkCriticalTime(t *testing.T) { t.Parallel() - l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) + lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) + l := local.NewState(lcfg, nil, new(token.Store), make(chan struct{}, 1)) svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000} l.AddService(svc, "") @@ -1433,7 +1418,8 @@ func TestAgent_checkCriticalTime(t *testing.T) { func TestAgent_AddCheckFailure(t *testing.T) { t.Parallel() - l := local.NewState(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`), nil, new(token.Store), make(chan struct{}, 1)) + lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) + l := local.NewState(lcfg, nil, new(token.Store), make(chan struct{}, 1)) // Add a check for a service that does not exist and verify that it fails checkID := types.CheckID("redis:1") @@ -1465,8 +1451,10 @@ func TestAgent_sendCoordinate(t *testing.T) { `) defer a.Shutdown() - t.Logf("%d %d %s", 
a.consulConfig().CoordinateUpdateBatchSize, a.consulConfig().CoordinateUpdateMaxBatches, - a.consulConfig().CoordinateUpdatePeriod.String()) + t.Logf("%d %d %s", + a.Config.ConsulCoordinateUpdateBatchSize, + a.Config.ConsulCoordinateUpdateMaxBatches, + a.Config.ConsulCoordinateUpdatePeriod.String()) // Make sure the coordinate is present. req := structs.DCSpecificRequest{ From 1181aaee6a65b7990e8d37c77425b610d91a776c Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:18 +0200 Subject: [PATCH 13/36] local state: rename tests --- agent/local/state_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index 49a70669a7..c01d582378 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1298,7 +1298,7 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { } } -func TestAgent_serviceTokens(t *testing.T) { +func TestAgent_ServiceTokens(t *testing.T) { t.Parallel() tokens := new(token.Store) @@ -1326,7 +1326,7 @@ func TestAgent_serviceTokens(t *testing.T) { } } -func TestAgent_checkTokens(t *testing.T) { +func TestAgent_CheckTokens(t *testing.T) { t.Parallel() tokens := new(token.Store) @@ -1353,7 +1353,7 @@ func TestAgent_checkTokens(t *testing.T) { } } -func TestAgent_checkCriticalTime(t *testing.T) { +func TestAgent_CheckCriticalTime(t *testing.T) { t.Parallel() lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) l := local.NewState(lcfg, nil, new(token.Store), make(chan struct{}, 1)) From 524981f367505898b582dac4f47e0c1b7337c0d1 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:18 +0200 Subject: [PATCH 14/36] local state: fix TestAgentAntiEntropy_EnableTagOverride The test had a race condition where it relied on the first service to be synced to the remote catalog which sometimes failed. --- agent/local/state_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index c01d582378..ebf187349d 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -246,6 +246,9 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { EnableTagOverride: true, } a.State.AddService(srv1, "") + if err := a.State.SyncChanges(); err != nil { + t.Fatalf("err: %v", err) + } srv1_mod := new(structs.NodeService) *srv1_mod = *srv1 srv1_mod.Port = 7100 From 5c77c59501d511d5002176f99a2e5a48a770188e Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 28 Aug 2017 14:17:19 +0200 Subject: [PATCH 15/36] local state: refactor TestAgentAntiEntropy_EnableTagOverride Make intent clearer by being more explicit and adding some comments. Use verify.Values to compare service entries. 
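For reviewers unfamiliar with verify.Values, a minimal sketch of the
comparison pattern the refactored test adopts is shown below. The helper
name verifyService and the import paths are illustrative assumptions, not
part of this change; the actual assertions live in
TestAgentAntiEntropy_EnableTagOverride in the diff that follows.

    package local_test

    import (
        "testing"

        "github.com/hashicorp/consul/agent/structs" // assumed import path
        "github.com/pascaldekloe/goe/verify"        // assumed import path
    )

    // verifyService compares a catalog entry against the expected service
    // definition. verify.Values reports the individual fields that differ,
    // which reads better than reflect.DeepEqual plus a %v dump of both structs.
    func verifyService(t *testing.T, got *structs.NodeService) {
        want := &structs.NodeService{
            ID:                "svc_id1",
            Service:           "svc1",
            Tags:              []string{"tag1_mod"},
            Port:              6100,
            EnableTagOverride: true,
        }
        // raft indexes differ per run, so zero them before comparing
        got.CreateIndex, got.ModifyIndex = 0, 0
        if !verify.Values(t, "svc_id1", got, want) {
            t.FailNow()
        }
    }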
--- agent/local/state_test.go | 114 +++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 52 deletions(-) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index ebf187349d..8ea69698a3 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -237,7 +237,7 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { } var out struct{} - // EnableTagOverride = true + // register a local service with tag override enabled srv1 := &structs.NodeService{ ID: "svc_id1", Service: "svc1", @@ -246,19 +246,8 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { EnableTagOverride: true, } a.State.AddService(srv1, "") - if err := a.State.SyncChanges(); err != nil { - t.Fatalf("err: %v", err) - } - srv1_mod := new(structs.NodeService) - *srv1_mod = *srv1 - srv1_mod.Port = 7100 - srv1_mod.Tags = []string{"tag1_mod"} - args.Service = srv1_mod - if err := a.RPC("Catalog.Register", args, &out); err != nil { - t.Fatalf("err: %v", err) - } - // EnableTagOverride = false + // register a local service with tag override disabled srv2 := &structs.NodeService{ ID: "svc_id2", Service: "svc2", @@ -267,15 +256,38 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { EnableTagOverride: false, } a.State.AddService(srv2, "") - srv2_mod := new(structs.NodeService) - *srv2_mod = *srv2 - srv2_mod.Port = 7200 - srv2_mod.Tags = []string{"tag2_mod"} - args.Service = srv2_mod + + // make sure they are both in the catalog + if err := a.State.SyncChanges(); err != nil { + t.Fatalf("err: %v", err) + } + + // update the services in the catalog and change the tags and port. + // Only tag changes should be propagated for services where tag + // override is enabled. + args.Service = &structs.NodeService{ + ID: srv1.ID, + Service: srv1.Service, + Tags: []string{"tag1_mod"}, + Port: 7100, + EnableTagOverride: true, + } if err := a.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } + args.Service = &structs.NodeService{ + ID: srv2.ID, + Service: srv2.Service, + Tags: []string{"tag2_mod"}, + Port: 7200, + EnableTagOverride: false, + } + if err := a.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // sync catalog and local state if err := a.State.SyncFull(); err != nil { t.Fatalf("err: %v", err) } @@ -286,44 +298,42 @@ func TestAgentAntiEntropy_EnableTagOverride(t *testing.T) { } var services structs.IndexedNodeServices - retry.Run(t, func(r *retry.R) { - // runtime.Gosched() - if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { - r.Fatalf("err: %v", err) - } + if err := a.RPC("Catalog.NodeServices", &req, &services); err != nil { + t.Fatalf("err: %v", err) + } - a.State.RLock() - defer a.State.RUnlock() - - // All the services should match - for id, serv := range services.NodeServices.Services { - serv.CreateIndex, serv.ModifyIndex = 0, 0 - switch id { - case "svc_id1": - if serv.ID != "svc_id1" || - serv.Service != "svc1" || - serv.Port != 6100 || - !reflect.DeepEqual(serv.Tags, []string{"tag1_mod"}) { - r.Fatalf("bad: %v %v", serv, srv1) - } - case "svc_id2": - if serv.ID != "svc_id2" || - serv.Service != "svc2" || - serv.Port != 6200 || - !reflect.DeepEqual(serv.Tags, []string{"tag2"}) { - r.Fatalf("bad: %v %v", serv, srv2) - } - case structs.ConsulServiceID: - // ignore - default: - r.Fatalf("unexpected service: %v", id) + // All the services should match + for id, serv := range services.NodeServices.Services { + serv.CreateIndex, serv.ModifyIndex = 0, 0 + switch id { + case 
"svc_id1": + // tags should be modified but not the port + got := serv + want := &structs.NodeService{ + ID: "svc_id1", + Service: "svc1", + Tags: []string{"tag1_mod"}, + Port: 6100, + EnableTagOverride: true, } + if !verify.Values(t, "", got, want) { + t.FailNow() + } + case "svc_id2": + got, want := serv, srv2 + if !verify.Values(t, "", got, want) { + t.FailNow() + } + case structs.ConsulServiceID: + // ignore + default: + t.Fatalf("unexpected service: %v", id) } + } - if err := servicesInSync(a.State, 2); err != nil { - r.Fatal(err) - } - }) + if err := servicesInSync(a.State, 2); err != nil { + t.Fatal(err) + } } func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) { From de57b16d99cb588399d6f7f374d52806ee6eb0a3 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Wed, 30 Aug 2017 12:25:49 +0200 Subject: [PATCH 16/36] local state: address review comments * move non-blocking notification mechanism into ae.Trigger * move Pause/Resume into separate type --- agent/ae/ae.go | 157 ++++++++++++++---------- agent/ae/ae_test.go | 2 +- agent/ae/trigger.go | 23 ++++ agent/agent.go | 48 +++----- agent/agent_endpoint_test.go | 2 +- agent/catalog_endpoint_test.go | 34 ------ agent/local/state.go | 211 ++++++++++++++++++--------------- agent/local/state_test.go | 38 +++--- 8 files changed, 270 insertions(+), 245 deletions(-) create mode 100644 agent/ae/trigger.go diff --git a/agent/ae/ae.go b/agent/ae/ae.go index b055f7aa17..6572d815bc 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -36,6 +36,11 @@ func scaleFactor(nodes int) int { return int(math.Ceil(math.Log2(float64(nodes))-math.Log2(float64(scaleThreshold))) + 1.0) } +type State interface { + SyncChanges() error + SyncFull() error +} + // StateSyncer manages background synchronization of the given state. // // The state is synchronized on a regular basis or on demand when either @@ -44,34 +49,45 @@ func scaleFactor(nodes int) int { // The regular state sychronization provides a self-healing mechanism // for the cluster which is also called anti-entropy. type StateSyncer struct { - // paused flags whether sync runs are temporarily disabled. - // Must be the first element due to a go bug. - // todo(fs): which bug? Is this still relevant? - paused int32 - // State contains the data that needs to be synchronized. - State interface { - SyncChanges() error - SyncFull() error - } + State State // Interval is the time between two regular sync runs. Interval time.Duration - // ClusterSize returns the number of members in the cluster to - // allow staggering the sync runs based on cluster size. - ClusterSize func() int - // ShutdownCh is closed when the application is shutting down. ShutdownCh chan struct{} - // ServerUpCh contains data when a new consul server has been added to the cluster. - ServerUpCh chan struct{} - - // TriggerCh contains data when a sync should run immediately. - TriggerCh chan struct{} - + // Logger is the logger. Logger *log.Logger + + // ClusterSize returns the number of members in the cluster to + // allow staggering the sync runs based on cluster size. + // This needs to be set before Run() is called. + ClusterSize func() int + + // SyncFull allows triggering an immediate but staggered full sync + // in a non-blocking way. + SyncFull *Trigger + + // SyncChanges allows triggering an immediate partial sync + // in a non-blocking way. + SyncChanges *Trigger + + // paused stores whether sync runs are temporarily disabled. 
+ paused *toggle +} + +func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { + return &StateSyncer{ + State: state, + Interval: intv, + ShutdownCh: shutdownCh, + Logger: logger, + SyncFull: NewTrigger(), + SyncChanges: NewTrigger(), + paused: new(toggle), + } } const ( @@ -86,6 +102,10 @@ const ( // Run is the long running method to perform state synchronization // between local and remote servers. func (s *StateSyncer) Run() { + if s.ClusterSize == nil { + panic("ClusterSize not set") + } + stagger := func(d time.Duration) time.Duration { f := scaleFactor(s.ClusterSize()) return lib.RandomStagger(time.Duration(f) * d) @@ -93,20 +113,18 @@ func (s *StateSyncer) Run() { FullSync: for { - switch err := s.State.SyncFull(); { - - // full sync failed - case err != nil: + // attempt a full sync + if err := s.State.SyncFull(); err != nil { s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) // retry full sync after some time or when a consul // server was added. select { - // consul server added to cluster. - // retry sooner than retryFailIntv to converge cluster sooner - // but stagger delay to avoid thundering herd - case <-s.ServerUpCh: + // trigger a full sync immediately. + // this is usually called when a consul server was added to the cluster. + // stagger the delay to avoid a thundering herd. + case <-s.SyncFull.Notif(): select { case <-time.After(stagger(serverUpIntv)): case <-s.ShutdownCh: @@ -121,36 +139,38 @@ FullSync: return } - // full sync OK - default: + continue + } - // do partial syncs until it is time for a full sync again - for { + // do partial syncs until it is time for a full sync again + for { + select { + // trigger a full sync immediately + // this is usually called when a consul server was added to the cluster. + // stagger the delay to avoid a thundering herd. + case <-s.SyncFull.Notif(): select { - // todo(fs): why don't we honor the ServerUpCh here as well? - // todo(fs): by default, s.Interval is 60s which is >> 3s (serverUpIntv) - // case <-s.ServerUpCh: - // select { - // case <-time.After(stagger(serverUpIntv)): - // continue Sync - // case <-s.ShutdownCh: - // return - // } - - case <-time.After(s.Interval + stagger(s.Interval)): + case <-time.After(stagger(serverUpIntv)): continue FullSync - - case <-s.TriggerCh: - if s.Paused() { - continue - } - if err := s.State.SyncChanges(); err != nil { - s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) - } - case <-s.ShutdownCh: return } + + // time for a full sync again + case <-time.After(s.Interval + stagger(s.Interval)): + continue FullSync + + // do partial syncs on demand + case <-s.SyncChanges.Notif(): + if s.Paused() { + continue + } + if err := s.State.SyncChanges(); err != nil { + s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) + } + + case <-s.ShutdownCh: + return } } } @@ -158,27 +178,38 @@ FullSync: // Pause temporarily disables sync runs. func (s *StateSyncer) Pause() { - atomic.AddInt32(&s.paused, 1) + s.paused.On() } // Paused returns whether sync runs are temporarily disabled. func (s *StateSyncer) Paused() bool { - return atomic.LoadInt32(&s.paused) > 0 + return s.paused.IsOn() } // Resume re-enables sync runs. func (s *StateSyncer) Resume() { - paused := atomic.AddInt32(&s.paused, -1) - if paused < 0 { - panic("unbalanced StateSyncer.Resume() detected") - } - s.triggerSync() + s.paused.Off() + s.SyncChanges.Trigger() } -// triggerSync queues a sync run if one has not been triggered already. 
-func (s *StateSyncer) triggerSync() { - select { - case s.TriggerCh <- struct{}{}: - default: +// toggle implements an on/off switch using methods from the atomic +// package. Since fields in structs that are accessed via +// atomic.Load/Add methods need to be aligned properly on some platforms +// we move that code into a separate struct. +// +// See https://golang.org/pkg/sync/atomic/#pkg-note-BUG for details +type toggle int32 + +func (p *toggle) On() { + atomic.AddInt32((*int32)(p), 1) +} + +func (p *toggle) Off() { + if atomic.AddInt32((*int32)(p), -1) < 0 { + panic("toggle not on") } } + +func (p *toggle) IsOn() bool { + return atomic.LoadInt32((*int32)(p)) > 0 +} diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index 7246e7e24a..3d3449d8b1 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -27,7 +27,7 @@ func TestAE_scaleFactor(t *testing.T) { func TestAE_nestedPauseResume(t *testing.T) { t.Parallel() - l := new(StateSyncer) + l := NewStateSyner(nil, 0, nil, nil) if l.Paused() != false { t.Fatal("syncer should be unPaused after init") } diff --git a/agent/ae/trigger.go b/agent/ae/trigger.go new file mode 100644 index 0000000000..1aa5f45862 --- /dev/null +++ b/agent/ae/trigger.go @@ -0,0 +1,23 @@ +package ae + +// Trigger implements a non-blocking event notifier. Events can be +// triggered without blocking and notifications happen only when the +// previous event was consumed. +type Trigger struct { + ch chan struct{} +} + +func NewTrigger() *Trigger { + return &Trigger{make(chan struct{}, 1)} +} + +func (t Trigger) Trigger() { + select { + case t.ch <- struct{}{}: + default: + } +} + +func (t Trigger) Notif() <-chan struct{} { + return t.ch +} diff --git a/agent/agent.go b/agent/agent.go index d800002e9f..d08307609d 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -263,27 +263,12 @@ func (a *Agent) Start() error { return fmt.Errorf("Failed to setup node ID: %v", err) } - // create a notif channel to trigger state sychronizations - // when a consul server was added to the cluster. - serverUpCh := make(chan struct{}, 1) - - // create a notif channel to trigger state synchronizations - // when the state has changed. - triggerCh := make(chan struct{}, 1) - // create the local state - a.State = local.NewState(LocalConfig(c), a.logger, a.tokens, triggerCh) + a.State = local.NewState(LocalConfig(c), a.logger, a.tokens) // create the state synchronization manager which performs // regular and on-demand state synchronizations (anti-entropy). - a.sync = &ae.StateSyncer{ - State: a.State, - Interval: c.AEInterval, - ShutdownCh: a.shutdownCh, - ServerUpCh: serverUpCh, - TriggerCh: triggerCh, - Logger: a.logger, - } + a.sync = ae.NewStateSyner(a.State, c.AEInterval, a.shutdownCh, a.logger) // create the config for the rpc server/client consulCfg, err := a.consulConfig() @@ -294,13 +279,7 @@ func (a *Agent) Start() error { // ServerUp is used to inform that a new consul server is now // up. This can be used to speed up the sync process if we are blocking // waiting to discover a consul server - // todo(fs): IMO, the non-blocking nature of this call should be hidden in the syncer - consulCfg.ServerUp = func() { - select { - case serverUpCh <- struct{}{}: - default: - } - } + consulCfg.ServerUp = a.sync.SyncFull.Trigger // Setup either the client or the server. 
if c.ServerMode { @@ -308,21 +287,25 @@ func (a *Agent) Start() error { if err != nil { return fmt.Errorf("Failed to start Consul server: %v", err) } - a.delegate = server - a.State.SetDelegate(server) - a.sync.ClusterSize = func() int { return len(server.LANMembers()) } } else { client, err := consul.NewClientLogger(consulCfg, a.logger) if err != nil { return fmt.Errorf("Failed to start Consul client: %v", err) } - a.delegate = client - a.State.SetDelegate(client) - a.sync.ClusterSize = func() int { return len(client.LANMembers()) } } + // the staggering of the state syncing depends on the cluster size. + a.sync.ClusterSize = func() int { return len(a.delegate.LANMembers()) } + + // link the state with the consul server/client and the state syncer + // via callbacks. After several attempts this was easier than using + // channels since the event notification needs to be non-blocking + // and that should be hidden in the state syncer implementation. + a.State.Delegate = a.delegate + a.State.TriggerSyncChanges = a.sync.SyncChanges.Trigger + // Load checks/services/metadata. if err := a.loadServices(c); err != nil { return err @@ -1316,7 +1299,7 @@ func (a *Agent) WANMembers() []serf.Member { // This is called to prevent a race between clients and the anti-entropy routines func (a *Agent) StartSync() { go a.sync.Run() - a.logger.Printf("[INFO] agent: starting state syncer") + a.logger.Printf("[INFO] agent: started state syncer") } // PauseSync is used to pause anti-entropy while bulk changes are make @@ -2173,8 +2156,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error { return nil } -// unloadServices will deregister all services other than the 'consul' service -// known to the local agent. +// unloadServices will deregister all services. 
func (a *Agent) unloadServices() error { for id := range a.State.Services() { if err := a.RemoveService(id, false); err != nil { diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index d735d3b51a..55fd2536ce 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -1271,7 +1271,7 @@ func TestAgent_RegisterService_TranslateKeys(t *testing.T) { EnableTagOverride: true, } - if got, want := a.state.Services()["test"], svc; !verify.Values(t, "", got, want) { + if got, want := a.State.Service("test"), svc; !verify.Values(t, "", got, want) { t.Fail() } } diff --git a/agent/catalog_endpoint_test.go b/agent/catalog_endpoint_test.go index fa73dbeee8..8459291173 100644 --- a/agent/catalog_endpoint_test.go +++ b/agent/catalog_endpoint_test.go @@ -12,40 +12,6 @@ import ( "github.com/hashicorp/serf/coordinate" ) -func TestCatalogRegister(t *testing.T) { - t.Skip("skipping since it is not clear what this test is supposed to verify") - - t.Parallel() - a := NewTestAgent(t.Name(), "") - defer a.Shutdown() - - // Register node - args := &structs.RegisterRequest{ - Node: "foo", - Address: "127.0.0.1", - } - req, _ := http.NewRequest("PUT", "/v1/catalog/register", jsonReader(args)) - obj, err := a.srv.CatalogRegister(nil, req) - if err != nil { - t.Fatalf("err: %v", err) - } - res := obj.(bool) - if res != true { - t.Fatalf("bad: %v", res) - } - - if err := a.State.SyncChanges(); err != nil { - t.Fatal("sync failed: ", err) - } - s := a.State.ServiceState("foo") - if s == nil { - t.Fatal("service 'foo' missing") - } - if !s.InSync { - t.Fatalf("service 'foo' should be in sync") - } -} - func TestCatalogRegister_Service_InvalidAddress(t *testing.T) { t.Parallel() a := NewTestAgent(t.Name(), "") diff --git a/agent/local/state.go b/agent/local/state.go index b980c569f5..91bfc032dd 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -18,9 +18,6 @@ import ( "github.com/hashicorp/consul/types" ) -// permissionDenied is returned when an ACL based rejection happens. -const permissionDenied = "Permission denied" - // Config is the configuration for the State. It is // populated during NewLocalAgent from the agent configuration to avoid // race conditions with the agent configuration. @@ -39,7 +36,8 @@ type ServiceState struct { // Service is the local copy of the service record. Service *structs.NodeService - // Token is the ACL to update the service record on the server. + // Token is the ACL to update or delete the service record on the + // server. Token string // InSync contains whether the local state of the service record @@ -64,8 +62,8 @@ type CheckState struct { // Check is the local copy of the health check record. Check *structs.HealthCheck - // Token is the ACL record to update the health check record - // on the server. + // Token is the ACL record to update or delete the health check + // record on the server. Token string // CriticalTime is the last time the health check status went @@ -74,8 +72,8 @@ type CheckState struct { CriticalTime time.Time // DeferCheck is used to delay the sync of a health check when - // only the status has changed. - // todo(fs): ^^ this needs double checking... + // only the output has changed. This rate limits changes which + // do not affect the state of the node and/or service. 
DeferCheck *time.Timer // InSync contains whether the local state of the health check @@ -107,7 +105,7 @@ func (c *CheckState) CriticalFor() time.Duration { return time.Since(c.CriticalTime) } -type delegate interface { +type rpc interface { RPC(method string, args interface{}, reply interface{}) error } @@ -116,14 +114,25 @@ type delegate interface { // catalog representation type State struct { sync.RWMutex + + // Delegate the RPC interface to the consul server or agent. + // + // It is set after both the state and the consul server/agent have + // been created. + Delegate rpc + + // TriggerSyncChanges is used to notify the state syncer that a + // partial sync should be performed. + // + // It is set after both the state and the state syncer have been + // created. + TriggerSyncChanges func() + logger *log.Logger // Config is the agent config config Config - // delegate is the consul interface to use for keeping in sync - delegate delegate - // nodeInfoInSync tracks whether the server has our correct top-level // node information in sync nodeInfoInSync bool @@ -134,13 +143,9 @@ type State struct { // Checks tracks the local checks checks map[types.CheckID]*CheckState - // metadata tracks the local metadata fields + // metadata tracks the node metadata fields metadata map[string]string - // triggerCh is used to inform of a change to local state - // that requires anti-entropy with the server - triggerCh chan struct{} - // discardCheckOutput stores whether the output of health checks // is stored in the raft log. discardCheckOutput atomic.Value // bool @@ -150,33 +155,19 @@ type State struct { } // NewLocalState creates a is used to initialize the local state -func NewState(c Config, lg *log.Logger, tokens *token.Store, triggerCh chan struct{}) *State { +func NewState(c Config, lg *log.Logger, tokens *token.Store) *State { l := &State{ - config: c, - logger: lg, - services: make(map[string]*ServiceState), - checks: make(map[types.CheckID]*CheckState), - metadata: make(map[string]string), - triggerCh: triggerCh, - tokens: tokens, + config: c, + logger: lg, + services: make(map[string]*ServiceState), + checks: make(map[types.CheckID]*CheckState), + metadata: make(map[string]string), + tokens: tokens, } - l.discardCheckOutput.Store(c.DiscardCheckOutput) + l.SetDiscardCheckOutput(c.DiscardCheckOutput) return l } -func (l *State) SetDelegate(d delegate) { - l.delegate = d -} - -// changeMade is used to trigger an anti-entropy run -func (l *State) changeMade() { - // todo(fs): IMO, the non-blocking nature of this call should be hidden in the syncer - select { - case l.triggerCh <- struct{}{}: - default: - } -} - func (l *State) SetDiscardCheckOutput(b bool) { l.discardCheckOutput.Store(b) } @@ -204,14 +195,12 @@ func (l *State) serviceToken(id string) string { // AddService is used to add a service entry to the local state. // This entry is persistent and the agent will make a best effort to // ensure it is registered -// todo(fs): where is the persistence happening? func (l *State) AddService(service *structs.NodeService, token string) error { if service == nil { return fmt.Errorf("no service") } // use the service name as id if the id was omitted - // todo(fs): is this for backwards compatibility? if service.ID == "" { service.ID = service.Service } @@ -228,7 +217,7 @@ func (l *State) AddServiceState(s *ServiceState) { defer l.Unlock() l.services[s.Service.ID] = s - l.changeMade() + l.TriggerSyncChanges() } // RemoveService is used to remove a service entry from the local state. 
@@ -247,7 +236,7 @@ func (l *State) RemoveService(id string) error { // entry around until it is actually removed. s.InSync = false s.Deleted = true - l.changeMade() + l.TriggerSyncChanges() return nil } @@ -366,7 +355,7 @@ func (l *State) AddCheckState(c *CheckState) { defer l.Unlock() l.checks[c.Check.CheckID] = c - l.changeMade() + l.TriggerSyncChanges() } // RemoveCheck is used to remove a health check from the local state. @@ -387,7 +376,7 @@ func (l *State) RemoveCheck(id types.CheckID) error { // entry around until it is actually removed. c.InSync = false c.Deleted = true - l.changeMade() + l.TriggerSyncChanges() return nil } @@ -443,7 +432,7 @@ func (l *State) UpdateCheck(id types.CheckID, status, output string) { return } c.InSync = false - l.changeMade() + l.TriggerSyncChanges() }) } return @@ -453,7 +442,7 @@ func (l *State) UpdateCheck(id types.CheckID, status, output string) { c.Check.Status = status c.Check.Output = output c.InSync = false - l.changeMade() + l.TriggerSyncChanges() } // Check returns the locally registered check that the @@ -549,12 +538,12 @@ func (l *State) updateSyncState() error { } var out1 structs.IndexedNodeServices - if err := l.delegate.RPC("Catalog.NodeServices", &req, &out1); err != nil { + if err := l.Delegate.RPC("Catalog.NodeServices", &req, &out1); err != nil { return err } var out2 structs.IndexedHealthChecks - if err := l.delegate.RPC("Health.NodeChecks", &req, &out2); err != nil { + if err := l.Delegate.RPC("Health.NodeChecks", &req, &out2); err != nil { return err } @@ -605,8 +594,7 @@ func (l *State) updateSyncState() error { continue } - // If the service is scheduled for removal skip it. - // todo(fs): is this correct? + // If the service is already scheduled for removal skip it if ls.Deleted { continue } @@ -646,8 +634,7 @@ func (l *State) updateSyncState() error { continue } - // If the check is scheduled for removal skip it. - // todo(fs): is this correct? + // If the check is already scheduled for removal skip it. if lc.Deleted { continue } @@ -687,10 +674,13 @@ func (l *State) updateSyncState() error { func (l *State) SyncFull() error { // note that we do not acquire the lock here since the methods // we are calling will do that themself. + // + // Also note that we don't hold the lock for the entire operation + // but release it between the two calls. This is not an issue since + // the algorithm is best-effort to achieve eventual consistency. + // SyncChanges will sync whatever updateSyncState() has determined + // needs updating. - // todo(fs): is it an issue that we do not hold the lock for the entire time? - // todo(fs): IMO, this doesn't matter since SyncChanges will sync whatever - // todo(fs): was determined in the update step. 
if err := l.updateSyncState(); err != nil { return err } @@ -764,7 +754,7 @@ func (l *State) LoadMetadata(data map[string]string) error { for k, v := range data { l.metadata[k] = v } - l.changeMade() + l.TriggerSyncChanges() return nil } @@ -815,19 +805,24 @@ func (l *State) deleteService(id string) error { WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)}, } var out struct{} - err := l.delegate.RPC("Catalog.Deregister", &req, &out) - if err == nil || strings.Contains(err.Error(), "Unknown service") { + err := l.Delegate.RPC("Catalog.Deregister", &req, &out) + switch { + case err == nil || strings.Contains(err.Error(), "Unknown service"): delete(l.services, id) - l.logger.Printf("[INFO] agent: Deregistered service '%s'", id) + l.logger.Printf("[INFO] agent: Deregistered service %q", id) return nil - } - if acl.IsErrPermissionDenied(err) { - // todo(fs): why is the service in sync here? + + case acl.IsErrPermissionDenied(err): + // todo(fs): mark the service to be in sync to prevent excessive retrying before next full sync + // todo(fs): some backoff strategy might be a better solution l.services[id].InSync = true - l.logger.Printf("[WARN] agent: Service '%s' deregistration blocked by ACLs", id) + l.logger.Printf("[WARN] agent: Service %q deregistration blocked by ACLs", id) return nil + + default: + l.logger.Printf("[WARN] agent: Deregistering service %q failed. %s", id, err) + return err } - return err } // deleteCheck is used to delete a check from the server @@ -843,20 +838,28 @@ func (l *State) deleteCheck(id types.CheckID) error { WriteRequest: structs.WriteRequest{Token: l.checkToken(id)}, } var out struct{} - err := l.delegate.RPC("Catalog.Deregister", &req, &out) - if err == nil || strings.Contains(err.Error(), "Unknown check") { - // todo(fs): do we need to stop the deferCheck timer here? + err := l.Delegate.RPC("Catalog.Deregister", &req, &out) + switch { + case err == nil || strings.Contains(err.Error(), "Unknown check"): + c := l.checks[id] + if c != nil && c.DeferCheck != nil { + c.DeferCheck.Stop() + } delete(l.checks, id) - l.logger.Printf("[INFO] agent: Deregistered check '%s'", id) + l.logger.Printf("[INFO] agent: Deregistered check %q", id) return nil - } - if acl.IsErrPermissionDenied(err) { - // todo(fs): why is the check in sync here? + + case acl.IsErrPermissionDenied(err): + // todo(fs): mark the check to be in sync to prevent excessive retrying before next full sync + // todo(fs): some backoff strategy might be a better solution l.checks[id].InSync = true - l.logger.Printf("[WARN] agent: Check '%s' deregistration blocked by ACLs", id) + l.logger.Printf("[WARN] agent: Check %q deregistration blocked by ACLs", id) return nil + + default: + l.logger.Printf("[WARN] agent: Deregistering check %q failed. %s", id, err) + return err } - return err } // syncService is used to sync a service to the server @@ -900,8 +903,9 @@ func (l *State) syncService(id string) error { } var out struct{} - err := l.delegate.RPC("Catalog.Register", &req, &out) - if err == nil { + err := l.Delegate.RPC("Catalog.Register", &req, &out) + switch { + case err == nil: l.services[id].InSync = true // Given how the register API works, this info is also updated // every time we sync a service. 
@@ -909,20 +913,23 @@ func (l *State) syncService(id string) error { for _, check := range checks { l.checks[check.CheckID].InSync = true } - l.logger.Printf("[INFO] agent: Synced service '%s'", id) + l.logger.Printf("[INFO] agent: Synced service %q", id) return nil - } - if acl.IsErrPermissionDenied(err) { - // todo(fs): why are the service and the checks in sync here? - // todo(fs): why is the node info not in sync here? + + case acl.IsErrPermissionDenied(err): + // todo(fs): mark the service and the checks to be in sync to prevent excessive retrying before next full sync + // todo(fs): some backoff strategy might be a better solution l.services[id].InSync = true for _, check := range checks { l.checks[check.CheckID].InSync = true } - l.logger.Printf("[WARN] agent: Service '%s' registration blocked by ACLs", id) + l.logger.Printf("[WARN] agent: Service %q registration blocked by ACLs", id) return nil + + default: + l.logger.Printf("[WARN] agent: Syncing service %q failed. %s", id, err) + return err } - return err } // syncCheck is used to sync a check to the server @@ -947,22 +954,27 @@ func (l *State) syncCheck(id types.CheckID) error { } var out struct{} - err := l.delegate.RPC("Catalog.Register", &req, &out) - if err == nil { + err := l.Delegate.RPC("Catalog.Register", &req, &out) + switch { + case err == nil: l.checks[id].InSync = true // Given how the register API works, this info is also updated // every time we sync a check. l.nodeInfoInSync = true - l.logger.Printf("[INFO] agent: Synced check '%s'", id) + l.logger.Printf("[INFO] agent: Synced check %q", id) return nil - } - if acl.IsErrPermissionDenied(err) { - // todo(fs): why is the check in sync here? + + case acl.IsErrPermissionDenied(err): + // todo(fs): mark the check to be in sync to prevent excessive retrying before next full sync + // todo(fs): some backoff strategy might be a better solution l.checks[id].InSync = true - l.logger.Printf("[WARN] agent: Check '%s' registration blocked by ACLs", id) + l.logger.Printf("[WARN] agent: Check %q registration blocked by ACLs", id) return nil + + default: + l.logger.Printf("[WARN] agent: Syncing check %q failed. %s", id, err) + return err } - return err } func (l *State) syncNodeInfo() error { @@ -976,17 +988,22 @@ func (l *State) syncNodeInfo() error { WriteRequest: structs.WriteRequest{Token: l.tokens.AgentToken()}, } var out struct{} - err := l.delegate.RPC("Catalog.Register", &req, &out) - if err == nil { + err := l.Delegate.RPC("Catalog.Register", &req, &out) + switch { + case err == nil: l.nodeInfoInSync = true l.logger.Printf("[INFO] agent: Synced node info") return nil - } - if acl.IsErrPermissionDenied(err) { - // todo(fs): why is the node info in sync here? + + case acl.IsErrPermissionDenied(err): + // todo(fs): mark the node info to be in sync to prevent excessive retrying before next full sync + // todo(fs): some backoff strategy might be a better solution l.nodeInfoInSync = true l.logger.Printf("[WARN] agent: Node info update blocked by ACLs") return nil + + default: + l.logger.Printf("[WARN] agent: Syncing node info failed. 
%s", err) + return err } - return err } diff --git a/agent/local/state_test.go b/agent/local/state_test.go index 8ea69698a3..b092998aa6 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1011,16 +1011,18 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { t.Parallel() - a := NewTestAgent(t.Name(), ` + a := agent.NewTestAgent(t.Name(), ` discard_check_output = true check_update_interval = "0s" # set to "0s" since otherwise output checks are deferred `) defer a.Shutdown() inSync := func(id string) bool { - a.state.Lock() - defer a.state.Unlock() - return a.state.checkStatus[types.CheckID(id)].inSync + s := a.State.CheckState(types.CheckID(id)) + if s == nil { + return false + } + return s.InSync } // register a check @@ -1031,7 +1033,7 @@ func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { Status: api.HealthPassing, Output: "first output", } - if err := a.state.AddCheck(check, ""); err != nil { + if err := a.State.AddCheck(check, ""); err != nil { t.Fatalf("bad: %s", err) } @@ -1045,15 +1047,15 @@ func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { // update the check with the same status but different output // and the check should still be in sync. - a.state.UpdateCheck(check.CheckID, api.HealthPassing, "second output") + a.State.UpdateCheck(check.CheckID, api.HealthPassing, "second output") if !inSync("web") { t.Fatal("check should be in sync") } // disable discarding of check output and update the check again with different // output. Then the check should be out of sync. - a.state.SetDiscardCheckOutput(false) - a.state.UpdateCheck(check.CheckID, api.HealthPassing, "third output") + a.State.SetDiscardCheckOutput(false) + a.State.UpdateCheck(check.CheckID, api.HealthPassing, "third output") if inSync("web") { t.Fatal("check should be out of sync") } @@ -1316,8 +1318,9 @@ func TestAgent_ServiceTokens(t *testing.T) { tokens := new(token.Store) tokens.UpdateUserToken("default") - lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) - l := local.NewState(lcfg, nil, tokens, make(chan struct{}, 1)) + cfg := config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`) + l := local.NewState(agent.LocalConfig(cfg), nil, tokens) + l.TriggerSyncChanges = func() {} l.AddService(&structs.NodeService{ID: "redis"}, "") @@ -1344,8 +1347,9 @@ func TestAgent_CheckTokens(t *testing.T) { tokens := new(token.Store) tokens.UpdateUserToken("default") - lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) - l := local.NewState(lcfg, nil, tokens, make(chan struct{}, 1)) + cfg := config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`) + l := local.NewState(agent.LocalConfig(cfg), nil, tokens) + l.TriggerSyncChanges = func() {} // Returns default when no token is set l.AddCheck(&structs.HealthCheck{CheckID: types.CheckID("mem")}, "") @@ -1368,8 +1372,9 @@ func TestAgent_CheckTokens(t *testing.T) { func TestAgent_CheckCriticalTime(t *testing.T) { t.Parallel() - lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) - l := local.NewState(lcfg, nil, new(token.Store), make(chan struct{}, 1)) + cfg := config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`) + l := local.NewState(agent.LocalConfig(cfg), nil, new(token.Store)) + l.TriggerSyncChanges = func() {} svc := &structs.NodeService{ID: "redis", Service: "redis", Port: 8000} 
l.AddService(svc, "") @@ -1431,8 +1436,9 @@ func TestAgent_CheckCriticalTime(t *testing.T) { func TestAgent_AddCheckFailure(t *testing.T) { t.Parallel() - lcfg := agent.LocalConfig(config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`)) - l := local.NewState(lcfg, nil, new(token.Store), make(chan struct{}, 1)) + cfg := config.DefaultRuntimeConfig(`bind_addr = "127.0.0.1" data_dir = "dummy"`) + l := local.NewState(agent.LocalConfig(cfg), nil, new(token.Store)) + l.TriggerSyncChanges = func() {} // Add a check for a service that does not exist and verify that it fails checkID := types.CheckID("redis:1") From 41c7b0927e1c0c39f85e4778f95a6b366d1b1454 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Wed, 18 Oct 2017 15:05:57 +0200 Subject: [PATCH 17/36] local state: update comments --- agent/local/state.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/agent/local/state.go b/agent/local/state.go index 91bfc032dd..0b0cf3c602 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -18,9 +18,7 @@ import ( "github.com/hashicorp/consul/types" ) -// Config is the configuration for the State. It is -// populated during NewLocalAgent from the agent configuration to avoid -// race conditions with the agent configuration. +// Config is the configuration for the State. type Config struct { AdvertiseAddr string CheckUpdateInterval time.Duration @@ -110,7 +108,7 @@ type rpc interface { } // State is used to represent the node's services, -// and checks. We used it to perform anti-entropy with the +// and checks. We use it to perform anti-entropy with the // catalog representation type State struct { sync.RWMutex @@ -154,7 +152,7 @@ type State struct { tokens *token.Store } -// NewLocalState creates a is used to initialize the local state +// NewLocalState creates a new local state for the agent. func NewState(c Config, lg *log.Logger, tokens *token.Store) *State { l := &State{ config: c, @@ -168,6 +166,8 @@ func NewState(c Config, lg *log.Logger, tokens *token.Store) *State { return l } +// SetDiscardCheckOutput configures whether the check output +// is discarded. This can be changed at runtime. func (l *State) SetDiscardCheckOutput(b bool) { l.discardCheckOutput.Store(b) } @@ -181,6 +181,7 @@ func (l *State) ServiceToken(id string) string { } // serviceToken returns an ACL token associated with a service. +// This method is not synchronized and the lock must already be held. func (l *State) serviceToken(id string) string { var token string if s := l.services[id]; s != nil { @@ -310,6 +311,7 @@ func (l *State) CheckToken(checkID types.CheckID) string { } // checkToken returns an ACL token associated with a check. +// This method is not synchronized and the lock must already be held. func (l *State) checkToken(id types.CheckID) string { var token string c := l.checks[id] @@ -483,7 +485,7 @@ func (l *State) CheckState(id types.CheckID) *CheckState { } // CheckStates returns a shallow copy of all health check state records. -// The health check records and the deferred checks still point to +// The health check records and the deferred checks still point to // the original values and must not be modified. 
func (l *State) CheckStates() map[types.CheckID]*CheckState { l.RLock() @@ -500,7 +502,10 @@ func (l *State) CheckStates() map[types.CheckID]*CheckState { } // CriticalCheckStates returns the locally registered checks that the -// agent is aware of and are being kept in sync with the server +// agent is aware of and are being kept in sync with the server. +// The map contains a shallow copy of the current check states but +// references to the actual check definition which must not be +// modified. func (l *State) CriticalCheckStates() map[types.CheckID]*CheckState { l.RLock() defer l.RUnlock() From 4f9e05f634763cdd49be3c69a73037de8ff7fc67 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Wed, 18 Oct 2017 15:11:49 +0200 Subject: [PATCH 18/36] local state: update documentation of updateSyncState --- agent/local/state.go | 57 ++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/agent/local/state.go b/agent/local/state.go index 0b0cf3c602..744aa2c601 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -525,6 +525,7 @@ func (l *State) CriticalCheckStates() map[types.CheckID]*CheckState { func (l *State) Metadata() map[string]string { l.RLock() defer l.RUnlock() + m := make(map[string]string) for k, v := range l.metadata { m[k] = v @@ -535,7 +536,7 @@ func (l *State) Metadata() map[string]string { // updateSyncState does a read of the server state, and updates // the local sync status as appropriate func (l *State) updateSyncState() error { - // 1. get all checks and services from the master + // Get all checks and services from the master req := structs.NodeSpecificRequest{ Datacenter: l.config.Datacenter, Node: l.config.NodeName, @@ -552,7 +553,7 @@ func (l *State) updateSyncState() error { return err } - // 2. create useful data structures for traversal + // Create useful data structures for traversal remoteServices := make(map[string]*structs.NodeService) if out1.NodeServices != nil { remoteServices = out1.NodeServices.Services @@ -563,12 +564,13 @@ func (l *State) updateSyncState() error { remoteChecks[rc.CheckID] = rc } - // 3. perform sync + // Traverse all checks, services and the node info to determine + // which entries need to be updated on or removed from the server l.Lock() defer l.Unlock() - // sync node info + // Check if node info needs syncing if out1.NodeServices == nil || out1.NodeServices.Node == nil || out1.NodeServices.Node.ID != l.config.NodeID || !reflect.DeepEqual(out1.NodeServices.Node.TaggedAddresses, l.config.TaggedAddresses) || @@ -576,25 +578,30 @@ func (l *State) updateSyncState() error { l.nodeInfoInSync = false } - // sync services + // Check which services need syncing - // sync local services that do not exist remotely + // Look for local services that do not exist remotely and mark them for + // syncing so that they will be pushed to the server later for id, s := range l.services { if remoteServices[id] == nil { s.InSync = false } } + // Traverse the list of services from the server. + // Remote services which do not exist locally have been deregistered. + // Otherwise, check whether the two definitions are still in sync. for id, rs := range remoteServices { - // If we don't have the service locally, deregister it ls := l.services[id] if ls == nil { - // The consul service is created automatically and does - // not need to be deregistered. 
+ // The consul service is managed automatically and does + // not need to be deregistered if id == structs.ConsulServiceID { continue } + // Mark a remote service that does not exist locally as deleted so + // that it will be removed on the server later. l.services[id] = &ServiceState{Deleted: true} continue } @@ -614,19 +621,22 @@ func (l *State) updateSyncState() error { ls.InSync = ls.Service.IsSame(rs) } - // sync checks + // Check which checks need syncing - // sync local checks which do not exist remotely + // Look for local checks that do not exist remotely and mark them for + // syncing so that they will be pushed to the server later for id, c := range l.checks { if remoteChecks[id] == nil { c.InSync = false } } + // Traverse the list of checks from the server. + // Remote checks which do not exist locally have been deregistered. + // Otherwise, check whether the two definitions are still in sync. for id, rc := range remoteChecks { lc := l.checks[id] - // If we don't have the check locally, deregister it if lc == nil { // The Serf check is created automatically and does not // need to be deregistered. @@ -635,6 +645,8 @@ func (l *State) updateSyncState() error { continue } + // Mark a remote check that does not exist locally as deleted so + // that it will be removed on the server later. l.checks[id] = &CheckState{Deleted: true} continue } @@ -692,8 +704,8 @@ func (l *State) SyncFull() error { return l.SyncChanges() } -// SyncChanges is used to scan the status our local services and checks -// and update any that are out of sync with the server +// SyncChanges pushes checks, services and node info data which has been +// marked out of sync or deleted to the server. func (l *State) SyncChanges() error { l.Lock() defer l.Unlock() @@ -703,6 +715,7 @@ func (l *State) SyncChanges() error { // API works. // Sync the services + // (logging happens in the helper methods) for id, s := range l.services { var err error switch { @@ -711,13 +724,15 @@ func (l *State) SyncChanges() error { case !s.InSync: err = l.syncService(id) default: - l.logger.Printf("[DEBUG] agent: Service '%s' in sync", id) + l.logger.Printf("[DEBUG] agent: Service %q in sync", id) } if err != nil { return err } } + // Sync the checks + // (logging happens in the helper methods) for id, c := range l.checks { var err error switch { @@ -730,7 +745,7 @@ func (l *State) SyncChanges() error { } err = l.syncCheck(id) default: - l.logger.Printf("[DEBUG] agent: Check '%s' in sync", id) + l.logger.Printf("[DEBUG] agent: Check %q in sync", id) } if err != nil { return err @@ -739,15 +754,11 @@ func (l *State) SyncChanges() error { // Now sync the node level info if we need to, and didn't do any of // the other sync operations. 
- if !l.nodeInfoInSync { - if err := l.syncNodeInfo(); err != nil { - return err - } - } else { + if l.nodeInfoInSync { l.logger.Printf("[DEBUG] agent: Node info in sync") + return nil } - - return nil + return l.syncNodeInfo() } // LoadMetadata loads node metadata fields from the agent config and From e144f51b2915f2d2e7c4a55f621ed7398704d032 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Wed, 18 Oct 2017 15:12:28 +0200 Subject: [PATCH 19/36] local state: move Metadata methods together --- agent/local/state.go | 94 ++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/agent/local/state.go b/agent/local/state.go index 744aa2c601..9f2aebb290 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -533,6 +533,53 @@ func (l *State) Metadata() map[string]string { return m } +// LoadMetadata loads node metadata fields from the agent config and +// updates them on the local agent. +func (l *State) LoadMetadata(data map[string]string) error { + l.Lock() + defer l.Unlock() + + for k, v := range data { + l.metadata[k] = v + } + l.TriggerSyncChanges() + return nil +} + +// UnloadMetadata resets the local metadata state +func (l *State) UnloadMetadata() { + l.Lock() + defer l.Unlock() + l.metadata = make(map[string]string) +} + +// Stats is used to get various debugging state from the sub-systems +func (l *State) Stats() map[string]string { + l.RLock() + defer l.RUnlock() + + services := 0 + for _, s := range l.services { + if s.Deleted { + continue + } + services++ + } + + checks := 0 + for _, c := range l.checks { + if c.Deleted { + continue + } + checks++ + } + + return map[string]string{ + "services": strconv.Itoa(services), + "checks": strconv.Itoa(checks), + } +} + // updateSyncState does a read of the server state, and updates // the local sync status as appropriate func (l *State) updateSyncState() error { @@ -761,53 +808,6 @@ func (l *State) SyncChanges() error { return l.syncNodeInfo() } -// LoadMetadata loads node metadata fields from the agent config and -// updates them on the local agent. 
-func (l *State) LoadMetadata(data map[string]string) error { - l.Lock() - defer l.Unlock() - - for k, v := range data { - l.metadata[k] = v - } - l.TriggerSyncChanges() - return nil -} - -// UnloadMetadata resets the local metadata state -func (l *State) UnloadMetadata() { - l.Lock() - defer l.Unlock() - l.metadata = make(map[string]string) -} - -// Stats is used to get various debugging state from the sub-systems -func (l *State) Stats() map[string]string { - l.RLock() - defer l.RUnlock() - - services := 0 - for _, s := range l.services { - if s.Deleted { - continue - } - services++ - } - - checks := 0 - for _, c := range l.checks { - if c.Deleted { - continue - } - checks++ - } - - return map[string]string{ - "services": strconv.Itoa(services), - "checks": strconv.Itoa(checks), - } -} - // deleteService is used to delete a service from the server func (l *State) deleteService(id string) error { if id == "" { From 58d52ac58031b4960ed6b2c5551a9c989b209866 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Wed, 18 Oct 2017 15:07:19 +0200 Subject: [PATCH 20/36] local state: rename Add{Check,Service}State to Set{Check,Service}State --- agent/local/state.go | 42 ++++++++++++++++++++++----------------- agent/local/state_test.go | 4 ++-- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/agent/local/state.go b/agent/local/state.go index 9f2aebb290..69e136816a 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -206,21 +206,13 @@ func (l *State) AddService(service *structs.NodeService, token string) error { service.ID = service.Service } - l.AddServiceState(&ServiceState{ + l.SetServiceState(&ServiceState{ Service: service, Token: token, }) return nil } -func (l *State) AddServiceState(s *ServiceState) { - l.Lock() - defer l.Unlock() - - l.services[s.Service.ID] = s - l.TriggerSyncChanges() -} - // RemoveService is used to remove a service entry from the local state. // The agent will make a best effort to ensure it is deregistered. func (l *State) RemoveService(id string) error { @@ -285,6 +277,17 @@ func (l *State) ServiceState(id string) *ServiceState { return s.Clone() } +// SetServiceState is used to overwrite a raw service state with the given +// state. This method is safe to be called concurrently but should only be used +// during testing. You should most likely call AddService instead. +func (l *State) SetServiceState(s *ServiceState) { + l.Lock() + defer l.Unlock() + + l.services[s.Service.ID] = s + l.TriggerSyncChanges() +} + // ServiceStates returns a shallow copy of all service state records. // The service record still points to the original service record and // must not be modified. @@ -345,21 +348,13 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { // hard-set the node name check.Node = l.config.NodeName - l.AddCheckState(&CheckState{ + l.SetCheckState(&CheckState{ Check: check, Token: token, }) return nil } -func (l *State) AddCheckState(c *CheckState) { - l.Lock() - defer l.Unlock() - - l.checks[c.Check.CheckID] = c - l.TriggerSyncChanges() -} - // RemoveCheck is used to remove a health check from the local state. // The agent will make a best effort to ensure it is deregistered // todo(fs): RemoveService returns an error for a non-existant service. RemoveCheck should as well. @@ -484,6 +479,17 @@ func (l *State) CheckState(id types.CheckID) *CheckState { return c.Clone() } +// SetCheckState is used to overwrite a raw check state with the given +// state. 
This method is safe to be called concurrently but should only be used +// during testing. You should most likely call AddCheck instead. +func (l *State) SetCheckState(c *CheckState) { + l.Lock() + defer l.Unlock() + + l.checks[c.Check.CheckID] = c + l.TriggerSyncChanges() +} + // CheckStates returns a shallow copy of all health check state records. // The health check records and the deferred checks still point to // the original values and must not be modified. diff --git a/agent/local/state_test.go b/agent/local/state_test.go index b092998aa6..d40300a4c2 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -108,7 +108,7 @@ func TestAgentAntiEntropy_Services(t *testing.T) { Tags: []string{}, Port: 11211, } - a.State.AddServiceState(&local.ServiceState{ + a.State.SetServiceState(&local.ServiceState{ Service: srv6, InSync: true, }) @@ -679,7 +679,7 @@ func TestAgentAntiEntropy_Checks(t *testing.T) { Name: "cache", Status: api.HealthPassing, } - a.State.AddCheckState(&local.CheckState{ + a.State.SetCheckState(&local.CheckState{ Check: chk5, InSync: true, }) From aba072bd1d679f5e25e7df90f8996a18cc00c976 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Wed, 18 Oct 2017 16:50:15 +0200 Subject: [PATCH 21/36] ae: ensure that syncs are blocked when paused --- agent/ae/ae.go | 74 +++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 6572d815bc..4ec72ed469 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -2,9 +2,10 @@ package ae import ( + "errors" "log" "math" - "sync/atomic" + "sync" "time" "github.com/hashicorp/consul/lib" @@ -75,7 +76,8 @@ type StateSyncer struct { SyncChanges *Trigger // paused stores whether sync runs are temporarily disabled. - paused *toggle + pauseLock sync.Mutex + paused bool } func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { @@ -86,7 +88,6 @@ func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, lo Logger: logger, SyncFull: NewTrigger(), SyncChanges: NewTrigger(), - paused: new(toggle), } } @@ -99,6 +100,8 @@ const ( retryFailIntv = 15 * time.Second ) +var errPaused = errors.New("paused") + // Run is the long running method to perform state synchronization // between local and remote servers. func (s *StateSyncer) Run() { @@ -114,13 +117,13 @@ func (s *StateSyncer) Run() { FullSync: for { // attempt a full sync - if err := s.State.SyncFull(); err != nil { - s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + err := s.ifNotPausedRun(s.State.SyncFull) + if err != nil { + if err != errPaused { + s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + } - // retry full sync after some time or when a consul - // server was added. select { - // trigger a full sync immediately. // this is usually called when a consul server was added to the cluster. // stagger the delay to avoid a thundering herd. 
@@ -162,10 +165,8 @@ FullSync: // do partial syncs on demand case <-s.SyncChanges.Notif(): - if s.Paused() { - continue - } - if err := s.State.SyncChanges(); err != nil { + err := s.ifNotPausedRun(s.State.SyncChanges) + if err != nil && err != errPaused { s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) } @@ -176,40 +177,39 @@ FullSync: } } +func (s *StateSyncer) ifNotPausedRun(f func() error) error { + s.pauseLock.Lock() + defer s.pauseLock.Unlock() + if s.paused { + return errPaused + } + return f() +} + // Pause temporarily disables sync runs. func (s *StateSyncer) Pause() { - s.paused.On() + s.pauseLock.Lock() + if s.paused { + panic("pause while paused") + } + s.paused = true + s.pauseLock.Unlock() } // Paused returns whether sync runs are temporarily disabled. func (s *StateSyncer) Paused() bool { - return s.paused.IsOn() + s.pauseLock.Lock() + defer s.pauseLock.Unlock() + return s.paused } // Resume re-enables sync runs. func (s *StateSyncer) Resume() { - s.paused.Off() + s.pauseLock.Lock() + if !s.paused { + panic("resume while not paused") + } + s.paused = false + s.pauseLock.Unlock() s.SyncChanges.Trigger() } - -// toggle implements an on/off switch using methods from the atomic -// package. Since fields in structs that are accessed via -// atomic.Load/Add methods need to be aligned properly on some platforms -// we move that code into a separate struct. -// -// See https://golang.org/pkg/sync/atomic/#pkg-note-BUG for details -type toggle int32 - -func (p *toggle) On() { - atomic.AddInt32((*int32)(p), 1) -} - -func (p *toggle) Off() { - if atomic.AddInt32((*int32)(p), -1) < 0 { - panic("toggle not on") - } -} - -func (p *toggle) IsOn() bool { - return atomic.LoadInt32((*int32)(p)) > 0 -} From e2452efed832c82d86f38b2aa96d10efba097911 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 11:11:57 +0200 Subject: [PATCH 22/36] ae: restore previous pause/resume behavior --- agent/ae/ae.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 4ec72ed469..e20d918391 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -77,7 +77,7 @@ type StateSyncer struct { // paused stores whether sync runs are temporarily disabled. pauseLock sync.Mutex - paused bool + paused int } func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { @@ -180,7 +180,7 @@ FullSync: func (s *StateSyncer) ifNotPausedRun(f func() error) error { s.pauseLock.Lock() defer s.pauseLock.Unlock() - if s.paused { + if s.paused != 0 { return errPaused } return f() @@ -189,10 +189,7 @@ func (s *StateSyncer) ifNotPausedRun(f func() error) error { // Pause temporarily disables sync runs. func (s *StateSyncer) Pause() { s.pauseLock.Lock() - if s.paused { - panic("pause while paused") - } - s.paused = true + s.paused++ s.pauseLock.Unlock() } @@ -200,16 +197,18 @@ func (s *StateSyncer) Pause() { func (s *StateSyncer) Paused() bool { s.pauseLock.Lock() defer s.pauseLock.Unlock() - return s.paused + return s.paused != 0 } // Resume re-enables sync runs. 
func (s *StateSyncer) Resume() { s.pauseLock.Lock() - if !s.paused { - panic("resume while not paused") + s.paused-- + if s.paused < 0 { + panic("unbalanced pause/resume") + } + if s.paused == 0 { + s.SyncChanges.Trigger() } - s.paused = false s.pauseLock.Unlock() - s.SyncChanges.Trigger() } From 11e172d1e9b79884e47a40bf26c97955f79c0b0d Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 11:12:56 +0200 Subject: [PATCH 23/36] ae: make stagger function pluggable for testing --- agent/ae/ae.go | 52 +++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index e20d918391..36950df6b0 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -53,7 +53,7 @@ type StateSyncer struct { // State contains the data that needs to be synchronized. State State - // Interval is the time between two regular sync runs. + // Interval is the time between two full sync runs. Interval time.Duration // ShutdownCh is closed when the application is shutting down. @@ -78,17 +78,16 @@ type StateSyncer struct { // paused stores whether sync runs are temporarily disabled. pauseLock sync.Mutex paused int -} -func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { - return &StateSyncer{ - State: state, - Interval: intv, - ShutdownCh: shutdownCh, - Logger: logger, - SyncFull: NewTrigger(), - SyncChanges: NewTrigger(), - } + // stagger randomly picks a duration between 0s and the given duration. + stagger func(time.Duration) time.Duration + + // serverUpInterval is the max time after which a full sync is + // performed when a server has been added to the cluster. + serverUpInterval time.Duration + + // retryFailInterval is the time after which a failed full sync is retried. + retryFailInterval time.Duration } const ( @@ -100,6 +99,24 @@ const ( retryFailIntv = 15 * time.Second ) +func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { + s := &StateSyncer{ + State: state, + Interval: intv, + ShutdownCh: shutdownCh, + Logger: logger, + SyncFull: NewTrigger(), + SyncChanges: NewTrigger(), + serverUpInterval: serverUpIntv, + retryFailInterval: retryFailIntv, + } + s.stagger = func(d time.Duration) time.Duration { + f := scaleFactor(s.ClusterSize()) + return lib.RandomStagger(time.Duration(f) * d) + } + return s +} + var errPaused = errors.New("paused") // Run is the long running method to perform state synchronization @@ -109,11 +126,6 @@ func (s *StateSyncer) Run() { panic("ClusterSize not set") } - stagger := func(d time.Duration) time.Duration { - f := scaleFactor(s.ClusterSize()) - return lib.RandomStagger(time.Duration(f) * d) - } - FullSync: for { // attempt a full sync @@ -129,14 +141,14 @@ FullSync: // stagger the delay to avoid a thundering herd. case <-s.SyncFull.Notif(): select { - case <-time.After(stagger(serverUpIntv)): + case <-time.After(s.stagger(s.serverUpInterval)): case <-s.ShutdownCh: return } // retry full sync after some time // todo(fs): why don't we use s.Interval here? - case <-time.After(retryFailIntv + stagger(retryFailIntv)): + case <-time.After(s.retryFailInterval + s.stagger(s.retryFailInterval)): case <-s.ShutdownCh: return @@ -153,14 +165,14 @@ FullSync: // stagger the delay to avoid a thundering herd. 
case <-s.SyncFull.Notif(): select { - case <-time.After(stagger(serverUpIntv)): + case <-time.After(s.stagger(s.serverUpInterval)): continue FullSync case <-s.ShutdownCh: return } // time for a full sync again - case <-time.After(s.Interval + stagger(s.Interval)): + case <-time.After(s.Interval + s.stagger(s.Interval)): continue FullSync // do partial syncs on demand From 92088d21e847ec5beca561ff8fb1c4ec7fe49287 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 11:12:29 +0200 Subject: [PATCH 24/36] ae: add test for ifNotPausedRun --- agent/ae/ae_test.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index 3d3449d8b1..5004353590 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -1,7 +1,9 @@ package ae import ( + "errors" "fmt" + "reflect" "testing" ) @@ -56,3 +58,22 @@ func TestAE_nestedPauseResume(t *testing.T) { }() l.Resume() } + +func TestAE_ifNotPausedRun(t *testing.T) { + l := NewStateSyner(nil, 0, nil, nil) + + errCalled := errors.New("f called") + f := func() error { return errCalled } + + l.Pause() + err := l.ifNotPausedRun(f) + if got, want := err, errPaused; !reflect.DeepEqual(got, want) { + t.Fatalf("got error %q want %q", got, want) + } + l.Resume() + + err = l.ifNotPausedRun(f) + if got, want := err, errCalled; got != want { + t.Fatalf("got error %q want %q", got, want) + } +} From 51daa96dfe77136762230dbfa2855236ec2addc9 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 11:17:24 +0200 Subject: [PATCH 25/36] ae: add test for resume triggering SyncChanges --- agent/ae/ae_test.go | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index 5004353590..2b61c9a33a 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -27,7 +27,7 @@ func TestAE_scaleFactor(t *testing.T) { } } -func TestAE_nestedPauseResume(t *testing.T) { +func TestAE_Pause_nestedPauseResume(t *testing.T) { t.Parallel() l := NewStateSyner(nil, 0, nil, nil) if l.Paused() != false { @@ -59,7 +59,21 @@ func TestAE_nestedPauseResume(t *testing.T) { l.Resume() } -func TestAE_ifNotPausedRun(t *testing.T) { +func TestAE_Pause_ResumeTriggersSyncChanges(t *testing.T) { + l := NewStateSyner(nil, 0, nil, nil) + l.Pause() + l.Resume() + select { + case <-l.SyncChanges.Notif(): + // expected + case <-l.SyncFull.Notif(): + t.Fatal("resume triggered SyncFull instead of SyncChanges") + default: + t.Fatal("resume did not trigger SyncFull") + } +} + +func TestAE_Pause_ifNotPausedRun(t *testing.T) { l := NewStateSyner(nil, 0, nil, nil) errCalled := errors.New("f called") From 3231385089a66528a89ba6de896cb424a49c1b4c Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 11:20:24 +0200 Subject: [PATCH 26/36] ae: fix typo in constructor name --- agent/ae/ae.go | 2 +- agent/ae/ae_test.go | 6 +++--- agent/agent.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 36950df6b0..6d6735b413 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -99,7 +99,7 @@ const ( retryFailIntv = 15 * time.Second ) -func NewStateSyner(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { +func NewStateSyncer(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { s := &StateSyncer{ State: state, Interval: intv, diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index 2b61c9a33a..d544c135cf 100644 --- 
a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -29,7 +29,7 @@ func TestAE_scaleFactor(t *testing.T) { func TestAE_Pause_nestedPauseResume(t *testing.T) { t.Parallel() - l := NewStateSyner(nil, 0, nil, nil) + l := NewStateSyncer(nil, 0, nil, nil) if l.Paused() != false { t.Fatal("syncer should be unPaused after init") } @@ -60,7 +60,7 @@ func TestAE_Pause_nestedPauseResume(t *testing.T) { } func TestAE_Pause_ResumeTriggersSyncChanges(t *testing.T) { - l := NewStateSyner(nil, 0, nil, nil) + l := NewStateSyncer(nil, 0, nil, nil) l.Pause() l.Resume() select { @@ -74,7 +74,7 @@ func TestAE_Pause_ResumeTriggersSyncChanges(t *testing.T) { } func TestAE_Pause_ifNotPausedRun(t *testing.T) { - l := NewStateSyner(nil, 0, nil, nil) + l := NewStateSyncer(nil, 0, nil, nil) errCalled := errors.New("f called") f := func() error { return errCalled } diff --git a/agent/agent.go b/agent/agent.go index d08307609d..f40a027448 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -268,7 +268,7 @@ func (a *Agent) Start() error { // create the state synchronization manager which performs // regular and on-demand state synchronizations (anti-entropy). - a.sync = ae.NewStateSyner(a.State, c.AEInterval, a.shutdownCh, a.logger) + a.sync = ae.NewStateSyncer(a.State, c.AEInterval, a.shutdownCh, a.logger) // create the config for the rpc server/client consulCfg, err := a.consulConfig() From 4431e222fe7fab73ea403734f6e1cea674b349d5 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 13:21:55 +0200 Subject: [PATCH 27/36] ae: make control flow more explicit --- agent/ae/ae.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 6d6735b413..83895f4d2b 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -142,6 +142,7 @@ FullSync: case <-s.SyncFull.Notif(): select { case <-time.After(s.stagger(s.serverUpInterval)): + continue FullSync case <-s.ShutdownCh: return } @@ -149,12 +150,11 @@ FullSync: // retry full sync after some time // todo(fs): why don't we use s.Interval here? 
case <-time.After(s.retryFailInterval + s.stagger(s.retryFailInterval)): + continue FullSync case <-s.ShutdownCh: return } - - continue } // do partial syncs until it is time for a full sync again From 1212598ae2beb039754f4a0d929e7df19fed5618 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 11:57:08 +0200 Subject: [PATCH 28/36] ae: add test that we run a full before a partial sync --- agent/ae/ae_test.go | 59 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index d544c135cf..8d0afb5cb1 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -3,8 +3,12 @@ package ae import ( "errors" "fmt" + "log" + "os" "reflect" + "sync" "testing" + "time" ) func TestAE_scaleFactor(t *testing.T) { @@ -91,3 +95,58 @@ func TestAE_Pause_ifNotPausedRun(t *testing.T) { t.Fatalf("got error %q want %q", got, want) } } + +func TestAE_Run_SyncFullBeforeChanges(t *testing.T) { + shutdownCh := make(chan struct{}) + state := &mock{ + syncChanges: func() error { + close(shutdownCh) + return nil + }, + } + + // indicate that we have partial changes before starting Run + l := testSyncer(state, shutdownCh) + l.SyncChanges.Trigger() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + l.Run() + }() + wg.Wait() + + if got, want := state.seq, []string{"full", "changes"}; !reflect.DeepEqual(got, want) { + t.Fatalf("got call sequence %v want %v", got, want) + } +} + +type mock struct { + seq []string + syncFull, syncChanges func() error +} + +func (m *mock) SyncFull() error { + m.seq = append(m.seq, "full") + if m.syncFull != nil { + return m.syncFull() + } + return nil +} + +func (m *mock) SyncChanges() error { + m.seq = append(m.seq, "changes") + if m.syncChanges != nil { + return m.syncChanges() + } + return nil +} + +func testSyncer(state State, shutdownCh chan struct{}) *StateSyncer { + logger := log.New(os.Stderr, "", 0) + l := NewStateSyncer(state, 0, shutdownCh, logger) + l.stagger = func(d time.Duration) time.Duration { return d } + l.ClusterSize = func() int { return 1 } + return l +} From 622ace28292d0346cebd8d771660f630ddd06aad Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 16:39:16 +0200 Subject: [PATCH 29/36] ae: refactor StateSyncer to state machine for better testing --- agent/ae/ae.go | 235 ++++++++++++++++++++++++++++++-------------- agent/ae/ae_test.go | 200 ++++++++++++++++++++++++++++++++----- 2 files changed, 336 insertions(+), 99 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 83895f4d2b..a6851e0198 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -2,7 +2,7 @@ package ae import ( - "errors" + "fmt" "log" "math" "sync" @@ -37,7 +37,7 @@ func scaleFactor(nodes int) int { return int(math.Ceil(math.Log2(float64(nodes))-math.Log2(float64(scaleThreshold))) + 1.0) } -type State interface { +type SyncState interface { SyncChanges() error SyncFull() error } @@ -51,7 +51,7 @@ type State interface { // for the cluster which is also called anti-entropy. type StateSyncer struct { // State contains the data that needs to be synchronized. - State State + State SyncState // Interval is the time between two full sync runs. Interval time.Duration @@ -79,15 +79,23 @@ type StateSyncer struct { pauseLock sync.Mutex paused int - // stagger randomly picks a duration between 0s and the given duration. 
- stagger func(time.Duration) time.Duration - // serverUpInterval is the max time after which a full sync is // performed when a server has been added to the cluster. serverUpInterval time.Duration // retryFailInterval is the time after which a failed full sync is retried. retryFailInterval time.Duration + + // stagger randomly picks a duration between 0s and the given duration. + stagger func(time.Duration) time.Duration + + // retrySyncFullEvent generates an event based on multiple conditions + // when the state machine is trying to retry a full state sync. + retrySyncFullEvent func() event + + // syncChangesEvent generates an event based on multiple conditions + // when the state machine is performing partial state syncs. + syncChangesEvent func() event } const ( @@ -99,7 +107,7 @@ const ( retryFailIntv = 15 * time.Second ) -func NewStateSyncer(state State, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { +func NewStateSyncer(state SyncState, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer { s := &StateSyncer{ State: state, Interval: intv, @@ -110,14 +118,25 @@ func NewStateSyncer(state State, intv time.Duration, shutdownCh chan struct{}, l serverUpInterval: serverUpIntv, retryFailInterval: retryFailIntv, } - s.stagger = func(d time.Duration) time.Duration { - f := scaleFactor(s.ClusterSize()) - return lib.RandomStagger(time.Duration(f) * d) - } + + // retain these methods as member variables so that + // we can mock them for testing. + s.retrySyncFullEvent = s.retrySyncFullEventFn + s.syncChangesEvent = s.syncChangesEventFn + s.stagger = s.staggerFn + return s } -var errPaused = errors.New("paused") +// fsmState defines states for the state machine. +type fsmState string + +const ( + doneState fsmState = "done" + fullSyncState fsmState = "fullSync" + partialSyncState fsmState = "partialSync" + retryFullSyncState fsmState = "retryFullSync" +) // Run is the long running method to perform state synchronization // between local and remote servers. @@ -125,77 +144,141 @@ func (s *StateSyncer) Run() { if s.ClusterSize == nil { panic("ClusterSize not set") } + s.runFSM(fullSyncState, s.nextFSMState) +} -FullSync: +// runFSM runs the state machine. +func (s *StateSyncer) runFSM(fs fsmState, next func(fsmState) fsmState) { for { - // attempt a full sync - err := s.ifNotPausedRun(s.State.SyncFull) - if err != nil { - if err != errPaused { - s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) - } - - select { - // trigger a full sync immediately. - // this is usually called when a consul server was added to the cluster. - // stagger the delay to avoid a thundering herd. - case <-s.SyncFull.Notif(): - select { - case <-time.After(s.stagger(s.serverUpInterval)): - continue FullSync - case <-s.ShutdownCh: - return - } - - // retry full sync after some time - // todo(fs): why don't we use s.Interval here? - case <-time.After(s.retryFailInterval + s.stagger(s.retryFailInterval)): - continue FullSync - - case <-s.ShutdownCh: - return - } - } - - // do partial syncs until it is time for a full sync again - for { - select { - // trigger a full sync immediately - // this is usually called when a consul server was added to the cluster. - // stagger the delay to avoid a thundering herd. 
- case <-s.SyncFull.Notif(): - select { - case <-time.After(s.stagger(s.serverUpInterval)): - continue FullSync - case <-s.ShutdownCh: - return - } - - // time for a full sync again - case <-time.After(s.Interval + s.stagger(s.Interval)): - continue FullSync - - // do partial syncs on demand - case <-s.SyncChanges.Notif(): - err := s.ifNotPausedRun(s.State.SyncChanges) - if err != nil && err != errPaused { - s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) - } - - case <-s.ShutdownCh: - return - } + if fs = next(fs); fs == doneState { + return } } } -func (s *StateSyncer) ifNotPausedRun(f func() error) error { - s.pauseLock.Lock() - defer s.pauseLock.Unlock() - if s.paused != 0 { - return errPaused +// nextFSMState determines the next state based on the current state. +func (s *StateSyncer) nextFSMState(fs fsmState) fsmState { + switch fs { + case fullSyncState: + if s.Paused() { + return retryFullSyncState + } + + err := s.State.SyncFull() + if err != nil { + s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err) + return retryFullSyncState + } + + return partialSyncState + + case retryFullSyncState: + e := s.retrySyncFullEvent() + switch e { + case syncFullNotifEvent, syncFullTimerEvent: + return fullSyncState + case shutdownEvent: + return doneState + default: + panic(fmt.Sprintf("invalid event: %s", e)) + } + + case partialSyncState: + e := s.syncChangesEvent() + switch e { + case syncFullNotifEvent, syncFullTimerEvent: + return fullSyncState + + case syncChangesNotifEvent: + if s.Paused() { + return partialSyncState + } + + err := s.State.SyncChanges() + if err != nil { + s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err) + } + return partialSyncState + + case shutdownEvent: + return doneState + + default: + panic(fmt.Sprintf("invalid event: %s", e)) + } + + default: + panic(fmt.Sprintf("invalid state: %s", fs)) } - return f() +} + +// event defines a timing or notification event from a multiple +// timers and channels. +type event string + +const ( + shutdownEvent event = "shutdown" + syncFullNotifEvent event = "syncFullNotif" + syncFullTimerEvent event = "syncFullTimer" + syncChangesNotifEvent event = "syncChangesNotif" +) + +// retrySyncFullEventFn waits for an event which triggers a retry +// of a full sync or a termination signal. +func (s *StateSyncer) retrySyncFullEventFn() event { + select { + // trigger a full sync immediately. + // this is usually called when a consul server was added to the cluster. + // stagger the delay to avoid a thundering herd. + case <-s.SyncFull.Notif(): + select { + case <-time.After(s.stagger(s.serverUpInterval)): + return syncFullNotifEvent + case <-s.ShutdownCh: + return shutdownEvent + } + + // retry full sync after some time + // todo(fs): why don't we use s.Interval here? + case <-time.After(s.retryFailInterval + s.stagger(s.retryFailInterval)): + return syncFullTimerEvent + + case <-s.ShutdownCh: + return shutdownEvent + } +} + +// syncChangesEventFn waits for a event which either triggers a full +// or a partial sync or a termination signal. +func (s *StateSyncer) syncChangesEventFn() event { + select { + // trigger a full sync immediately + // this is usually called when a consul server was added to the cluster. + // stagger the delay to avoid a thundering herd. 
+ case <-s.SyncFull.Notif(): + select { + case <-time.After(s.stagger(s.serverUpInterval)): + return syncFullNotifEvent + case <-s.ShutdownCh: + return shutdownEvent + } + + // time for a full sync again + case <-time.After(s.Interval + s.stagger(s.Interval)): + return syncFullTimerEvent + + // do partial syncs on demand + case <-s.SyncChanges.Notif(): + return syncChangesNotifEvent + + case <-s.ShutdownCh: + return shutdownEvent + } +} + +func (s *StateSyncer) staggerFn(d time.Duration) time.Duration { + f := scaleFactor(s.ClusterSize()) + return lib.RandomStagger(time.Duration(f) * d) } // Pause temporarily disables sync runs. diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index 8d0afb5cb1..eb33bcb0b7 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -57,7 +57,7 @@ func TestAE_Pause_nestedPauseResume(t *testing.T) { defer func() { err := recover() if err == nil { - t.Fatal("unbalanced Resume() should cause a panic()") + t.Fatal("unbalanced Resume() should panic") } }() l.Resume() @@ -77,25 +77,6 @@ func TestAE_Pause_ResumeTriggersSyncChanges(t *testing.T) { } } -func TestAE_Pause_ifNotPausedRun(t *testing.T) { - l := NewStateSyncer(nil, 0, nil, nil) - - errCalled := errors.New("f called") - f := func() error { return errCalled } - - l.Pause() - err := l.ifNotPausedRun(f) - if got, want := err, errPaused; !reflect.DeepEqual(got, want) { - t.Fatalf("got error %q want %q", got, want) - } - l.Resume() - - err = l.ifNotPausedRun(f) - if got, want := err, errCalled; got != want { - t.Fatalf("got error %q want %q", got, want) - } -} - func TestAE_Run_SyncFullBeforeChanges(t *testing.T) { shutdownCh := make(chan struct{}) state := &mock{ @@ -106,7 +87,9 @@ func TestAE_Run_SyncFullBeforeChanges(t *testing.T) { } // indicate that we have partial changes before starting Run - l := testSyncer(state, shutdownCh) + l := testSyncer() + l.State = state + l.ShutdownCh = shutdownCh l.SyncChanges.Trigger() var wg sync.WaitGroup @@ -122,6 +105,177 @@ func TestAE_Run_SyncFullBeforeChanges(t *testing.T) { } } +func TestAE_Run_Quit(t *testing.T) { + // start timer which explodes if runFSM does not quit + tm := time.AfterFunc(time.Second, func() { panic("timeout") }) + + l := testSyncer() + l.runFSM(fullSyncState, func(fsmState) fsmState { return doneState }) + // should just quit + tm.Stop() +} + +func TestAE_FSM(t *testing.T) { + + t.Run("fullSyncState", func(t *testing.T) { + t.Run("Paused -> retryFullSyncState", func(t *testing.T) { + l := testSyncer() + l.Pause() + fs := l.nextFSMState(fullSyncState) + if got, want := fs, retryFullSyncState; got != want { + t.Fatalf("got state %v want %v", got, want) + } + }) + t.Run("SyncFull() error -> retryFullSyncState", func(t *testing.T) { + l := testSyncer() + l.State = &mock{syncFull: func() error { return errors.New("boom") }} + fs := l.nextFSMState(fullSyncState) + if got, want := fs, retryFullSyncState; got != want { + t.Fatalf("got state %v want %v", got, want) + } + }) + t.Run("SyncFull() OK -> partialSyncState", func(t *testing.T) { + l := testSyncer() + l.State = &mock{} + fs := l.nextFSMState(fullSyncState) + if got, want := fs, partialSyncState; got != want { + t.Fatalf("got state %v want %v", got, want) + } + }) + }) + + t.Run("retryFullSyncState", func(t *testing.T) { + // helper for testing state transitions from retrySyncFullState + test := func(ev event, to fsmState) { + l := testSyncer() + l.retrySyncFullEvent = func() event { return ev } + fs := l.nextFSMState(retryFullSyncState) + if got, want := fs, to; got != want { + 
t.Fatalf("got state %v want %v", got, want) + } + } + t.Run("shutdownEvent -> doneState", func(t *testing.T) { + test(shutdownEvent, doneState) + }) + t.Run("syncFullNotifEvent -> fullSyncState", func(t *testing.T) { + test(syncFullNotifEvent, fullSyncState) + }) + t.Run("syncFullTimerEvent -> fullSyncState", func(t *testing.T) { + test(syncFullTimerEvent, fullSyncState) + }) + t.Run("invalid event -> panic ", func(t *testing.T) { + defer func() { + err := recover() + if err == nil { + t.Fatal("invalid event should panic") + } + }() + test(event("invalid"), fsmState("")) + }) + }) + + t.Run("partialSyncState", func(t *testing.T) { + // helper for testing state transitions from partialSyncState + test := func(ev event, to fsmState) { + l := testSyncer() + l.syncChangesEvent = func() event { return ev } + fs := l.nextFSMState(partialSyncState) + if got, want := fs, to; got != want { + t.Fatalf("got state %v want %v", got, want) + } + } + t.Run("shutdownEvent -> doneState", func(t *testing.T) { + test(shutdownEvent, doneState) + }) + t.Run("syncFullNotifEvent -> fullSyncState", func(t *testing.T) { + test(syncFullNotifEvent, fullSyncState) + }) + t.Run("syncFullTimerEvent -> fullSyncState", func(t *testing.T) { + test(syncFullTimerEvent, fullSyncState) + }) + t.Run("syncChangesEvent+Paused -> partialSyncState", func(t *testing.T) { + l := testSyncer() + l.Pause() + l.syncChangesEvent = func() event { return syncChangesNotifEvent } + fs := l.nextFSMState(partialSyncState) + if got, want := fs, partialSyncState; got != want { + t.Fatalf("got state %v want %v", got, want) + } + }) + t.Run("syncChangesEvent+SyncChanges() error -> partialSyncState", func(t *testing.T) { + l := testSyncer() + l.State = &mock{syncChanges: func() error { return errors.New("boom") }} + l.syncChangesEvent = func() event { return syncChangesNotifEvent } + fs := l.nextFSMState(partialSyncState) + if got, want := fs, partialSyncState; got != want { + t.Fatalf("got state %v want %v", got, want) + } + }) + t.Run("syncChangesEvent+SyncChanges() OK -> partialSyncState", func(t *testing.T) { + l := testSyncer() + l.State = &mock{} + l.syncChangesEvent = func() event { return syncChangesNotifEvent } + fs := l.nextFSMState(partialSyncState) + if got, want := fs, partialSyncState; got != want { + t.Fatalf("got state %v want %v", got, want) + } + }) + }) +} + +func TestAE_SyncChangesEvent(t *testing.T) { + t.Run("trigger shutdownEvent", func(t *testing.T) { + l := testSyncer() + l.ShutdownCh = make(chan struct{}) + evch := make(chan event) + go func() { evch <- l.syncChangesEvent() }() + close(l.ShutdownCh) + if got, want := <-evch, shutdownEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger shutdownEvent during FullNotif", func(t *testing.T) { + l := testSyncer() + l.ShutdownCh = make(chan struct{}) + evch := make(chan event) + go func() { evch <- l.syncChangesEvent() }() + l.SyncFull.Trigger() + time.Sleep(100 * time.Millisecond) + close(l.ShutdownCh) + if got, want := <-evch, shutdownEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger syncFullNotifEvent", func(t *testing.T) { + l := testSyncer() + l.serverUpInterval = 10 * time.Millisecond + evch := make(chan event) + go func() { evch <- l.syncChangesEvent() }() + l.SyncFull.Trigger() + if got, want := <-evch, syncFullNotifEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger syncFullTimerEvent", func(t *testing.T) { + l := testSyncer() + l.Interval = 10 * 
time.Millisecond + evch := make(chan event) + go func() { evch <- l.syncChangesEvent() }() + if got, want := <-evch, syncFullTimerEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger syncChangesNotifEvent", func(t *testing.T) { + l := testSyncer() + evch := make(chan event) + go func() { evch <- l.syncChangesEvent() }() + l.SyncChanges.Trigger() + if got, want := <-evch, syncChangesNotifEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) +} + type mock struct { seq []string syncFull, syncChanges func() error @@ -143,9 +297,9 @@ func (m *mock) SyncChanges() error { return nil } -func testSyncer(state State, shutdownCh chan struct{}) *StateSyncer { +func testSyncer() *StateSyncer { logger := log.New(os.Stderr, "", 0) - l := NewStateSyncer(state, 0, shutdownCh, logger) + l := NewStateSyncer(nil, time.Second, nil, logger) l.stagger = func(d time.Duration) time.Duration { return d } l.ClusterSize = func() int { return 1 } return l From 92f136de12577c78d92f6c4dcc1d7f8034aa5ca5 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Thu, 19 Oct 2017 20:22:55 +0200 Subject: [PATCH 30/36] ae: add remaining test cases --- agent/ae/ae.go | 21 ++++++--- agent/ae/ae_test.go | 105 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 114 insertions(+), 12 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index a6851e0198..3d5568d950 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -212,8 +212,8 @@ func (s *StateSyncer) nextFSMState(fs fsmState) fsmState { } } -// event defines a timing or notification event from a multiple -// timers and channels. +// event defines a timing or notification event from multiple timers and +// channels. type event string const ( @@ -224,7 +224,9 @@ const ( ) // retrySyncFullEventFn waits for an event which triggers a retry -// of a full sync or a termination signal. +// of a full sync or a termination signal. This function should not be +// called directly but through s.retryFullSyncState to allow mocking for +// testing. func (s *StateSyncer) retrySyncFullEventFn() event { select { // trigger a full sync immediately. @@ -249,7 +251,9 @@ func (s *StateSyncer) retrySyncFullEventFn() event { } // syncChangesEventFn waits for a event which either triggers a full -// or a partial sync or a termination signal. +// or a partial sync or a termination signal. This function should not +// be called directly but through s.syncChangesEvent to allow mocking +// for testing. func (s *StateSyncer) syncChangesEventFn() event { select { // trigger a full sync immediately @@ -276,9 +280,16 @@ func (s *StateSyncer) syncChangesEventFn() event { } } +// stubbed out for testing +var libRandomStagger = lib.RandomStagger + +// staggerFn returns a random duration which depends on the cluster size +// and a random factor which should provide some timely distribution of +// cluster wide events. This function should not be called directly +// but through s.stagger to allow mocking for testing. func (s *StateSyncer) staggerFn(d time.Duration) time.Duration { f := scaleFactor(s.ClusterSize()) - return lib.RandomStagger(time.Duration(f) * d) + return libRandomStagger(time.Duration(f) * d) } // Pause temporarily disables sync runs. 
diff --git a/agent/ae/ae_test.go b/agent/ae/ae_test.go index eb33bcb0b7..a80532d80e 100644 --- a/agent/ae/ae_test.go +++ b/agent/ae/ae_test.go @@ -9,6 +9,8 @@ import ( "sync" "testing" "time" + + "github.com/hashicorp/consul/lib" ) func TestAE_scaleFactor(t *testing.T) { @@ -77,6 +79,20 @@ func TestAE_Pause_ResumeTriggersSyncChanges(t *testing.T) { } } +func TestAE_staggerDependsOnClusterSize(t *testing.T) { + libRandomStagger = func(d time.Duration) time.Duration { return d } + defer func() { libRandomStagger = lib.RandomStagger }() + + l := testSyncer() + if got, want := l.staggerFn(10*time.Millisecond), 10*time.Millisecond; got != want { + t.Fatalf("got %v want %v", got, want) + } + l.ClusterSize = func() int { return 256 } + if got, want := l.staggerFn(10*time.Millisecond), 20*time.Millisecond; got != want { + t.Fatalf("got %v want %v", got, want) + } +} + func TestAE_Run_SyncFullBeforeChanges(t *testing.T) { shutdownCh := make(chan struct{}) state := &mock{ @@ -106,17 +122,29 @@ func TestAE_Run_SyncFullBeforeChanges(t *testing.T) { } func TestAE_Run_Quit(t *testing.T) { - // start timer which explodes if runFSM does not quit - tm := time.AfterFunc(time.Second, func() { panic("timeout") }) + t.Run("Run panics without ClusterSize", func(t *testing.T) { + defer func() { + err := recover() + if err == nil { + t.Fatal("Run should panic") + } + }() + l := testSyncer() + l.ClusterSize = nil + l.Run() + }) + t.Run("runFSM quits", func(t *testing.T) { + // start timer which explodes if runFSM does not quit + tm := time.AfterFunc(time.Second, func() { panic("timeout") }) - l := testSyncer() - l.runFSM(fullSyncState, func(fsmState) fsmState { return doneState }) - // should just quit - tm.Stop() + l := testSyncer() + l.runFSM(fullSyncState, func(fsmState) fsmState { return doneState }) + // should just quit + tm.Stop() + }) } func TestAE_FSM(t *testing.T) { - t.Run("fullSyncState", func(t *testing.T) { t.Run("Paused -> retryFullSyncState", func(t *testing.T) { l := testSyncer() @@ -220,6 +248,69 @@ func TestAE_FSM(t *testing.T) { t.Fatalf("got state %v want %v", got, want) } }) + t.Run("invalid event -> panic ", func(t *testing.T) { + defer func() { + err := recover() + if err == nil { + t.Fatal("invalid event should panic") + } + }() + test(event("invalid"), fsmState("")) + }) + }) + t.Run("invalid state -> panic ", func(t *testing.T) { + defer func() { + err := recover() + if err == nil { + t.Fatal("invalid state should panic") + } + }() + l := testSyncer() + l.nextFSMState(fsmState("invalid")) + }) +} + +func TestAE_RetrySyncFullEvent(t *testing.T) { + t.Run("trigger shutdownEvent", func(t *testing.T) { + l := testSyncer() + l.ShutdownCh = make(chan struct{}) + evch := make(chan event) + go func() { evch <- l.retrySyncFullEvent() }() + close(l.ShutdownCh) + if got, want := <-evch, shutdownEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger shutdownEvent during FullNotif", func(t *testing.T) { + l := testSyncer() + l.ShutdownCh = make(chan struct{}) + evch := make(chan event) + go func() { evch <- l.retrySyncFullEvent() }() + l.SyncFull.Trigger() + time.Sleep(100 * time.Millisecond) + close(l.ShutdownCh) + if got, want := <-evch, shutdownEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger syncFullNotifEvent", func(t *testing.T) { + l := testSyncer() + l.serverUpInterval = 10 * time.Millisecond + evch := make(chan event) + go func() { evch <- l.retrySyncFullEvent() }() + l.SyncFull.Trigger() + if got, want := 
<-evch, syncFullNotifEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } + }) + t.Run("trigger syncFullTimerEvent", func(t *testing.T) { + l := testSyncer() + l.retryFailInterval = 10 * time.Millisecond + evch := make(chan event) + go func() { evch <- l.retrySyncFullEvent() }() + if got, want := <-evch, syncFullTimerEvent; got != want { + t.Fatalf("got event %q want %q", got, want) + } }) } From cc0499da3de7e751aae7e0720df72f42e8dbd9f2 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Fri, 20 Oct 2017 06:00:31 +0200 Subject: [PATCH 31/36] ae: do not trigger on Resume while holding the lock --- agent/ae/ae.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agent/ae/ae.go b/agent/ae/ae.go index 3d5568d950..d177ee7763 100644 --- a/agent/ae/ae.go +++ b/agent/ae/ae.go @@ -313,8 +313,9 @@ func (s *StateSyncer) Resume() { if s.paused < 0 { panic("unbalanced pause/resume") } - if s.paused == 0 { + trigger := s.paused == 0 + s.pauseLock.Unlock() + if trigger { s.SyncChanges.Trigger() } - s.pauseLock.Unlock() } From 6bc9d66192460edbf1c271e6ef7c267617c593b8 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Fri, 20 Oct 2017 06:24:32 +0200 Subject: [PATCH 32/36] local state: use synchronized access to internal maps --- agent/local/state.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/local/state.go b/agent/local/state.go index 69e136816a..8007505167 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -341,7 +341,7 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { // if there is a serviceID associated with the check, make sure it exists before adding it // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor - if check.ServiceID != "" && l.services[check.ServiceID] == nil { + if check.ServiceID != "" && l.Service(check.ServiceID) == nil { return fmt.Errorf("Check %q refers to non-existent service %q", check.CheckID, check.ServiceID) } From 52e73301f6c29c1bb350419469bd41ff1eadfedb Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Fri, 20 Oct 2017 06:41:12 +0200 Subject: [PATCH 33/36] local state: clone check to avoid side effect --- agent/local/state.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agent/local/state.go b/agent/local/state.go index 8007505167..b0adcd53d7 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -335,6 +335,9 @@ func (l *State) AddCheck(check *structs.HealthCheck, token string) error { return fmt.Errorf("no check") } + // clone the check since we will be modifying it. 
+ check = check.Clone() + if l.discardCheckOutput.Load().(bool) { check.Output = "" } From 3d547e30c71444e9495e4a386dd7fc66a2bb95b4 Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Fri, 20 Oct 2017 06:41:49 +0200 Subject: [PATCH 34/36] local state: make test more robust --- agent/local/state_test.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index d40300a4c2..a697162f7e 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1036,14 +1036,12 @@ func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { if err := a.State.AddCheck(check, ""); err != nil { t.Fatalf("bad: %s", err) } - - // wait until the check is in sync - retry.Run(t, func(r *retry.R) { - if inSync("web") { - return - } - r.FailNow() - }) + if err := a.State.SyncFull(); err != nil { + t.Fatal("bad: %s", err) + } + if !inSync("web") { + t.Fatal("check should be in sync") + } // update the check with the same status but different output // and the check should still be in sync. From 7335c34c3270bd3e3bd4dc30c165aafe02035e3e Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Fri, 20 Oct 2017 06:45:00 +0200 Subject: [PATCH 35/36] local state: remove stale comment --- agent/local/state_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index a697162f7e..cd6cb275ac 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1272,8 +1272,6 @@ func TestAgentAntiEntropy_NodeInfo(t *testing.T) { t.Fatalf("err: %v", err) } - // Make sure we synced our node info - this should have ridden on the - // "consul" service sync id := services.NodeServices.Node.ID addrs := services.NodeServices.Node.TaggedAddresses meta := services.NodeServices.Node.Meta From 1cb8b0ffe31a76bcd4b9f271d7500f7de3bb023f Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Mon, 23 Oct 2017 10:15:41 +0200 Subject: [PATCH 36/36] local state: fix go vet issue --- agent/local/state_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/local/state_test.go b/agent/local/state_test.go index cd6cb275ac..166b8155e9 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1037,7 +1037,7 @@ func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { t.Fatalf("bad: %s", err) } if err := a.State.SyncFull(); err != nil { - t.Fatal("bad: %s", err) + t.Fatalf("bad: %s", err) } if !inSync("web") { t.Fatal("check should be in sync")