consul/command/agent/local.go

597 lines
16 KiB
Go
Raw Normal View History

package agent
import (
"fmt"
2014-01-21 19:52:25 +00:00
"log"
"reflect"
"strings"
"sync"
2014-02-07 19:58:24 +00:00
"sync/atomic"
"time"
"github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/structs"
)
// Retry timing for anti-entropy and the error text used to spot ACL
// rejections.
const (
// syncStaggerIntv is the base interval handed to aeScale/randomStagger to
// delay a retry of the remote-state sync after consulCh signals that a
// consul server is available.
syncStaggerIntv = 3 * time.Second
// syncRetryIntv is the fixed delay before retrying a failed remote-state
// sync; a randomStagger(aeScale(...)) amount is added on top of it.
syncRetryIntv = 15 * time.Second
// permissionDenied is returned when an ACL based rejection happens.
// It is matched as a substring of the RPC error text.
permissionDenied = "Permission denied"
)
// syncStatus is used to represent the difference between
// the local and remote state, and if action needs to be taken.
// syncChanges checks remoteDelete first and inSync second, so the zero
// value (both false) causes the entry to be registered with the servers.
type syncStatus struct {
remoteDelete bool // Should this be deleted from the server
inSync bool // Is this in sync with the server
}
// localState is used to represent the node's services,
// and checks. We used it to perform anti-entropy with the
// catalog representation
type localState struct {
2014-02-07 19:58:24 +00:00
// paused is used to check if we are paused. Must be the first
// element due to a go bug.
paused int32
sync.RWMutex
2014-01-21 19:52:25 +00:00
logger *log.Logger
2014-01-21 19:52:25 +00:00
// Config is the agent config
config *Config
// iface is the consul interface to use for keeping in sync
iface consul.Interface
// Services tracks the local services
services map[string]*structs.NodeService
serviceStatus map[string]syncStatus
serviceTokens map[string]string
// Checks tracks the local checks
checks map[string]*structs.HealthCheck
checkStatus map[string]syncStatus
checkTokens map[string]string
// Used to track checks that are being deferred
deferCheck map[string]*time.Timer
// consulCh is used to inform of a change to the known
// consul nodes. This may be used to retry a sync run
consulCh chan struct{}
// triggerCh is used to inform of a change to local state
// that requires anti-entropy with the server
triggerCh chan struct{}
}
2014-01-21 19:52:25 +00:00
// Init is used to initialize the local state
func (l *localState) Init(config *Config, logger *log.Logger) {
2014-01-21 19:52:25 +00:00
l.config = config
l.logger = logger
l.services = make(map[string]*structs.NodeService)
l.serviceStatus = make(map[string]syncStatus)
l.serviceTokens = make(map[string]string)
2014-01-21 19:52:25 +00:00
l.checks = make(map[string]*structs.HealthCheck)
l.checkStatus = make(map[string]syncStatus)
l.checkTokens = make(map[string]string)
l.deferCheck = make(map[string]*time.Timer)
l.consulCh = make(chan struct{}, 1)
2014-01-21 19:52:25 +00:00
l.triggerCh = make(chan struct{}, 1)
}
// SetIface is used to set the Consul interface. Must be set prior to
// starting anti-entropy, which issues RPCs and reads LAN members through it.
// The assignment is not protected by the state lock.
func (l *localState) SetIface(iface consul.Interface) {
l.iface = iface
}
// changeMade is used to trigger an anti-entropy run.
// The send on triggerCh never blocks: when the channel cannot accept
// another value the notification is simply dropped.
func (l *localState) changeMade() {
select {
case l.triggerCh <- struct{}{}:
default:
}
}
// ConsulServerUp is used to inform that a new consul server is now
// up. This can be used to speed up the sync process if we are blocking
// waiting to discover a consul server.
// The send on consulCh never blocks: when the channel cannot accept
// another value the notification is simply dropped.
func (l *localState) ConsulServerUp() {
select {
case l.consulCh <- struct{}{}:
default:
}
}
// Pause is used to pause state synchronization, this can be
2014-02-07 19:58:24 +00:00
// used to make batch changes
func (l *localState) Pause() {
make Pause()/Resume()/isPaused() behave more like a semaphore see: https://github.com/hashicorp/consul/issues/1173 #1173 Reasoning: somewhere during consul development Pause()/Resume() and PauseSync()/ResumeSync() were added to protect larger changes to agent's localState. A few of the places that it tries to protect are: - (a *Agent) AddService(...) # part of the method - (c *Command) handleReload(...) # almost the whole method - (l *localState) antiEntropy(...)# isPaused() prevents syncChanges() The main problem is, that in the middle of handleReload(...)'s critical section it indirectly (loadServices()) calls AddService(...). AddService() in turn calls Pause() to protect itself against syncChanges(). At the end of AddService() a defered call to Resume() is made. With the current implementation, this releases isPaused() "lock" in the middle of handleReload() allowing antiEntropy to kick in while configuration reload is still in progress. Specifically almost all services and probably all check are unloaded when syncChanges() is allowed to run. This in turn can causes massive service/check de-/re-registration, and since checks are by default registered in the critical state, a majority of services on a node can be marked as failing. It's made worse with automation, often calling `consul reload` in close proximity on many nodes in the cluster. This change basically turns Pause()/Resume() into P()/V() of a garden-variety semaphore. Allowing Pause() to be called multiple times, and releasing isPaused() only after all matching/defered Resumes() are called as well. TODO/NOTE: as with many semaphore implementations, it might be reasonable to panic() if l.paused ever becomes negative.
2015-09-11 16:28:06 +00:00
atomic.AddInt32(&l.paused, 1)
2014-02-07 19:58:24 +00:00
}
// Resume is used to resume state synchronization
func (l *localState) Resume() {
make Pause()/Resume()/isPaused() behave more like a semaphore see: https://github.com/hashicorp/consul/issues/1173 #1173 Reasoning: somewhere during consul development Pause()/Resume() and PauseSync()/ResumeSync() were added to protect larger changes to agent's localState. A few of the places that it tries to protect are: - (a *Agent) AddService(...) # part of the method - (c *Command) handleReload(...) # almost the whole method - (l *localState) antiEntropy(...)# isPaused() prevents syncChanges() The main problem is, that in the middle of handleReload(...)'s critical section it indirectly (loadServices()) calls AddService(...). AddService() in turn calls Pause() to protect itself against syncChanges(). At the end of AddService() a defered call to Resume() is made. With the current implementation, this releases isPaused() "lock" in the middle of handleReload() allowing antiEntropy to kick in while configuration reload is still in progress. Specifically almost all services and probably all check are unloaded when syncChanges() is allowed to run. This in turn can causes massive service/check de-/re-registration, and since checks are by default registered in the critical state, a majority of services on a node can be marked as failing. It's made worse with automation, often calling `consul reload` in close proximity on many nodes in the cluster. This change basically turns Pause()/Resume() into P()/V() of a garden-variety semaphore. Allowing Pause() to be called multiple times, and releasing isPaused() only after all matching/defered Resumes() are called as well. TODO/NOTE: as with many semaphore implementations, it might be reasonable to panic() if l.paused ever becomes negative.
2015-09-11 16:28:06 +00:00
atomic.AddInt32(&l.paused, -1)
2014-02-07 19:58:24 +00:00
l.changeMade()
}
// isPaused is used to check if we are paused
func (l *localState) isPaused() bool {
make Pause()/Resume()/isPaused() behave more like a semaphore see: https://github.com/hashicorp/consul/issues/1173 #1173 Reasoning: somewhere during consul development Pause()/Resume() and PauseSync()/ResumeSync() were added to protect larger changes to agent's localState. A few of the places that it tries to protect are: - (a *Agent) AddService(...) # part of the method - (c *Command) handleReload(...) # almost the whole method - (l *localState) antiEntropy(...)# isPaused() prevents syncChanges() The main problem is, that in the middle of handleReload(...)'s critical section it indirectly (loadServices()) calls AddService(...). AddService() in turn calls Pause() to protect itself against syncChanges(). At the end of AddService() a defered call to Resume() is made. With the current implementation, this releases isPaused() "lock" in the middle of handleReload() allowing antiEntropy to kick in while configuration reload is still in progress. Specifically almost all services and probably all check are unloaded when syncChanges() is allowed to run. This in turn can causes massive service/check de-/re-registration, and since checks are by default registered in the critical state, a majority of services on a node can be marked as failing. It's made worse with automation, often calling `consul reload` in close proximity on many nodes in the cluster. This change basically turns Pause()/Resume() into P()/V() of a garden-variety semaphore. Allowing Pause() to be called multiple times, and releasing isPaused() only after all matching/defered Resumes() are called as well. TODO/NOTE: as with many semaphore implementations, it might be reasonable to panic() if l.paused ever becomes negative.
2015-09-11 16:28:06 +00:00
return atomic.LoadInt32(&l.paused) > 0
2014-02-07 19:58:24 +00:00
}
// ServiceToken returns the configured ACL token for the given
// service ID. If none is present, the agent's token is returned.
func (l *localState) ServiceToken(id string) string {
2015-04-28 18:53:53 +00:00
l.RLock()
defer l.RUnlock()
return l.serviceToken(id)
}
// serviceToken looks up the ACL token stored for the service ID and falls
// back to the agent's configured ACLToken when no token, or an empty one,
// is stored. It does no locking of its own.
func (l *localState) serviceToken(id string) string {
	if stored := l.serviceTokens[id]; stored != "" {
		return stored
	}
	return l.config.ACLToken
}
// AddService is used to add a service entry to the local state.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
func (l *localState) AddService(service *structs.NodeService, token string) {
2014-01-21 00:22:59 +00:00
// Assign the ID if none given
if service.ID == "" && service.Service != "" {
service.ID = service.Service
}
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
l.services[service.ID] = service
l.serviceStatus[service.ID] = syncStatus{}
l.serviceTokens[service.ID] = token
2014-01-21 19:52:25 +00:00
l.changeMade()
}
// RemoveService is used to remove a service entry from the local state.
// The agent will make a best effort to ensure it is deregistered
2014-01-21 19:52:25 +00:00
func (l *localState) RemoveService(serviceID string) {
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
delete(l.services, serviceID)
delete(l.serviceTokens, serviceID)
2014-01-21 19:52:25 +00:00
l.serviceStatus[serviceID] = syncStatus{remoteDelete: true}
l.changeMade()
}
// Services returns the locally registered services that the
// agent is aware of and are being kept in sync with the server
2014-01-21 19:52:25 +00:00
func (l *localState) Services() map[string]*structs.NodeService {
services := make(map[string]*structs.NodeService)
l.RLock()
defer l.RUnlock()
2014-01-21 19:52:25 +00:00
for name, serv := range l.services {
services[name] = serv
}
return services
}
// CheckToken is used to return the configured health check token, or
2015-04-28 18:53:53 +00:00
// if none is configured, the default agent ACL token.
func (l *localState) CheckToken(id string) string {
2015-04-28 18:53:53 +00:00
l.RLock()
defer l.RUnlock()
return l.checkToken(id)
}
// checkToken looks up the ACL token stored for the check ID and falls back
// to the agent's configured ACLToken when no token, or an empty one, is
// stored. It does no locking of its own.
func (l *localState) checkToken(id string) string {
	if stored := l.checkTokens[id]; stored != "" {
		return stored
	}
	return l.config.ACLToken
}
// AddCheck is used to add a health check to the local state.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
func (l *localState) AddCheck(check *structs.HealthCheck, token string) {
// Set the node name
2014-01-21 19:52:25 +00:00
check.Node = l.config.NodeName
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
l.checks[check.CheckID] = check
l.checkStatus[check.CheckID] = syncStatus{}
l.checkTokens[check.CheckID] = token
2014-01-21 19:52:25 +00:00
l.changeMade()
}
// RemoveCheck is used to remove a health check from the local state.
// The agent will make a best effort to ensure it is deregistered
2014-01-21 19:52:25 +00:00
func (l *localState) RemoveCheck(checkID string) {
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
delete(l.checks, checkID)
delete(l.checkTokens, checkID)
2014-01-21 19:52:25 +00:00
l.checkStatus[checkID] = syncStatus{remoteDelete: true}
l.changeMade()
}
// UpdateCheck is used to update the status of a check
2014-01-21 19:52:25 +00:00
func (l *localState) UpdateCheck(checkID, status, output string) {
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
check, ok := l.checks[checkID]
if !ok {
return
}
// Do nothing if update is idempotent
if check.Status == status && check.Output == output {
return
}
// Defer a sync if the output has changed. This is an optimization around
// frequent updates of output. Instead, we update the output internally,
// and periodically do a write-back to the servers. If there is a status
// change we do the write immediately.
if l.config.CheckUpdateInterval > 0 && check.Status == status {
check.Output = output
if _, ok := l.deferCheck[checkID]; !ok {
2015-04-24 14:51:40 +00:00
intv := time.Duration(uint64(l.config.CheckUpdateInterval)/2) + randomStagger(l.config.CheckUpdateInterval)
2015-04-23 20:37:20 +00:00
deferSync := time.AfterFunc(intv, func() {
l.Lock()
if _, ok := l.checkStatus[checkID]; ok {
l.checkStatus[checkID] = syncStatus{inSync: false}
l.changeMade()
}
delete(l.deferCheck, checkID)
l.Unlock()
})
l.deferCheck[checkID] = deferSync
}
return
}
// Update status and mark out of sync
check.Status = status
check.Output = output
2014-01-21 19:52:25 +00:00
l.checkStatus[checkID] = syncStatus{inSync: false}
l.changeMade()
}
// Checks returns the locally registered checks that the
// agent is aware of and are being kept in sync with the server
2014-01-21 19:52:25 +00:00
func (l *localState) Checks() map[string]*structs.HealthCheck {
checks := make(map[string]*structs.HealthCheck)
l.RLock()
defer l.RUnlock()
2014-01-21 19:52:25 +00:00
for name, check := range l.checks {
checks[name] = check
}
return checks
}
// antiEntropy is a long running method used to perform anti-entropy
// between local and remote state.
2014-01-21 19:52:25 +00:00
func (l *localState) antiEntropy(shutdownCh chan struct{}) {
SYNC:
// Sync our state with the servers
2014-01-21 19:52:25 +00:00
for {
2014-04-14 19:47:58 +00:00
err := l.setSyncState()
if err == nil {
break
}
l.logger.Printf("[ERR] agent: failed to sync remote state: %v", err)
select {
case <-l.consulCh:
// Stagger the retry on leader election, avoid a thundering heard
select {
case <-time.After(randomStagger(aeScale(syncStaggerIntv, len(l.iface.LANMembers())))):
case <-shutdownCh:
return
}
2014-06-06 21:38:01 +00:00
case <-time.After(syncRetryIntv + randomStagger(aeScale(syncRetryIntv, len(l.iface.LANMembers())))):
2014-04-14 19:47:58 +00:00
case <-shutdownCh:
return
}
}
// Force-trigger AE to pickup any changes
2014-01-21 19:52:25 +00:00
l.changeMade()
// Schedule the next full sync, with a random stagger
2014-01-21 19:52:25 +00:00
aeIntv := aeScale(l.config.AEInterval, len(l.iface.LANMembers()))
aeIntv = aeIntv + randomStagger(aeIntv)
aeTimer := time.After(aeIntv)
// Wait for sync events
for {
select {
case <-aeTimer:
goto SYNC
2014-01-21 19:52:25 +00:00
case <-l.triggerCh:
2014-02-07 19:58:24 +00:00
// Skip the sync if we are paused
if l.isPaused() {
continue
}
2014-01-21 19:52:25 +00:00
if err := l.syncChanges(); err != nil {
l.logger.Printf("[ERR] agent: failed to sync changes: %v", err)
}
2014-01-21 19:52:25 +00:00
case <-shutdownCh:
return
}
}
}
// setSyncState does a read of the server state, and updates
// the local syncStatus as appropriate
2014-01-21 19:52:25 +00:00
func (l *localState) setSyncState() error {
req := structs.NodeSpecificRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
QueryOptions: structs.QueryOptions{Token: l.config.ACLToken},
}
var out1 structs.IndexedNodeServices
var out2 structs.IndexedHealthChecks
if e := l.iface.RPC("Catalog.NodeServices", &req, &out1); e != nil {
return e
}
if err := l.iface.RPC("Health.NodeChecks", &req, &out2); err != nil {
return err
}
checks := out2.HealthChecks
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
services := make(map[string]*structs.NodeService)
if out1.NodeServices != nil {
services = out1.NodeServices.Services
}
for id, _ := range l.services {
// If the local service doesn't exist remotely, then sync it
if _, ok := services[id]; !ok {
l.serviceStatus[id] = syncStatus{inSync: false}
}
}
for id, service := range services {
// If we don't have the service locally, deregister it
existing, ok := l.services[id]
if !ok {
l.serviceStatus[id] = syncStatus{remoteDelete: true}
continue
2014-03-05 23:03:23 +00:00
}
// If our definition is different, we need to update it
equal := reflect.DeepEqual(existing, service)
l.serviceStatus[id] = syncStatus{inSync: equal}
}
for id, _ := range l.checks {
// Sync any check which doesn't exist on the remote side
2015-04-09 17:40:05 +00:00
found := false
for _, check := range checks {
if check.CheckID == id {
2015-04-09 17:40:05 +00:00
found = true
break
}
2015-04-09 17:40:05 +00:00
}
if !found {
l.checkStatus[id] = syncStatus{inSync: false}
}
}
for _, check := range checks {
// If we don't have the check locally, deregister it
id := check.CheckID
2014-01-21 19:52:25 +00:00
existing, ok := l.checks[id]
if !ok {
// The Serf check is created automatically, and does not
// need to be registered
if id == consul.SerfCheckID {
continue
}
2014-01-21 19:52:25 +00:00
l.checkStatus[id] = syncStatus{remoteDelete: true}
continue
}
// If our definition is different, we need to update it
var equal bool
if l.config.CheckUpdateInterval == 0 {
equal = reflect.DeepEqual(existing, check)
} else {
eCopy := new(structs.HealthCheck)
*eCopy = *existing
eCopy.Output = ""
check.Output = ""
equal = reflect.DeepEqual(eCopy, check)
}
// Update the status
2014-01-21 19:52:25 +00:00
l.checkStatus[id] = syncStatus{inSync: equal}
}
return nil
}
// syncChanges is used to scan the status our local services and checks
// and update any that are out of sync with the server
2014-01-21 19:52:25 +00:00
func (l *localState) syncChanges() error {
l.Lock()
defer l.Unlock()
// Sync the services
for id, status := range l.serviceStatus {
if status.remoteDelete {
if err := l.deleteService(id); err != nil {
return err
}
} else if !status.inSync {
if err := l.syncService(id); err != nil {
return err
}
2014-04-23 19:21:47 +00:00
} else {
l.logger.Printf("[DEBUG] agent: Service '%s' in sync", id)
}
}
// Sync the checks
for id, status := range l.checkStatus {
if status.remoteDelete {
if err := l.deleteCheck(id); err != nil {
return err
}
} else if !status.inSync {
// Cancel a deferred sync
if timer := l.deferCheck[id]; timer != nil {
timer.Stop()
delete(l.deferCheck, id)
}
if err := l.syncCheck(id); err != nil {
return err
}
2014-04-23 19:21:47 +00:00
} else {
l.logger.Printf("[DEBUG] agent: Check '%s' in sync", id)
}
}
return nil
}
// deleteService is used to delete a service from the server
2014-01-21 19:52:25 +00:00
func (l *localState) deleteService(id string) error {
if id == "" {
return fmt.Errorf("ServiceID missing")
}
req := structs.DeregisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
ServiceID: id,
2015-04-28 18:53:53 +00:00
WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)},
}
var out struct{}
2014-01-21 19:52:25 +00:00
err := l.iface.RPC("Catalog.Deregister", &req, &out)
if err == nil {
2014-01-21 19:52:25 +00:00
delete(l.serviceStatus, id)
l.logger.Printf("[INFO] agent: Deregistered service '%s'", id)
}
return err
}
// deleteCheck is used to delete a service from the server
2014-01-21 19:52:25 +00:00
func (l *localState) deleteCheck(id string) error {
if id == "" {
return fmt.Errorf("CheckID missing")
}
req := structs.DeregisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
CheckID: id,
2015-04-28 18:53:53 +00:00
WriteRequest: structs.WriteRequest{Token: l.checkToken(id)},
}
var out struct{}
2014-01-21 19:52:25 +00:00
err := l.iface.RPC("Catalog.Deregister", &req, &out)
if err == nil {
2014-01-21 19:52:25 +00:00
delete(l.checkStatus, id)
l.logger.Printf("[INFO] agent: Deregistered check '%s'", id)
}
return err
}
// syncService is used to sync a service to the server
2014-01-21 19:52:25 +00:00
func (l *localState) syncService(id string) error {
req := structs.RegisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
Address: l.config.AdvertiseAddr,
Service: l.services[id],
2015-04-28 18:53:53 +00:00
WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)},
}
// If the service has associated checks that are out of sync,
// piggyback them on the service sync so they are part of the
// same transaction and are registered atomically.
var checks structs.HealthChecks
for _, check := range l.checks {
if check.ServiceID == id {
if stat, ok := l.checkStatus[check.CheckID]; !ok || !stat.inSync {
checks = append(checks, check)
}
}
}
// Backwards-compatibility for Consul < 0.5
if len(checks) == 1 {
req.Check = checks[0]
} else {
req.Checks = checks
}
var out struct{}
2014-01-21 19:52:25 +00:00
err := l.iface.RPC("Catalog.Register", &req, &out)
if err == nil {
2014-01-21 19:52:25 +00:00
l.serviceStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[INFO] agent: Synced service '%s'", id)
for _, check := range checks {
l.checkStatus[check.CheckID] = syncStatus{inSync: true}
}
} else if strings.Contains(err.Error(), permissionDenied) {
l.serviceStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[WARN] agent: Service '%s' registration blocked by ACLs", id)
for _, check := range checks {
l.checkStatus[check.CheckID] = syncStatus{inSync: true}
}
return nil
}
return err
}
// syncCheck is used to sync a service to the server
func (l *localState) syncCheck(id string) error {
// Pull in the associated service if any
check := l.checks[id]
var service *structs.NodeService
if check.ServiceID != "" {
if serv, ok := l.services[check.ServiceID]; ok {
service = serv
2015-01-14 07:23:52 +00:00
}
}
req := structs.RegisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
Address: l.config.AdvertiseAddr,
Service: service,
Check: l.checks[id],
2015-04-28 18:53:53 +00:00
WriteRequest: structs.WriteRequest{Token: l.checkToken(id)},
}
var out struct{}
err := l.iface.RPC("Catalog.Register", &req, &out)
if err == nil {
l.checkStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[INFO] agent: Synced check '%s'", id)
} else if strings.Contains(err.Error(), permissionDenied) {
l.checkStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[WARN] agent: Check '%s' registration blocked by ACLs", id)
return nil
}
return err
}