2013-12-20 01:14:46 +00:00
|
|
|
package agent
|
|
|
|
|
2013-12-20 23:33:13 +00:00
|
|
|
import (
|
2014-11-24 08:36:03 +00:00
|
|
|
"encoding/json"
|
2013-12-20 23:33:13 +00:00
|
|
|
"fmt"
|
2013-12-21 00:39:32 +00:00
|
|
|
"io"
|
2015-06-04 21:33:30 +00:00
|
|
|
"io/ioutil"
|
2013-12-21 00:39:32 +00:00
|
|
|
"log"
|
2014-01-01 00:45:13 +00:00
|
|
|
"net"
|
2013-12-21 00:39:32 +00:00
|
|
|
"os"
|
2014-09-06 00:22:33 +00:00
|
|
|
"path/filepath"
|
2015-11-12 17:19:33 +00:00
|
|
|
"reflect"
|
2015-02-09 17:22:51 +00:00
|
|
|
"regexp"
|
2014-02-24 00:42:39 +00:00
|
|
|
"strconv"
|
2013-12-21 00:39:32 +00:00
|
|
|
"sync"
|
2015-06-05 23:17:07 +00:00
|
|
|
"time"
|
2014-06-16 21:36:12 +00:00
|
|
|
|
|
|
|
"github.com/hashicorp/consul/consul"
|
2015-10-13 04:48:15 +00:00
|
|
|
"github.com/hashicorp/consul/consul/state"
|
2014-06-16 21:36:12 +00:00
|
|
|
"github.com/hashicorp/consul/consul/structs"
|
2016-01-29 19:42:34 +00:00
|
|
|
"github.com/hashicorp/consul/lib"
|
2016-06-06 20:19:31 +00:00
|
|
|
"github.com/hashicorp/consul/types"
|
2015-04-09 20:23:14 +00:00
|
|
|
"github.com/hashicorp/serf/coordinate"
|
2014-06-16 21:36:12 +00:00
|
|
|
"github.com/hashicorp/serf/serf"
|
2013-12-20 23:33:13 +00:00
|
|
|
)
|
|
|
|
|
2014-11-24 08:36:03 +00:00
|
|
|
const (
	// servicesDir is the directory, joined onto the agent's data dir,
	// where agent service definitions are saved.
	servicesDir = "services"

	// checksDir is the directory, joined onto the agent's data dir,
	// where local agent checks are saved.
	checksDir = "checks"
	// checkStateDir sits below checksDir; presumably it holds persisted
	// check state (its use is not visible in this part of the file).
	checkStateDir = "checks/state"

	// IDs of the faux health checks used for maintenance mode.
	serviceMaintCheckPrefix = "_service_maintenance"
	nodeMaintCheckID        = "_node_maintenance"

	// Default reasons reported for node/service maintenance mode when the
	// caller did not supply one.
	defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
		"but no reason was provided. This is a default message."
	defaultServiceMaintReason = "Maintenance mode is enabled for this " +
		"service, but no reason was provided. This is a default message."
)
|
|
|
|
|
2015-02-09 17:22:51 +00:00
|
|
|
// dnsNameRe checks if a name or tag is dns-compatible: it matches only
// non-empty strings made up of ASCII letters, digits and dashes.
var dnsNameRe = regexp.MustCompile(`^[a-zA-Z0-9\-]+$`)
|
|
|
|
|
2013-12-20 01:14:46 +00:00
|
|
|
/*
|
|
|
|
The agent is the long running process that is run on every machine.
|
|
|
|
It exposes an RPC interface that is used by the CLI to control the
|
|
|
|
agent. The agent runs the query interfaces like HTTP, DNS, and RPC.
|
|
|
|
However, it can run in either a client, or server mode. In server
|
|
|
|
mode, it runs a full Consul server. In client-only mode, it only forwards
|
|
|
|
requests to other Consul servers.
|
|
|
|
*/
|
|
|
|
type Agent struct {
|
|
|
|
config *Config
|
2013-12-20 23:33:13 +00:00
|
|
|
|
2013-12-21 00:39:32 +00:00
|
|
|
// Used for writing our logs
|
|
|
|
logger *log.Logger
|
|
|
|
|
|
|
|
// Output sink for logs
|
|
|
|
logOutput io.Writer
|
|
|
|
|
2013-12-20 23:33:13 +00:00
|
|
|
// We have one of a client or a server, depending
|
|
|
|
// on our configuration
|
|
|
|
server *consul.Server
|
|
|
|
client *consul.Client
|
2013-12-21 00:39:32 +00:00
|
|
|
|
2014-01-16 01:14:50 +00:00
|
|
|
// state stores a local representation of the node,
|
|
|
|
// services and checks. Used for anti-entropy.
|
|
|
|
state localState
|
2014-01-21 20:05:56 +00:00
|
|
|
|
|
|
|
// checkMonitors maps the check ID to an associated monitor
|
2016-06-06 20:19:31 +00:00
|
|
|
checkMonitors map[types.CheckID]*CheckMonitor
|
2015-01-09 22:43:24 +00:00
|
|
|
|
|
|
|
// checkHTTPs maps the check ID to an associated HTTP check
|
2016-06-06 20:19:31 +00:00
|
|
|
checkHTTPs map[types.CheckID]*CheckHTTP
|
2015-01-09 22:43:24 +00:00
|
|
|
|
2015-07-23 11:45:08 +00:00
|
|
|
// checkTCPs maps the check ID to an associated TCP check
|
2016-06-06 20:19:31 +00:00
|
|
|
checkTCPs map[types.CheckID]*CheckTCP
|
2015-07-23 11:45:08 +00:00
|
|
|
|
2015-01-09 22:43:24 +00:00
|
|
|
// checkTTLs maps the check ID to an associated check TTL
|
2016-06-06 20:19:31 +00:00
|
|
|
checkTTLs map[types.CheckID]*CheckTTL
|
2015-01-09 22:43:24 +00:00
|
|
|
|
2015-10-22 22:29:13 +00:00
|
|
|
// checkDockers maps the check ID to an associated Docker Exec based check
|
2016-06-06 20:19:31 +00:00
|
|
|
checkDockers map[types.CheckID]*CheckDocker
|
2015-10-22 22:29:13 +00:00
|
|
|
|
2015-01-09 22:43:24 +00:00
|
|
|
// checkLock protects updates to the check* maps
|
|
|
|
checkLock sync.Mutex
|
2014-01-21 20:05:56 +00:00
|
|
|
|
2014-08-27 23:49:12 +00:00
|
|
|
// eventCh is used to receive user events
|
|
|
|
eventCh chan serf.UserEvent
|
|
|
|
|
2014-08-28 00:01:10 +00:00
|
|
|
// eventBuf stores the most recent events in a ring buffer
|
|
|
|
// using eventIndex as the next index to insert into. This
|
|
|
|
// is guarded by eventLock. When an insert happens, the
|
|
|
|
// eventNotify group is notified.
|
2014-08-28 17:56:30 +00:00
|
|
|
eventBuf []*UserEvent
|
2014-08-28 00:01:10 +00:00
|
|
|
eventIndex int
|
|
|
|
eventLock sync.RWMutex
|
2015-10-13 04:48:15 +00:00
|
|
|
eventNotify state.NotifyGroup
|
2014-08-28 00:01:10 +00:00
|
|
|
|
2014-01-21 20:05:56 +00:00
|
|
|
shutdown bool
|
|
|
|
shutdownCh chan struct{}
|
|
|
|
shutdownLock sync.Mutex
|
2015-11-12 17:19:33 +00:00
|
|
|
|
|
|
|
// endpoints lets you override RPC endpoints for testing. Not all
|
|
|
|
// agent methods use this, so use with care and never override
|
|
|
|
// outside of a unit test.
|
|
|
|
endpoints map[string]string
|
2016-01-13 05:10:25 +00:00
|
|
|
|
|
|
|
// reapLock is used to prevent child process reaping from interfering
|
|
|
|
// with normal waiting for subprocesses to complete. Any time you exec
|
|
|
|
// and wait, you should take a read lock on this mutex. Only the reaper
|
|
|
|
// takes the write lock. This setup prevents us from serializing all the
|
|
|
|
// child process management with each other, it just serializes them
|
|
|
|
// with the child process reaper.
|
|
|
|
reapLock sync.RWMutex
|
2013-12-20 01:14:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Create is used to create a new Agent. Returns
|
|
|
|
// the agent or potentially an error.
|
2013-12-21 00:39:32 +00:00
|
|
|
func Create(config *Config, logOutput io.Writer) (*Agent, error) {
|
|
|
|
// Ensure we have a log sink
|
|
|
|
if logOutput == nil {
|
|
|
|
logOutput = os.Stderr
|
|
|
|
}
|
|
|
|
|
2013-12-24 00:20:51 +00:00
|
|
|
// Validate the config
|
|
|
|
if config.Datacenter == "" {
|
|
|
|
return nil, fmt.Errorf("Must configure a Datacenter")
|
|
|
|
}
|
2015-11-29 04:40:05 +00:00
|
|
|
if config.DataDir == "" && !config.DevMode {
|
2013-12-24 00:20:51 +00:00
|
|
|
return nil, fmt.Errorf("Must configure a DataDir")
|
|
|
|
}
|
|
|
|
|
2014-01-01 00:45:13 +00:00
|
|
|
// Try to get an advertise address
|
|
|
|
if config.AdvertiseAddr != "" {
|
|
|
|
if ip := net.ParseIP(config.AdvertiseAddr); ip == nil {
|
|
|
|
return nil, fmt.Errorf("Failed to parse advertise address: %v", config.AdvertiseAddr)
|
|
|
|
}
|
2015-09-05 15:53:41 +00:00
|
|
|
} else if config.BindAddr != "0.0.0.0" && config.BindAddr != "" && config.BindAddr != "[::]" {
|
2014-04-11 22:46:55 +00:00
|
|
|
config.AdvertiseAddr = config.BindAddr
|
2014-01-01 00:45:13 +00:00
|
|
|
} else {
|
2015-09-05 15:53:41 +00:00
|
|
|
var err error
|
|
|
|
var ip net.IP
|
|
|
|
if config.BindAddr == "[::]" {
|
|
|
|
ip, err = consul.GetPublicIPv6()
|
|
|
|
} else {
|
|
|
|
ip, err = consul.GetPrivateIP()
|
|
|
|
}
|
2014-01-01 00:45:13 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("Failed to get advertise address: %v", err)
|
|
|
|
}
|
2014-05-15 18:27:30 +00:00
|
|
|
config.AdvertiseAddr = ip.String()
|
2014-01-01 00:45:13 +00:00
|
|
|
}
|
|
|
|
|
2015-03-21 09:14:03 +00:00
|
|
|
// Try to get an advertise address for the wan
|
|
|
|
if config.AdvertiseAddrWan != "" {
|
|
|
|
if ip := net.ParseIP(config.AdvertiseAddrWan); ip == nil {
|
|
|
|
return nil, fmt.Errorf("Failed to parse advertise address for wan: %v", config.AdvertiseAddrWan)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
config.AdvertiseAddrWan = config.AdvertiseAddr
|
|
|
|
}
|
|
|
|
|
2016-02-07 18:37:34 +00:00
|
|
|
// Create the default set of tagged addresses.
|
|
|
|
config.TaggedAddresses = map[string]string{
|
|
|
|
"wan": config.AdvertiseAddrWan,
|
|
|
|
}
|
|
|
|
|
2013-12-20 01:14:46 +00:00
|
|
|
agent := &Agent{
|
2014-01-21 20:05:56 +00:00
|
|
|
config: config,
|
|
|
|
logger: log.New(logOutput, "", log.LstdFlags),
|
|
|
|
logOutput: logOutput,
|
2016-06-06 20:19:31 +00:00
|
|
|
checkMonitors: make(map[types.CheckID]*CheckMonitor),
|
|
|
|
checkTTLs: make(map[types.CheckID]*CheckTTL),
|
|
|
|
checkHTTPs: make(map[types.CheckID]*CheckHTTP),
|
|
|
|
checkTCPs: make(map[types.CheckID]*CheckTCP),
|
|
|
|
checkDockers: make(map[types.CheckID]*CheckDocker),
|
2014-08-27 23:49:12 +00:00
|
|
|
eventCh: make(chan serf.UserEvent, 1024),
|
2014-08-28 17:56:30 +00:00
|
|
|
eventBuf: make([]*UserEvent, 256),
|
2014-01-21 20:05:56 +00:00
|
|
|
shutdownCh: make(chan struct{}),
|
2015-11-12 17:19:33 +00:00
|
|
|
endpoints: make(map[string]string),
|
2013-12-20 01:14:46 +00:00
|
|
|
}
|
2013-12-20 23:33:13 +00:00
|
|
|
|
2014-02-07 20:11:34 +00:00
|
|
|
// Initialize the local state
|
|
|
|
agent.state.Init(config, agent.logger)
|
|
|
|
|
2013-12-20 23:33:13 +00:00
|
|
|
// Setup either the client or the server
|
|
|
|
var err error
|
|
|
|
if config.Server {
|
|
|
|
err = agent.setupServer()
|
2014-02-07 20:11:34 +00:00
|
|
|
agent.state.SetIface(agent.server)
|
2014-10-14 22:05:41 +00:00
|
|
|
|
|
|
|
// Automatically register the "consul" service on server nodes
|
|
|
|
consulService := structs.NodeService{
|
2014-10-14 22:42:49 +00:00
|
|
|
Service: consul.ConsulServiceName,
|
|
|
|
ID: consul.ConsulServiceID,
|
2014-10-14 22:05:41 +00:00
|
|
|
Port: agent.config.Ports.Server,
|
2014-10-17 21:29:12 +00:00
|
|
|
Tags: []string{},
|
2014-10-14 22:05:41 +00:00
|
|
|
}
|
2015-05-05 00:36:17 +00:00
|
|
|
agent.state.AddService(&consulService, "")
|
2013-12-20 23:33:13 +00:00
|
|
|
} else {
|
|
|
|
err = agent.setupClient()
|
2014-02-07 20:11:34 +00:00
|
|
|
agent.state.SetIface(agent.client)
|
2013-12-20 23:33:13 +00:00
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2014-11-26 07:58:02 +00:00
|
|
|
// Load checks/services
|
2015-01-08 02:05:46 +00:00
|
|
|
if err := agent.loadServices(config); err != nil {
|
2014-11-24 08:36:03 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
2015-01-08 02:05:46 +00:00
|
|
|
if err := agent.loadChecks(config); err != nil {
|
2014-11-24 08:36:03 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2014-08-27 23:49:12 +00:00
|
|
|
// Start handling events
|
|
|
|
go agent.handleEvents()
|
|
|
|
|
2015-06-06 03:31:33 +00:00
|
|
|
// Start sending network coordinate to the server.
|
2015-06-20 00:47:42 +00:00
|
|
|
if !config.DisableCoordinates {
|
2015-06-06 03:31:33 +00:00
|
|
|
go agent.sendCoordinate()
|
|
|
|
}
|
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
// Write out the PID file if necessary
|
2014-05-06 16:57:53 +00:00
|
|
|
err = agent.storePid()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2014-05-06 03:29:50 +00:00
|
|
|
|
2013-12-20 01:14:46 +00:00
|
|
|
return agent, nil
|
|
|
|
}
|
|
|
|
|
2013-12-20 23:33:13 +00:00
|
|
|
// consulConfig is used to return a consul configuration
|
|
|
|
func (a *Agent) consulConfig() *consul.Config {
|
|
|
|
// Start with the provided config or default config
|
|
|
|
var base *consul.Config
|
|
|
|
if a.config.ConsulConfig != nil {
|
|
|
|
base = a.config.ConsulConfig
|
|
|
|
} else {
|
|
|
|
base = consul.DefaultConfig()
|
|
|
|
}
|
|
|
|
|
2015-11-29 04:40:05 +00:00
|
|
|
// Apply dev mode
|
|
|
|
base.DevMode = a.config.DevMode
|
|
|
|
|
2013-12-20 23:33:13 +00:00
|
|
|
// Override with our config
|
|
|
|
if a.config.Datacenter != "" {
|
|
|
|
base.Datacenter = a.config.Datacenter
|
|
|
|
}
|
|
|
|
if a.config.DataDir != "" {
|
|
|
|
base.DataDir = a.config.DataDir
|
|
|
|
}
|
|
|
|
if a.config.NodeName != "" {
|
|
|
|
base.NodeName = a.config.NodeName
|
|
|
|
}
|
2014-04-11 22:22:35 +00:00
|
|
|
if a.config.BindAddr != "" {
|
|
|
|
base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.BindAddr
|
|
|
|
base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.BindAddr
|
2013-12-20 23:33:13 +00:00
|
|
|
}
|
2014-04-11 22:22:35 +00:00
|
|
|
if a.config.Ports.SerfLan != 0 {
|
|
|
|
base.SerfLANConfig.MemberlistConfig.BindPort = a.config.Ports.SerfLan
|
|
|
|
base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.Ports.SerfLan
|
2013-12-20 23:33:13 +00:00
|
|
|
}
|
2014-04-11 22:22:35 +00:00
|
|
|
if a.config.Ports.SerfWan != 0 {
|
|
|
|
base.SerfWANConfig.MemberlistConfig.BindPort = a.config.Ports.SerfWan
|
|
|
|
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.Ports.SerfWan
|
2013-12-20 23:33:13 +00:00
|
|
|
}
|
2014-04-11 22:22:35 +00:00
|
|
|
if a.config.BindAddr != "" {
|
2014-04-14 19:37:49 +00:00
|
|
|
bindAddr := &net.TCPAddr{
|
|
|
|
IP: net.ParseIP(a.config.BindAddr),
|
|
|
|
Port: a.config.Ports.Server,
|
|
|
|
}
|
|
|
|
base.RPCAddr = bindAddr
|
2014-01-01 00:45:13 +00:00
|
|
|
}
|
|
|
|
if a.config.AdvertiseAddr != "" {
|
|
|
|
base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddr
|
2015-03-21 09:14:03 +00:00
|
|
|
if a.config.AdvertiseAddrWan != "" {
|
|
|
|
base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrWan
|
|
|
|
} else {
|
|
|
|
base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddr
|
|
|
|
}
|
2014-01-01 00:45:13 +00:00
|
|
|
base.RPCAdvertise = &net.TCPAddr{
|
|
|
|
IP: net.ParseIP(a.config.AdvertiseAddr),
|
2014-04-14 19:37:49 +00:00
|
|
|
Port: a.config.Ports.Server,
|
2014-01-01 00:45:13 +00:00
|
|
|
}
|
2013-12-20 23:33:13 +00:00
|
|
|
}
|
2015-06-05 11:44:42 +00:00
|
|
|
if a.config.AdvertiseAddrs.SerfLan != nil {
|
|
|
|
base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfLan.IP.String()
|
|
|
|
base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfLan.Port
|
|
|
|
}
|
|
|
|
if a.config.AdvertiseAddrs.SerfWan != nil {
|
|
|
|
base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfWan.IP.String()
|
|
|
|
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfWan.Port
|
|
|
|
}
|
2016-04-11 05:46:07 +00:00
|
|
|
if a.config.ReconnectTimeoutLan != 0 {
|
|
|
|
base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLan
|
|
|
|
}
|
|
|
|
if a.config.ReconnectTimeoutWan != 0 {
|
|
|
|
base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWan
|
|
|
|
}
|
2015-06-05 11:44:42 +00:00
|
|
|
if a.config.AdvertiseAddrs.RPC != nil {
|
|
|
|
base.RPCAdvertise = a.config.AdvertiseAddrs.RPC
|
|
|
|
}
|
2013-12-25 00:48:07 +00:00
|
|
|
if a.config.Bootstrap {
|
|
|
|
base.Bootstrap = true
|
|
|
|
}
|
2014-06-18 17:32:19 +00:00
|
|
|
if a.config.RejoinAfterLeave {
|
|
|
|
base.RejoinAfterLeave = true
|
|
|
|
}
|
2014-06-20 00:08:48 +00:00
|
|
|
if a.config.BootstrapExpect != 0 {
|
|
|
|
base.BootstrapExpect = a.config.BootstrapExpect
|
2014-06-16 21:36:12 +00:00
|
|
|
}
|
2014-03-09 22:57:03 +00:00
|
|
|
if a.config.Protocol > 0 {
|
|
|
|
base.ProtocolVersion = uint8(a.config.Protocol)
|
|
|
|
}
|
2014-08-05 22:20:35 +00:00
|
|
|
if a.config.ACLToken != "" {
|
|
|
|
base.ACLToken = a.config.ACLToken
|
|
|
|
}
|
2014-08-05 22:36:08 +00:00
|
|
|
if a.config.ACLMasterToken != "" {
|
|
|
|
base.ACLMasterToken = a.config.ACLMasterToken
|
|
|
|
}
|
2014-08-05 22:20:35 +00:00
|
|
|
if a.config.ACLDatacenter != "" {
|
|
|
|
base.ACLDatacenter = a.config.ACLDatacenter
|
|
|
|
}
|
|
|
|
if a.config.ACLTTLRaw != "" {
|
|
|
|
base.ACLTTL = a.config.ACLTTL
|
|
|
|
}
|
|
|
|
if a.config.ACLDefaultPolicy != "" {
|
|
|
|
base.ACLDefaultPolicy = a.config.ACLDefaultPolicy
|
|
|
|
}
|
|
|
|
if a.config.ACLDownPolicy != "" {
|
|
|
|
base.ACLDownPolicy = a.config.ACLDownPolicy
|
|
|
|
}
|
2016-08-03 05:04:11 +00:00
|
|
|
if a.config.ACLReplicationToken != "" {
|
|
|
|
base.ACLReplicationToken = a.config.ACLReplicationToken
|
|
|
|
}
|
2015-03-27 05:30:04 +00:00
|
|
|
if a.config.SessionTTLMinRaw != "" {
|
|
|
|
base.SessionTTLMin = a.config.SessionTTLMin
|
|
|
|
}
|
2013-12-20 23:33:13 +00:00
|
|
|
|
2014-06-06 22:36:40 +00:00
|
|
|
// Format the build string
|
|
|
|
revision := a.config.Revision
|
|
|
|
if len(revision) > 8 {
|
|
|
|
revision = revision[:8]
|
|
|
|
}
|
|
|
|
base.Build = fmt.Sprintf("%s%s:%s",
|
|
|
|
a.config.Version, a.config.VersionPrerelease, revision)
|
|
|
|
|
2014-04-04 23:52:39 +00:00
|
|
|
// Copy the TLS configuration
|
|
|
|
base.VerifyIncoming = a.config.VerifyIncoming
|
|
|
|
base.VerifyOutgoing = a.config.VerifyOutgoing
|
2015-05-11 22:16:13 +00:00
|
|
|
base.VerifyServerHostname = a.config.VerifyServerHostname
|
2014-04-04 23:52:39 +00:00
|
|
|
base.CAFile = a.config.CAFile
|
|
|
|
base.CertFile = a.config.CertFile
|
|
|
|
base.KeyFile = a.config.KeyFile
|
2014-06-13 18:27:44 +00:00
|
|
|
base.ServerName = a.config.ServerName
|
2015-05-11 22:16:13 +00:00
|
|
|
base.Domain = a.config.Domain
|
2014-04-04 23:52:39 +00:00
|
|
|
|
2014-02-07 20:11:34 +00:00
|
|
|
// Setup the ServerUp callback
|
|
|
|
base.ServerUp = a.state.ConsulServerUp
|
|
|
|
|
2014-08-27 23:49:12 +00:00
|
|
|
// Setup the user event callback
|
|
|
|
base.UserEventHandler = func(e serf.UserEvent) {
|
|
|
|
select {
|
|
|
|
case a.eventCh <- e:
|
|
|
|
case <-a.shutdownCh:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-12-21 00:39:32 +00:00
|
|
|
// Setup the loggers
|
|
|
|
base.LogOutput = a.logOutput
|
2013-12-20 23:33:13 +00:00
|
|
|
return base
|
|
|
|
}
|
|
|
|
|
|
|
|
// setupServer is used to initialize the Consul server
|
|
|
|
func (a *Agent) setupServer() error {
|
2014-09-12 02:52:16 +00:00
|
|
|
config := a.consulConfig()
|
|
|
|
|
2014-10-10 18:13:30 +00:00
|
|
|
if err := a.setupKeyrings(config); err != nil {
|
|
|
|
return fmt.Errorf("Failed to configure keyring: %v", err)
|
2014-09-12 02:52:16 +00:00
|
|
|
}
|
|
|
|
|
2014-09-12 05:46:57 +00:00
|
|
|
server, err := consul.NewServer(config)
|
2013-12-20 23:33:13 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Failed to start Consul server: %v", err)
|
|
|
|
}
|
|
|
|
a.server = server
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// setupClient is used to initialize the Consul client
|
|
|
|
func (a *Agent) setupClient() error {
|
2014-09-12 05:46:57 +00:00
|
|
|
config := a.consulConfig()
|
|
|
|
|
2014-10-10 18:13:30 +00:00
|
|
|
if err := a.setupKeyrings(config); err != nil {
|
|
|
|
return fmt.Errorf("Failed to configure keyring: %v", err)
|
2014-09-12 05:46:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
client, err := consul.NewClient(config)
|
2013-12-20 23:33:13 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Failed to start Consul client: %v", err)
|
|
|
|
}
|
|
|
|
a.client = client
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-10-10 18:13:30 +00:00
|
|
|
// setupKeyrings is used to initialize and load keyrings during agent startup
|
|
|
|
func (a *Agent) setupKeyrings(config *consul.Config) error {
|
|
|
|
fileLAN := filepath.Join(a.config.DataDir, serfLANKeyring)
|
|
|
|
fileWAN := filepath.Join(a.config.DataDir, serfWANKeyring)
|
|
|
|
|
|
|
|
if a.config.EncryptKey == "" {
|
|
|
|
goto LOAD
|
|
|
|
}
|
|
|
|
if _, err := os.Stat(fileLAN); err != nil {
|
|
|
|
if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if a.config.Server {
|
|
|
|
if _, err := os.Stat(fileWAN); err != nil {
|
|
|
|
if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
LOAD:
|
|
|
|
if _, err := os.Stat(fileLAN); err == nil {
|
|
|
|
config.SerfLANConfig.KeyringFile = fileLAN
|
|
|
|
}
|
|
|
|
if err := loadKeyringFile(config.SerfLANConfig); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if a.config.Server {
|
|
|
|
if _, err := os.Stat(fileWAN); err == nil {
|
|
|
|
config.SerfWANConfig.KeyringFile = fileWAN
|
|
|
|
}
|
|
|
|
if err := loadKeyringFile(config.SerfWANConfig); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Success!
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2013-12-20 23:33:13 +00:00
|
|
|
// RPC is used to make an RPC call to the Consul servers
|
|
|
|
// This allows the agent to implement the Consul.Interface
|
|
|
|
func (a *Agent) RPC(method string, args interface{}, reply interface{}) error {
|
|
|
|
if a.server != nil {
|
|
|
|
return a.server.RPC(method, args, reply)
|
|
|
|
}
|
|
|
|
return a.client.RPC(method, args, reply)
|
|
|
|
}
|
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// Leave is used to prepare the agent for a graceful shutdown
|
2013-12-20 01:14:46 +00:00
|
|
|
func (a *Agent) Leave() error {
|
2013-12-20 23:33:13 +00:00
|
|
|
if a.server != nil {
|
|
|
|
return a.server.Leave()
|
|
|
|
} else {
|
|
|
|
return a.client.Leave()
|
|
|
|
}
|
2013-12-20 01:14:46 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// Shutdown is used to hard stop the agent. Should be
|
2014-12-04 23:25:06 +00:00
|
|
|
// preceded by a call to Leave to do it gracefully.
|
2013-12-20 01:14:46 +00:00
|
|
|
func (a *Agent) Shutdown() error {
|
2013-12-21 00:39:32 +00:00
|
|
|
a.shutdownLock.Lock()
|
|
|
|
defer a.shutdownLock.Unlock()
|
|
|
|
|
|
|
|
if a.shutdown {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-01-21 20:05:56 +00:00
|
|
|
// Stop all the checks
|
|
|
|
a.checkLock.Lock()
|
|
|
|
defer a.checkLock.Unlock()
|
|
|
|
for _, chk := range a.checkMonitors {
|
|
|
|
chk.Stop()
|
|
|
|
}
|
|
|
|
for _, chk := range a.checkTTLs {
|
|
|
|
chk.Stop()
|
|
|
|
}
|
|
|
|
|
2015-01-09 22:43:24 +00:00
|
|
|
for _, chk := range a.checkHTTPs {
|
|
|
|
chk.Stop()
|
|
|
|
}
|
|
|
|
|
2015-07-23 11:45:08 +00:00
|
|
|
for _, chk := range a.checkTCPs {
|
|
|
|
chk.Stop()
|
|
|
|
}
|
|
|
|
|
2013-12-21 00:39:32 +00:00
|
|
|
a.logger.Println("[INFO] agent: requesting shutdown")
|
|
|
|
var err error
|
2013-12-20 23:33:13 +00:00
|
|
|
if a.server != nil {
|
2013-12-21 00:39:32 +00:00
|
|
|
err = a.server.Shutdown()
|
2013-12-20 23:33:13 +00:00
|
|
|
} else {
|
2013-12-21 00:39:32 +00:00
|
|
|
err = a.client.Shutdown()
|
2013-12-20 23:33:13 +00:00
|
|
|
}
|
2013-12-21 00:39:32 +00:00
|
|
|
|
2014-05-06 16:57:53 +00:00
|
|
|
pidErr := a.deletePid()
|
|
|
|
if pidErr != nil {
|
|
|
|
a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
|
|
|
|
}
|
2014-05-06 03:29:50 +00:00
|
|
|
|
2013-12-21 00:39:32 +00:00
|
|
|
a.logger.Println("[INFO] agent: shutdown complete")
|
|
|
|
a.shutdown = true
|
|
|
|
close(a.shutdownCh)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// ShutdownCh is used to return a channel that can be
|
|
|
|
// selected to wait for the agent to perform a shutdown.
|
2013-12-21 00:39:32 +00:00
|
|
|
func (a *Agent) ShutdownCh() <-chan struct{} {
|
|
|
|
return a.shutdownCh
|
2013-12-20 01:14:46 +00:00
|
|
|
}
|
2013-12-30 22:42:41 +00:00
|
|
|
|
|
|
|
// JoinLAN is used to have the agent join a LAN cluster
|
|
|
|
func (a *Agent) JoinLAN(addrs []string) (n int, err error) {
|
|
|
|
a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs)
|
|
|
|
if a.server != nil {
|
|
|
|
n, err = a.server.JoinLAN(addrs)
|
|
|
|
} else {
|
|
|
|
n, err = a.client.JoinLAN(addrs)
|
|
|
|
}
|
|
|
|
a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// JoinWAN is used to have the agent join a WAN cluster
|
|
|
|
func (a *Agent) JoinWAN(addrs []string) (n int, err error) {
|
|
|
|
a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs)
|
|
|
|
if a.server != nil {
|
|
|
|
n, err = a.server.JoinWAN(addrs)
|
|
|
|
} else {
|
|
|
|
err = fmt.Errorf("Must be a server to join WAN cluster")
|
|
|
|
}
|
|
|
|
a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// ForceLeave is used to remove a failed node from the cluster
|
|
|
|
func (a *Agent) ForceLeave(node string) (err error) {
|
|
|
|
a.logger.Printf("[INFO] Force leaving node: %v", node)
|
|
|
|
if a.server != nil {
|
|
|
|
err = a.server.RemoveFailedNode(node)
|
|
|
|
} else {
|
|
|
|
err = a.client.RemoveFailedNode(node)
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
a.logger.Printf("[WARN] Failed to remove node: %v", err)
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2014-05-25 23:59:48 +00:00
|
|
|
// LocalMember is used to return the local node
|
|
|
|
func (a *Agent) LocalMember() serf.Member {
|
2014-05-29 18:21:56 +00:00
|
|
|
if a.server != nil {
|
|
|
|
return a.server.LocalMember()
|
|
|
|
} else {
|
|
|
|
return a.client.LocalMember()
|
|
|
|
}
|
2014-05-25 23:59:48 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// LANMembers is used to retrieve the LAN members
|
2013-12-30 22:42:41 +00:00
|
|
|
func (a *Agent) LANMembers() []serf.Member {
|
|
|
|
if a.server != nil {
|
|
|
|
return a.server.LANMembers()
|
|
|
|
} else {
|
|
|
|
return a.client.LANMembers()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// WANMembers is used to retrieve the WAN members
|
2013-12-30 22:42:41 +00:00
|
|
|
func (a *Agent) WANMembers() []serf.Member {
|
|
|
|
if a.server != nil {
|
|
|
|
return a.server.WANMembers()
|
|
|
|
} else {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
2014-01-21 19:52:25 +00:00
|
|
|
|
|
|
|
// StartSync is called once Services and Checks are registered.
|
|
|
|
// This is called to prevent a race between clients and the anti-entropy routines
|
|
|
|
func (a *Agent) StartSync() {
|
|
|
|
// Start the anti entropy routine
|
|
|
|
go a.state.antiEntropy(a.shutdownCh)
|
|
|
|
}
|
2014-01-30 21:39:02 +00:00
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// PauseSync is used to pause anti-entropy while bulk changes are make
|
2014-02-07 20:19:56 +00:00
|
|
|
func (a *Agent) PauseSync() {
|
|
|
|
a.state.Pause()
|
|
|
|
}
|
|
|
|
|
2014-04-18 05:46:31 +00:00
|
|
|
// ResumeSync is used to unpause anti-entropy after bulk changes are make
|
2014-02-07 20:19:56 +00:00
|
|
|
func (a *Agent) ResumeSync() {
|
|
|
|
a.state.Resume()
|
|
|
|
}
|
|
|
|
|
2015-10-16 02:28:31 +00:00
|
|
|
// Returns the coordinate of this node in the local pool (assumes coordinates
|
|
|
|
// are enabled, so check that before calling).
|
|
|
|
func (a *Agent) GetCoordinate() (*coordinate.Coordinate, error) {
|
|
|
|
if a.config.Server {
|
|
|
|
return a.server.GetLANCoordinate()
|
|
|
|
} else {
|
|
|
|
return a.client.GetCoordinate()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-06-06 03:31:33 +00:00
|
|
|
// sendCoordinate is a long-running loop that periodically sends our coordinate
|
|
|
|
// to the server. Closing the agent's shutdownChannel will cause this to exit.
|
|
|
|
func (a *Agent) sendCoordinate() {
|
2015-04-15 23:12:45 +00:00
|
|
|
for {
|
2015-06-30 19:02:05 +00:00
|
|
|
rate := a.config.SyncCoordinateRateTarget
|
|
|
|
min := a.config.SyncCoordinateIntervalMin
|
2016-01-29 19:42:34 +00:00
|
|
|
intv := lib.RateScaledInterval(rate, min, len(a.LANMembers()))
|
|
|
|
intv = intv + lib.RandomStagger(intv)
|
2015-06-06 03:31:33 +00:00
|
|
|
|
2015-04-15 23:12:45 +00:00
|
|
|
select {
|
2015-04-29 01:47:41 +00:00
|
|
|
case <-time.After(intv):
|
2015-10-27 21:30:29 +00:00
|
|
|
members := a.LANMembers()
|
|
|
|
grok, err := consul.CanServersUnderstandProtocol(members, 3)
|
|
|
|
if err != nil {
|
|
|
|
a.logger.Printf("[ERR] agent: failed to check servers: %s", err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if !grok {
|
|
|
|
a.logger.Printf("[DEBUG] agent: skipping coordinate updates until servers are upgraded")
|
2015-10-16 02:28:31 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-10-27 21:30:29 +00:00
|
|
|
c, err := a.GetCoordinate()
|
|
|
|
if err != nil {
|
2015-06-29 22:53:29 +00:00
|
|
|
a.logger.Printf("[ERR] agent: failed to get coordinate: %s", err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-06-30 01:26:04 +00:00
|
|
|
// TODO - Consider adding a distance check so we don't send
|
|
|
|
// an update if the position hasn't changed by more than a
|
|
|
|
// threshold.
|
2015-04-15 23:12:45 +00:00
|
|
|
req := structs.CoordinateUpdateRequest{
|
2015-04-18 21:05:29 +00:00
|
|
|
Datacenter: a.config.Datacenter,
|
|
|
|
Node: a.config.NodeName,
|
2015-04-15 23:12:45 +00:00
|
|
|
Coord: c,
|
2015-04-16 20:54:29 +00:00
|
|
|
WriteRequest: structs.WriteRequest{Token: a.config.ACLToken},
|
2015-04-15 23:12:45 +00:00
|
|
|
}
|
|
|
|
var reply struct{}
|
2015-04-16 20:54:29 +00:00
|
|
|
if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
|
2015-06-06 03:31:33 +00:00
|
|
|
a.logger.Printf("[ERR] agent: coordinate update error: %s", err)
|
2015-06-29 22:53:29 +00:00
|
|
|
continue
|
2015-04-15 23:12:45 +00:00
|
|
|
}
|
2015-04-19 00:49:49 +00:00
|
|
|
case <-a.shutdownCh:
|
2015-04-15 23:12:45 +00:00
|
|
|
return
|
|
|
|
}
|
2015-04-13 20:45:42 +00:00
|
|
|
}
|
2015-04-09 20:23:14 +00:00
|
|
|
}
|
|
|
|
|
2014-11-24 08:36:03 +00:00
|
|
|
// persistService saves a service definition to a JSON file in the data dir
|
|
|
|
func (a *Agent) persistService(service *structs.NodeService) error {
|
2015-01-08 03:11:21 +00:00
|
|
|
svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID))
|
2015-05-06 05:08:03 +00:00
|
|
|
wrapped := persistedService{
|
|
|
|
Token: a.state.ServiceToken(service.ID),
|
|
|
|
Service: service,
|
|
|
|
}
|
|
|
|
encoded, err := json.Marshal(wrapped)
|
|
|
|
if err != nil {
|
2016-04-26 22:03:26 +00:00
|
|
|
return err
|
2015-05-06 05:08:03 +00:00
|
|
|
}
|
|
|
|
if err := os.MkdirAll(filepath.Dir(svcPath), 0700); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
fh, err := os.OpenFile(svcPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer fh.Close()
|
|
|
|
if _, err := fh.Write(encoded); err != nil {
|
|
|
|
return err
|
2014-11-24 08:36:03 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// purgeService removes a persisted service definition file from the data dir
|
|
|
|
func (a *Agent) purgeService(serviceID string) error {
|
2015-01-08 03:11:21 +00:00
|
|
|
svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID))
|
2014-11-24 08:36:03 +00:00
|
|
|
if _, err := os.Stat(svcPath); err == nil {
|
|
|
|
return os.Remove(svcPath)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// persistCheck saves a check definition to the local agent's state directory
|
2014-11-29 20:25:01 +00:00
|
|
|
func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *CheckType) error {
|
2016-06-06 08:53:30 +00:00
|
|
|
checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID))
|
2014-11-29 20:25:01 +00:00
|
|
|
|
|
|
|
// Create the persisted check
|
2015-04-28 19:44:46 +00:00
|
|
|
wrapped := persistedCheck{
|
|
|
|
Check: check,
|
|
|
|
ChkType: chkType,
|
|
|
|
Token: a.state.CheckToken(check.CheckID),
|
|
|
|
}
|
2014-11-29 20:25:01 +00:00
|
|
|
|
2015-04-28 19:44:46 +00:00
|
|
|
encoded, err := json.Marshal(wrapped)
|
2014-11-29 20:25:01 +00:00
|
|
|
if err != nil {
|
2016-04-26 22:03:26 +00:00
|
|
|
return err
|
2014-11-29 20:25:01 +00:00
|
|
|
}
|
|
|
|
if err := os.MkdirAll(filepath.Dir(checkPath), 0700); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-05-06 05:08:03 +00:00
|
|
|
fh, err := os.OpenFile(checkPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
|
2014-11-29 20:25:01 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer fh.Close()
|
|
|
|
if _, err := fh.Write(encoded); err != nil {
|
|
|
|
return err
|
2014-11-24 08:36:03 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// purgeCheck removes a persisted check definition file from the data dir
|
2016-06-06 20:19:31 +00:00
|
|
|
func (a *Agent) purgeCheck(checkID types.CheckID) error {
|
2016-06-06 08:53:30 +00:00
|
|
|
checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID))
|
2014-11-24 08:36:03 +00:00
|
|
|
if _, err := os.Stat(checkPath); err == nil {
|
|
|
|
return os.Remove(checkPath)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
// AddService is used to add a service entry.
|
|
|
|
// This entry is persistent and the agent will make a best effort to
|
|
|
|
// ensure it is registered
|
2015-05-05 00:36:17 +00:00
|
|
|
func (a *Agent) AddService(service *structs.NodeService, chkTypes CheckTypes, persist bool, token string) error {
|
2014-01-30 21:39:02 +00:00
|
|
|
if service.Service == "" {
|
|
|
|
return fmt.Errorf("Service name missing")
|
|
|
|
}
|
|
|
|
if service.ID == "" && service.Service != "" {
|
|
|
|
service.ID = service.Service
|
|
|
|
}
|
2015-01-14 01:52:17 +00:00
|
|
|
for _, check := range chkTypes {
|
|
|
|
if !check.Valid() {
|
|
|
|
return fmt.Errorf("Check type is not valid")
|
|
|
|
}
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
|
2015-02-09 17:22:51 +00:00
|
|
|
// Warn if the service name is incompatible with DNS
|
2015-02-09 17:30:06 +00:00
|
|
|
if !dnsNameRe.MatchString(service.Service) {
|
2015-02-09 17:22:51 +00:00
|
|
|
a.logger.Printf("[WARN] Service name %q will not be discoverable "+
|
2015-02-09 17:59:21 +00:00
|
|
|
"via DNS due to invalid characters. Valid characters include "+
|
|
|
|
"all alpha-numerics and dashes.", service.Service)
|
2015-02-09 17:22:51 +00:00
|
|
|
}
|
|
|
|
|
2015-02-09 17:30:06 +00:00
|
|
|
// Warn if any tags are incompatible with DNS
|
|
|
|
for _, tag := range service.Tags {
|
|
|
|
if !dnsNameRe.MatchString(tag) {
|
|
|
|
a.logger.Printf("[WARN] Service tag %q will not be discoverable "+
|
2015-02-09 17:59:21 +00:00
|
|
|
"via DNS due to invalid characters. Valid characters include "+
|
|
|
|
"all alpha-numerics and dashes.", tag)
|
2015-02-09 17:30:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-06 19:28:42 +00:00
|
|
|
// Pause the service syncs during modification
|
|
|
|
a.PauseSync()
|
|
|
|
defer a.ResumeSync()
|
|
|
|
|
|
|
|
// Take a snapshot of the current state of checks (if any), and
|
|
|
|
// restore them before resuming anti-entropy.
|
|
|
|
snap := a.snapshotCheckState()
|
|
|
|
defer a.restoreCheckState(snap)
|
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
// Add the service
|
2015-05-05 00:36:17 +00:00
|
|
|
a.state.AddService(service, token)
|
2014-01-30 21:39:02 +00:00
|
|
|
|
2014-11-24 08:36:03 +00:00
|
|
|
// Persist the service to a file
|
2015-11-29 04:40:05 +00:00
|
|
|
if persist && !a.config.DevMode {
|
2014-11-25 03:24:32 +00:00
|
|
|
if err := a.persistService(service); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-11-24 09:58:39 +00:00
|
|
|
}
|
2014-11-24 08:36:03 +00:00
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
// Create an associated health check
|
2015-01-14 01:52:17 +00:00
|
|
|
for i, chkType := range chkTypes {
|
|
|
|
checkID := fmt.Sprintf("service:%s", service.ID)
|
|
|
|
if len(chkTypes) > 1 {
|
|
|
|
checkID += fmt.Sprintf(":%d", i+1)
|
|
|
|
}
|
2014-01-30 21:39:02 +00:00
|
|
|
check := &structs.HealthCheck{
|
|
|
|
Node: a.config.NodeName,
|
2016-06-06 20:19:31 +00:00
|
|
|
CheckID: types.CheckID(checkID),
|
2014-01-30 21:39:02 +00:00
|
|
|
Name: fmt.Sprintf("Service '%s' check", service.Service),
|
2014-10-15 17:14:46 +00:00
|
|
|
Status: structs.HealthCritical,
|
2014-11-07 02:24:04 +00:00
|
|
|
Notes: chkType.Notes,
|
2014-01-30 21:39:02 +00:00
|
|
|
ServiceID: service.ID,
|
|
|
|
ServiceName: service.Service,
|
|
|
|
}
|
2015-04-12 00:53:48 +00:00
|
|
|
if chkType.Status != "" {
|
|
|
|
check.Status = chkType.Status
|
|
|
|
}
|
2015-05-05 00:36:17 +00:00
|
|
|
if err := a.AddCheck(check, chkType, persist, token); err != nil {
|
2014-01-30 21:39:02 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// RemoveService is used to remove a service entry.
|
|
|
|
// The agent will make a best effort to ensure it is deregistered
|
2014-11-26 07:58:02 +00:00
|
|
|
func (a *Agent) RemoveService(serviceID string, persist bool) error {
|
2014-10-14 22:05:41 +00:00
|
|
|
// Protect "consul" service from deletion by a user
|
2014-10-14 22:42:49 +00:00
|
|
|
if a.server != nil && serviceID == consul.ConsulServiceID {
|
2014-10-14 22:05:41 +00:00
|
|
|
return fmt.Errorf(
|
2014-10-15 21:56:15 +00:00
|
|
|
"Deregistering the %s service is not allowed",
|
|
|
|
consul.ConsulServiceID)
|
2014-10-14 22:05:41 +00:00
|
|
|
}
|
|
|
|
|
2015-01-26 16:06:49 +00:00
|
|
|
// Validate ServiceID
|
|
|
|
if serviceID == "" {
|
|
|
|
return fmt.Errorf("ServiceID missing")
|
|
|
|
}
|
|
|
|
|
2015-09-15 12:22:08 +00:00
|
|
|
// Remove service immediately
|
2014-01-30 21:39:02 +00:00
|
|
|
a.state.RemoveService(serviceID)
|
|
|
|
|
2014-11-24 08:36:03 +00:00
|
|
|
// Remove the service from the data dir
|
2014-11-26 07:58:02 +00:00
|
|
|
if persist {
|
|
|
|
if err := a.purgeService(serviceID); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-11-24 08:36:03 +00:00
|
|
|
}
|
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
// Deregister any associated health checks
|
2015-05-07 22:30:01 +00:00
|
|
|
for checkID, health := range a.state.Checks() {
|
|
|
|
if health.ServiceID != serviceID {
|
2015-01-14 01:52:17 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
if err := a.RemoveCheck(checkID, persist); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-01-08 06:26:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
log.Printf("[DEBUG] agent: removed service %q", serviceID)
|
|
|
|
return nil
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// AddCheck is used to add a health check to the agent.
|
|
|
|
// This entry is persistent and the agent will make a best effort to
|
|
|
|
// ensure it is registered. The Check may include a CheckType which
|
|
|
|
// is used to automatically update the check status
|
2015-05-05 00:36:17 +00:00
|
|
|
func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist bool, token string) error {
|
2014-01-30 21:39:02 +00:00
|
|
|
if check.CheckID == "" {
|
|
|
|
return fmt.Errorf("CheckID missing")
|
|
|
|
}
|
|
|
|
if chkType != nil && !chkType.Valid() {
|
|
|
|
return fmt.Errorf("Check type is not valid")
|
|
|
|
}
|
|
|
|
|
2015-01-14 01:52:17 +00:00
|
|
|
if check.ServiceID != "" {
|
|
|
|
svc, ok := a.state.Services()[check.ServiceID]
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
|
|
|
|
}
|
|
|
|
check.ServiceName = svc.Service
|
|
|
|
}
|
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
a.checkLock.Lock()
|
|
|
|
defer a.checkLock.Unlock()
|
|
|
|
|
|
|
|
// Check if already registered
|
|
|
|
if chkType != nil {
|
|
|
|
if chkType.IsTTL() {
|
2014-06-17 23:48:19 +00:00
|
|
|
if existing, ok := a.checkTTLs[check.CheckID]; ok {
|
|
|
|
existing.Stop()
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ttl := &CheckTTL{
|
|
|
|
Notify: &a.state,
|
|
|
|
CheckID: check.CheckID,
|
|
|
|
TTL: chkType.TTL,
|
|
|
|
Logger: a.logger,
|
|
|
|
}
|
2015-06-05 23:17:07 +00:00
|
|
|
|
|
|
|
// Restore persisted state, if any
|
2015-06-08 16:35:10 +00:00
|
|
|
if err := a.loadCheckState(check); err != nil {
|
2015-06-05 23:17:07 +00:00
|
|
|
a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
|
|
|
|
check.CheckID, err)
|
|
|
|
}
|
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
ttl.Start()
|
|
|
|
a.checkTTLs[check.CheckID] = ttl
|
|
|
|
|
2015-01-09 22:43:24 +00:00
|
|
|
} else if chkType.IsHTTP() {
|
|
|
|
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
|
|
|
|
existing.Stop()
|
|
|
|
}
|
|
|
|
if chkType.Interval < MinInterval {
|
|
|
|
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
|
|
|
check.CheckID, MinInterval))
|
|
|
|
chkType.Interval = MinInterval
|
|
|
|
}
|
|
|
|
|
|
|
|
http := &CheckHTTP{
|
|
|
|
Notify: &a.state,
|
|
|
|
CheckID: check.CheckID,
|
|
|
|
HTTP: chkType.HTTP,
|
|
|
|
Interval: chkType.Interval,
|
2015-01-29 06:37:48 +00:00
|
|
|
Timeout: chkType.Timeout,
|
2015-01-09 22:43:24 +00:00
|
|
|
Logger: a.logger,
|
|
|
|
}
|
|
|
|
http.Start()
|
|
|
|
a.checkHTTPs[check.CheckID] = http
|
|
|
|
|
2015-07-23 11:45:08 +00:00
|
|
|
} else if chkType.IsTCP() {
|
|
|
|
if existing, ok := a.checkTCPs[check.CheckID]; ok {
|
|
|
|
existing.Stop()
|
|
|
|
}
|
|
|
|
if chkType.Interval < MinInterval {
|
|
|
|
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
|
|
|
check.CheckID, MinInterval))
|
|
|
|
chkType.Interval = MinInterval
|
|
|
|
}
|
|
|
|
|
|
|
|
tcp := &CheckTCP{
|
|
|
|
Notify: &a.state,
|
|
|
|
CheckID: check.CheckID,
|
|
|
|
TCP: chkType.TCP,
|
|
|
|
Interval: chkType.Interval,
|
|
|
|
Timeout: chkType.Timeout,
|
|
|
|
Logger: a.logger,
|
|
|
|
}
|
|
|
|
tcp.Start()
|
|
|
|
a.checkTCPs[check.CheckID] = tcp
|
|
|
|
|
2015-10-22 22:29:13 +00:00
|
|
|
} else if chkType.IsDocker() {
|
|
|
|
if existing, ok := a.checkDockers[check.CheckID]; ok {
|
|
|
|
existing.Stop()
|
|
|
|
}
|
|
|
|
if chkType.Interval < MinInterval {
|
|
|
|
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
|
|
|
check.CheckID, MinInterval))
|
|
|
|
chkType.Interval = MinInterval
|
|
|
|
}
|
|
|
|
|
|
|
|
dockerCheck := &CheckDocker{
|
|
|
|
Notify: &a.state,
|
|
|
|
CheckID: check.CheckID,
|
2015-11-18 15:40:02 +00:00
|
|
|
DockerContainerID: chkType.DockerContainerID,
|
2015-10-22 22:29:13 +00:00
|
|
|
Shell: chkType.Shell,
|
|
|
|
Script: chkType.Script,
|
|
|
|
Interval: chkType.Interval,
|
|
|
|
Logger: a.logger,
|
|
|
|
}
|
2015-10-26 23:45:12 +00:00
|
|
|
if err := dockerCheck.Init(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-10-22 22:29:13 +00:00
|
|
|
dockerCheck.Start()
|
|
|
|
a.checkDockers[check.CheckID] = dockerCheck
|
2015-10-27 02:52:32 +00:00
|
|
|
} else if chkType.IsMonitor() {
|
2015-10-26 22:02:23 +00:00
|
|
|
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
|
|
|
existing.Stop()
|
|
|
|
}
|
|
|
|
if chkType.Interval < MinInterval {
|
|
|
|
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
|
|
|
check.CheckID, MinInterval))
|
|
|
|
chkType.Interval = MinInterval
|
|
|
|
}
|
|
|
|
|
|
|
|
monitor := &CheckMonitor{
|
|
|
|
Notify: &a.state,
|
|
|
|
CheckID: check.CheckID,
|
|
|
|
Script: chkType.Script,
|
|
|
|
Interval: chkType.Interval,
|
2016-02-26 03:18:20 +00:00
|
|
|
Timeout: chkType.Timeout,
|
2015-10-26 22:02:23 +00:00
|
|
|
Logger: a.logger,
|
2016-01-13 05:10:25 +00:00
|
|
|
ReapLock: &a.reapLock,
|
2015-10-26 22:02:23 +00:00
|
|
|
}
|
|
|
|
monitor.Start()
|
|
|
|
a.checkMonitors[check.CheckID] = monitor
|
2015-10-27 02:52:32 +00:00
|
|
|
} else {
|
|
|
|
return fmt.Errorf("Check type is not valid")
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add to the local state for anti-entropy
|
2015-05-05 00:36:17 +00:00
|
|
|
a.state.AddCheck(check, token)
|
2014-11-24 08:36:03 +00:00
|
|
|
|
|
|
|
// Persist the check
|
2015-11-29 04:40:05 +00:00
|
|
|
if persist && !a.config.DevMode {
|
2014-11-29 20:25:01 +00:00
|
|
|
return a.persistCheck(check, chkType)
|
2014-11-25 03:24:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// RemoveCheck is used to remove a health check.
|
|
|
|
// The agent will make a best effort to ensure it is deregistered
|
2016-06-06 20:19:31 +00:00
|
|
|
func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
|
2015-01-26 16:06:49 +00:00
|
|
|
// Validate CheckID
|
|
|
|
if checkID == "" {
|
|
|
|
return fmt.Errorf("CheckID missing")
|
|
|
|
}
|
|
|
|
|
2014-01-30 21:39:02 +00:00
|
|
|
// Add to the local state for anti-entropy
|
|
|
|
a.state.RemoveCheck(checkID)
|
|
|
|
|
|
|
|
a.checkLock.Lock()
|
|
|
|
defer a.checkLock.Unlock()
|
|
|
|
|
|
|
|
// Stop any monitors
|
|
|
|
if check, ok := a.checkMonitors[checkID]; ok {
|
|
|
|
check.Stop()
|
|
|
|
delete(a.checkMonitors, checkID)
|
|
|
|
}
|
2015-01-12 22:34:39 +00:00
|
|
|
if check, ok := a.checkHTTPs[checkID]; ok {
|
|
|
|
check.Stop()
|
|
|
|
delete(a.checkHTTPs, checkID)
|
|
|
|
}
|
2015-07-23 11:45:08 +00:00
|
|
|
if check, ok := a.checkTCPs[checkID]; ok {
|
|
|
|
check.Stop()
|
|
|
|
delete(a.checkTCPs, checkID)
|
|
|
|
}
|
2014-01-30 21:39:02 +00:00
|
|
|
if check, ok := a.checkTTLs[checkID]; ok {
|
|
|
|
check.Stop()
|
|
|
|
delete(a.checkTTLs, checkID)
|
|
|
|
}
|
2014-11-26 07:58:02 +00:00
|
|
|
if persist {
|
2015-06-05 23:57:14 +00:00
|
|
|
if err := a.purgeCheck(checkID); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := a.purgeCheckState(checkID); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-11-26 07:58:02 +00:00
|
|
|
}
|
2015-01-08 06:26:40 +00:00
|
|
|
log.Printf("[DEBUG] agent: removed check %q", checkID)
|
2014-11-26 07:58:02 +00:00
|
|
|
return nil
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// UpdateCheck is used to update the status of a check.
|
|
|
|
// This can only be used with checks of the TTL type.
|
2016-06-06 20:19:31 +00:00
|
|
|
func (a *Agent) UpdateCheck(checkID types.CheckID, status, output string) error {
|
2014-01-30 21:39:02 +00:00
|
|
|
a.checkLock.Lock()
|
|
|
|
defer a.checkLock.Unlock()
|
|
|
|
|
|
|
|
check, ok := a.checkTTLs[checkID]
|
|
|
|
if !ok {
|
2016-06-20 22:25:21 +00:00
|
|
|
return fmt.Errorf("CheckID %q does not have associated TTL", checkID)
|
2014-01-30 21:39:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Set the status through CheckTTL to reset the TTL
|
|
|
|
check.SetStatus(status, output)
|
2015-06-05 23:17:07 +00:00
|
|
|
|
2015-11-29 04:40:05 +00:00
|
|
|
if a.config.DevMode {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-06-05 23:17:07 +00:00
|
|
|
// Always persist the state for TTL checks
|
|
|
|
if err := a.persistCheckState(check, status, output); err != nil {
|
|
|
|
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-06-15 18:02:51 +00:00
|
|
|
// TranslateAddr is used to provide the final, translated address for a node,
|
|
|
|
// depending on how this agent and the other node are configured.
|
|
|
|
func (a *Agent) TranslateAddr(dc string, addr string, taggedAddr map[string]string) string {
|
|
|
|
if a.config.TranslateWanAddrs && (a.config.Datacenter != dc) {
|
|
|
|
wanAddr := taggedAddr["wan"]
|
|
|
|
if wanAddr != "" {
|
|
|
|
addr = wanAddr
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return addr
|
|
|
|
}
|
|
|
|
|
2015-06-05 23:17:07 +00:00
|
|
|
// persistCheckState is used to record the check status into the data dir.
|
|
|
|
// This allows the state to be restored on a later agent start. Currently
|
|
|
|
// only useful for TTL based checks.
|
|
|
|
func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
|
|
|
|
// Create the persisted state
|
|
|
|
state := persistedCheckState{
|
|
|
|
CheckID: check.CheckID,
|
|
|
|
Status: status,
|
|
|
|
Output: output,
|
|
|
|
Expires: time.Now().Add(check.TTL).Unix(),
|
|
|
|
}
|
|
|
|
|
|
|
|
// Encode the state
|
|
|
|
buf, err := json.Marshal(state)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the state dir if it doesn't exist
|
|
|
|
dir := filepath.Join(a.config.DataDir, checkStateDir)
|
|
|
|
if err := os.MkdirAll(dir, 0700); err != nil {
|
|
|
|
return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Write the state to the file
|
2016-06-06 08:53:30 +00:00
|
|
|
file := filepath.Join(dir, checkIDHash(check.CheckID))
|
2015-06-05 23:17:07 +00:00
|
|
|
if err := ioutil.WriteFile(file, buf, 0600); err != nil {
|
|
|
|
return fmt.Errorf("failed writing file %q: %s", file, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-06-08 16:35:10 +00:00
|
|
|
// loadCheckState is used to restore the persisted state of a check.
|
|
|
|
func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
|
2015-06-05 23:17:07 +00:00
|
|
|
// Try to read the persisted state for this check
|
2016-06-06 08:53:30 +00:00
|
|
|
file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID))
|
2015-06-05 23:17:07 +00:00
|
|
|
buf, err := ioutil.ReadFile(file)
|
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return fmt.Errorf("failed reading file %q: %s", file, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decode the state data
|
|
|
|
var p persistedCheckState
|
|
|
|
if err := json.Unmarshal(buf, &p); err != nil {
|
|
|
|
return fmt.Errorf("failed decoding check state: %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if the state has expired
|
2015-06-05 23:45:05 +00:00
|
|
|
if time.Now().Unix() >= p.Expires {
|
2015-06-05 23:17:07 +00:00
|
|
|
a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
|
2015-06-05 23:59:41 +00:00
|
|
|
return a.purgeCheckState(check.CheckID)
|
2015-06-05 23:17:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Restore the fields from the state
|
|
|
|
check.Output = p.Output
|
|
|
|
check.Status = p.Status
|
2014-01-30 21:39:02 +00:00
|
|
|
return nil
|
|
|
|
}
|
2014-02-24 00:42:39 +00:00
|
|
|
|
2015-06-05 23:57:14 +00:00
|
|
|
// purgeCheckState is used to purge the state of a check from the data dir
|
2016-06-06 20:19:31 +00:00
|
|
|
func (a *Agent) purgeCheckState(checkID types.CheckID) error {
|
2016-06-06 08:53:30 +00:00
|
|
|
file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID))
|
2015-06-05 23:57:14 +00:00
|
|
|
err := os.Remove(file)
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2014-02-24 00:42:39 +00:00
|
|
|
// Stats is used to get various debugging state from the sub-systems
|
|
|
|
func (a *Agent) Stats() map[string]map[string]string {
|
|
|
|
toString := func(v uint64) string {
|
|
|
|
return strconv.FormatUint(v, 10)
|
|
|
|
}
|
|
|
|
var stats map[string]map[string]string
|
|
|
|
if a.server != nil {
|
|
|
|
stats = a.server.Stats()
|
|
|
|
} else {
|
|
|
|
stats = a.client.Stats()
|
|
|
|
}
|
|
|
|
stats["agent"] = map[string]string{
|
|
|
|
"check_monitors": toString(uint64(len(a.checkMonitors))),
|
|
|
|
"check_ttls": toString(uint64(len(a.checkTTLs))),
|
|
|
|
"checks": toString(uint64(len(a.state.checks))),
|
|
|
|
"services": toString(uint64(len(a.state.services))),
|
|
|
|
}
|
2014-06-06 21:40:22 +00:00
|
|
|
|
|
|
|
revision := a.config.Revision
|
|
|
|
if len(revision) > 8 {
|
|
|
|
revision = revision[:8]
|
|
|
|
}
|
|
|
|
stats["build"] = map[string]string{
|
|
|
|
"revision": revision,
|
|
|
|
"version": a.config.Version,
|
|
|
|
"prerelease": a.config.VersionPrerelease,
|
|
|
|
}
|
2014-02-24 00:42:39 +00:00
|
|
|
return stats
|
|
|
|
}
|
2014-05-06 03:29:50 +00:00
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
// storePid is used to write out our PID to a file if necessary
|
2014-05-06 16:57:53 +00:00
|
|
|
func (a *Agent) storePid() error {
|
2014-05-06 19:43:33 +00:00
|
|
|
// Quit fast if no pidfile
|
2014-05-06 03:29:50 +00:00
|
|
|
pidPath := a.config.PidFile
|
2014-05-06 19:43:33 +00:00
|
|
|
if pidPath == "" {
|
|
|
|
return nil
|
|
|
|
}
|
2014-05-06 03:29:50 +00:00
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
// Open the PID file
|
|
|
|
pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Could not open pid file: %v", err)
|
2014-05-06 03:29:50 +00:00
|
|
|
}
|
2014-05-06 19:43:33 +00:00
|
|
|
defer pidFile.Close()
|
2014-05-06 16:57:53 +00:00
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
// Write out the PID
|
|
|
|
pid := os.Getpid()
|
|
|
|
_, err = pidFile.WriteString(fmt.Sprintf("%d", pid))
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Could not write to pid file: %s", err)
|
|
|
|
}
|
2014-05-06 16:57:53 +00:00
|
|
|
return nil
|
2014-05-06 03:29:50 +00:00
|
|
|
}
|
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
// deletePid is used to delete our PID on exit
|
2014-05-06 16:57:53 +00:00
|
|
|
func (a *Agent) deletePid() error {
|
2014-05-06 19:43:33 +00:00
|
|
|
// Quit fast if no pidfile
|
2014-05-06 03:29:50 +00:00
|
|
|
pidPath := a.config.PidFile
|
2014-05-06 19:43:33 +00:00
|
|
|
if pidPath == "" {
|
|
|
|
return nil
|
|
|
|
}
|
2014-05-06 03:29:50 +00:00
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
stat, err := os.Stat(pidPath)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Could not remove pid file: %s", err)
|
|
|
|
}
|
2014-05-06 03:29:50 +00:00
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
if stat.IsDir() {
|
|
|
|
return fmt.Errorf("Specified pid file path is directory")
|
2014-05-06 03:29:50 +00:00
|
|
|
}
|
2014-05-06 16:57:53 +00:00
|
|
|
|
2014-05-06 19:43:33 +00:00
|
|
|
err = os.Remove(pidPath)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Could not remove pid file: %s", err)
|
|
|
|
}
|
2014-05-06 16:57:53 +00:00
|
|
|
return nil
|
2014-05-06 03:29:50 +00:00
|
|
|
}
|
2014-11-26 07:58:02 +00:00
|
|
|
|
2015-01-08 02:05:46 +00:00
|
|
|
// loadServices will load service definitions from configuration and persisted
|
|
|
|
// definitions on disk, and load them into the local agent.
|
|
|
|
func (a *Agent) loadServices(conf *Config) error {
|
2014-11-26 07:58:02 +00:00
|
|
|
// Register the services from config
|
|
|
|
for _, service := range conf.Services {
|
|
|
|
ns := service.NodeService()
|
2015-01-14 01:52:17 +00:00
|
|
|
chkTypes := service.CheckTypes()
|
2015-05-05 00:36:17 +00:00
|
|
|
if err := a.AddService(ns, chkTypes, false, service.Token); err != nil {
|
2014-11-26 07:58:02 +00:00
|
|
|
return fmt.Errorf("Failed to register service '%s': %v", service.ID, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load any persisted services
|
2015-01-08 05:24:47 +00:00
|
|
|
svcDir := filepath.Join(a.config.DataDir, servicesDir)
|
2015-06-04 21:33:30 +00:00
|
|
|
files, err := ioutil.ReadDir(svcDir)
|
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err)
|
2014-11-26 07:58:02 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
for _, fi := range files {
|
|
|
|
// Skip all dirs
|
|
|
|
if fi.IsDir() {
|
|
|
|
continue
|
|
|
|
}
|
2014-11-26 07:58:02 +00:00
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
// Open the file for reading
|
|
|
|
file := filepath.Join(svcDir, fi.Name())
|
|
|
|
fh, err := os.Open(file)
|
2015-01-08 05:24:47 +00:00
|
|
|
if err != nil {
|
2015-06-04 21:33:30 +00:00
|
|
|
return fmt.Errorf("failed opening service file %q: %s", file, err)
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
|
|
|
|
// Read the contents into a buffer
|
|
|
|
buf, err := ioutil.ReadAll(fh)
|
|
|
|
fh.Close()
|
2015-01-08 05:24:47 +00:00
|
|
|
if err != nil {
|
2015-06-04 21:33:30 +00:00
|
|
|
return fmt.Errorf("failed reading service file %q: %s", file, err)
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
// Try decoding the service definition
|
|
|
|
var p persistedService
|
|
|
|
if err := json.Unmarshal(buf, &p); err != nil {
|
2015-04-28 19:18:41 +00:00
|
|
|
// Backwards-compatibility for pre-0.5.1 persisted services
|
2015-06-04 21:33:30 +00:00
|
|
|
if err := json.Unmarshal(buf, &p.Service); err != nil {
|
|
|
|
return fmt.Errorf("failed decoding service file %q: %s", file, err)
|
2015-04-28 19:18:41 +00:00
|
|
|
}
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
serviceID := p.Service.ID
|
2015-01-08 05:24:47 +00:00
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
if _, ok := a.state.services[serviceID]; ok {
|
2015-01-08 05:24:47 +00:00
|
|
|
// Purge previously persisted service. This allows config to be
|
|
|
|
// preferred over services persisted from the API.
|
2015-01-08 06:26:40 +00:00
|
|
|
a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q",
|
2015-06-04 21:33:30 +00:00
|
|
|
serviceID, file)
|
|
|
|
if err := a.purgeService(serviceID); err != nil {
|
|
|
|
return fmt.Errorf("failed purging service %q: %s", serviceID, err)
|
|
|
|
}
|
2015-01-08 05:24:47 +00:00
|
|
|
} else {
|
2015-01-08 06:26:40 +00:00
|
|
|
a.logger.Printf("[DEBUG] agent: restored service definition %q from %q",
|
2015-06-04 21:33:30 +00:00
|
|
|
serviceID, file)
|
|
|
|
if err := a.AddService(p.Service, nil, false, p.Token); err != nil {
|
|
|
|
return fmt.Errorf("failed adding service %q: %s", serviceID, err)
|
|
|
|
}
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
}
|
2015-01-08 05:24:47 +00:00
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
return nil
|
2014-11-26 07:58:02 +00:00
|
|
|
}
|
|
|
|
|
2015-01-08 02:05:46 +00:00
|
|
|
// unloadServices will deregister all services other than the 'consul' service
|
|
|
|
// known to the local agent.
|
|
|
|
func (a *Agent) unloadServices() error {
|
|
|
|
for _, service := range a.state.Services() {
|
|
|
|
if service.ID == consul.ConsulServiceID {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if err := a.RemoveService(service.ID, false); err != nil {
|
|
|
|
return fmt.Errorf("Failed deregistering service '%s': %v", service.ID, err)
|
2014-11-26 07:58:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-08 02:05:46 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// loadChecks loads check definitions and/or persisted check definitions from
|
|
|
|
// disk and re-registers them with the local agent.
|
|
|
|
func (a *Agent) loadChecks(conf *Config) error {
|
2014-11-26 07:58:02 +00:00
|
|
|
// Register the checks from config
|
|
|
|
for _, check := range conf.Checks {
|
|
|
|
health := check.HealthCheck(conf.NodeName)
|
|
|
|
chkType := &check.CheckType
|
2015-05-05 00:36:17 +00:00
|
|
|
if err := a.AddCheck(health, chkType, false, check.Token); err != nil {
|
2014-11-26 07:58:02 +00:00
|
|
|
return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load any persisted checks
|
2015-01-08 05:24:47 +00:00
|
|
|
checkDir := filepath.Join(a.config.DataDir, checksDir)
|
2015-06-04 21:33:30 +00:00
|
|
|
files, err := ioutil.ReadDir(checkDir)
|
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err)
|
2014-11-26 07:58:02 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
for _, fi := range files {
|
|
|
|
// Ignore dirs - we only care about the check definition files
|
|
|
|
if fi.IsDir() {
|
|
|
|
continue
|
|
|
|
}
|
2014-11-26 07:58:02 +00:00
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
// Open the file for reading
|
|
|
|
file := filepath.Join(checkDir, fi.Name())
|
|
|
|
fh, err := os.Open(file)
|
2015-01-08 05:24:47 +00:00
|
|
|
if err != nil {
|
2015-06-04 21:33:30 +00:00
|
|
|
return fmt.Errorf("Failed opening check file %q: %s", file, err)
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
|
|
|
|
// Read the contents into a buffer
|
|
|
|
buf, err := ioutil.ReadAll(fh)
|
|
|
|
fh.Close()
|
2015-01-08 05:24:47 +00:00
|
|
|
if err != nil {
|
2015-06-04 21:33:30 +00:00
|
|
|
return fmt.Errorf("failed reading check file %q: %s", file, err)
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
// Decode the check
|
2015-01-08 05:24:47 +00:00
|
|
|
var p persistedCheck
|
2015-06-04 21:33:30 +00:00
|
|
|
if err := json.Unmarshal(buf, &p); err != nil {
|
|
|
|
return fmt.Errorf("Failed decoding check file %q: %s", file, err)
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
checkID := p.Check.CheckID
|
2015-01-08 05:24:47 +00:00
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
if _, ok := a.state.checks[checkID]; ok {
|
2015-01-08 05:24:47 +00:00
|
|
|
// Purge previously persisted check. This allows config to be
|
|
|
|
// preferred over persisted checks from the API.
|
2015-01-08 06:26:40 +00:00
|
|
|
a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q",
|
2015-06-04 21:33:30 +00:00
|
|
|
checkID, file)
|
|
|
|
if err := a.purgeCheck(checkID); err != nil {
|
|
|
|
return fmt.Errorf("Failed purging check %q: %s", checkID, err)
|
|
|
|
}
|
2015-01-08 05:24:47 +00:00
|
|
|
} else {
|
|
|
|
// Default check to critical to avoid placing potentially unhealthy
|
|
|
|
// services into the active pool
|
|
|
|
p.Check.Status = structs.HealthCritical
|
|
|
|
|
2015-05-05 00:36:17 +00:00
|
|
|
if err := a.AddCheck(p.Check, p.ChkType, false, p.Token); err != nil {
|
2015-03-11 23:13:19 +00:00
|
|
|
// Purge the check if it is unable to be restored.
|
|
|
|
a.logger.Printf("[WARN] agent: Failed to restore check %q: %s",
|
2015-06-04 21:33:30 +00:00
|
|
|
checkID, err)
|
|
|
|
if err := a.purgeCheck(checkID); err != nil {
|
|
|
|
return fmt.Errorf("Failed purging check %q: %s", checkID, err)
|
|
|
|
}
|
2015-03-11 23:13:19 +00:00
|
|
|
}
|
2015-01-08 06:26:40 +00:00
|
|
|
a.logger.Printf("[DEBUG] agent: restored health check %q from %q",
|
2015-06-04 21:33:30 +00:00
|
|
|
p.Check.CheckID, file)
|
2015-01-08 05:24:47 +00:00
|
|
|
}
|
2015-06-04 21:33:30 +00:00
|
|
|
}
|
2015-01-08 05:24:47 +00:00
|
|
|
|
2015-06-04 21:33:30 +00:00
|
|
|
return nil
|
2014-11-26 07:58:02 +00:00
|
|
|
}
|
2015-01-08 02:05:46 +00:00
|
|
|
|
|
|
|
// unloadChecks will deregister all checks known to the local agent.
|
|
|
|
func (a *Agent) unloadChecks() error {
|
|
|
|
for _, check := range a.state.Checks() {
|
|
|
|
if err := a.RemoveCheck(check.CheckID, false); err != nil {
|
|
|
|
return fmt.Errorf("Failed deregistering check '%s': %s", check.CheckID, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
2015-01-15 08:16:34 +00:00
|
|
|
|
2015-02-17 20:00:04 +00:00
|
|
|
// snapshotCheckState is used to snapshot the current state of the health
|
|
|
|
// checks. This is done before we reload our checks, so that we can properly
|
|
|
|
// restore into the same state.
|
2016-06-06 20:19:31 +00:00
|
|
|
func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
|
2015-02-17 20:00:04 +00:00
|
|
|
return a.state.Checks()
|
|
|
|
}
|
|
|
|
|
|
|
|
// restoreCheckState is used to reset the health state based on a snapshot.
|
|
|
|
// This is done after we finish the reload to avoid any unnecessary flaps
|
|
|
|
// in health state and potential session invalidations.
|
2016-06-06 20:19:31 +00:00
|
|
|
func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
|
2015-02-17 20:00:04 +00:00
|
|
|
for id, check := range snap {
|
|
|
|
a.state.UpdateCheck(id, check.Status, check.Output)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-15 20:20:57 +00:00
|
|
|
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
2016-06-06 20:19:31 +00:00
|
|
|
func serviceMaintCheckID(serviceID string) types.CheckID {
|
|
|
|
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
|
2015-01-15 20:20:57 +00:00
|
|
|
}
|
|
|
|
|
2015-01-15 08:25:36 +00:00
|
|
|
// EnableServiceMaintenance will register a false health check against the given
|
|
|
|
// service ID with critical status. This will exclude the service from queries.
|
2015-09-10 18:43:59 +00:00
|
|
|
func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error {
|
2015-01-15 18:51:00 +00:00
|
|
|
service, ok := a.state.Services()[serviceID]
|
|
|
|
if !ok {
|
2015-01-15 08:16:34 +00:00
|
|
|
return fmt.Errorf("No service registered with ID %q", serviceID)
|
|
|
|
}
|
|
|
|
|
2015-01-15 20:20:57 +00:00
|
|
|
// Check if maintenance mode is not already enabled
|
|
|
|
checkID := serviceMaintCheckID(serviceID)
|
|
|
|
if _, ok := a.state.Checks()[checkID]; ok {
|
2015-01-15 18:51:00 +00:00
|
|
|
return nil
|
2015-01-15 08:16:34 +00:00
|
|
|
}
|
|
|
|
|
2015-01-21 20:21:57 +00:00
|
|
|
// Use default notes if no reason provided
|
|
|
|
if reason == "" {
|
2015-01-21 22:45:09 +00:00
|
|
|
reason = defaultServiceMaintReason
|
2015-01-21 20:21:57 +00:00
|
|
|
}
|
|
|
|
|
2015-01-15 08:16:34 +00:00
|
|
|
// Create and register the critical health check
|
|
|
|
check := &structs.HealthCheck{
|
|
|
|
Node: a.config.NodeName,
|
2015-01-15 20:20:57 +00:00
|
|
|
CheckID: checkID,
|
2015-01-15 08:16:34 +00:00
|
|
|
Name: "Service Maintenance Mode",
|
2015-01-21 20:21:57 +00:00
|
|
|
Notes: reason,
|
2015-01-15 08:16:34 +00:00
|
|
|
ServiceID: service.ID,
|
|
|
|
ServiceName: service.Service,
|
|
|
|
Status: structs.HealthCritical,
|
|
|
|
}
|
2015-09-10 18:43:59 +00:00
|
|
|
a.AddCheck(check, nil, true, token)
|
2015-01-22 19:14:28 +00:00
|
|
|
a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID)
|
2015-01-15 08:16:34 +00:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-01-15 08:25:36 +00:00
|
|
|
// DisableServiceMaintenance will deregister the fake maintenance mode check
|
|
|
|
// if the service has been marked as in maintenance.
|
2015-01-15 08:16:34 +00:00
|
|
|
func (a *Agent) DisableServiceMaintenance(serviceID string) error {
|
2015-01-15 18:51:00 +00:00
|
|
|
if _, ok := a.state.Services()[serviceID]; !ok {
|
2015-01-15 08:16:34 +00:00
|
|
|
return fmt.Errorf("No service registered with ID %q", serviceID)
|
|
|
|
}
|
|
|
|
|
2015-01-15 20:20:57 +00:00
|
|
|
// Check if maintenance mode is enabled
|
|
|
|
checkID := serviceMaintCheckID(serviceID)
|
|
|
|
if _, ok := a.state.Checks()[checkID]; !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-01-15 08:16:34 +00:00
|
|
|
// Deregister the maintenance check
|
2015-01-15 20:20:57 +00:00
|
|
|
a.RemoveCheck(checkID, true)
|
2015-01-22 19:14:28 +00:00
|
|
|
a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID)
|
2015-01-15 20:20:57 +00:00
|
|
|
|
2015-01-15 08:16:34 +00:00
|
|
|
return nil
|
|
|
|
}
|
2015-01-15 19:20:22 +00:00
|
|
|
|
|
|
|
// EnableNodeMaintenance places a node into maintenance mode.
|
2015-09-10 18:43:59 +00:00
|
|
|
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
2015-01-15 19:20:22 +00:00
|
|
|
// Ensure node maintenance is not already enabled
|
|
|
|
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-01-21 20:21:57 +00:00
|
|
|
// Use a default notes value
|
|
|
|
if reason == "" {
|
2015-01-21 22:45:09 +00:00
|
|
|
reason = defaultNodeMaintReason
|
2015-01-21 20:21:57 +00:00
|
|
|
}
|
|
|
|
|
2015-01-15 19:20:22 +00:00
|
|
|
// Create and register the node maintenance check
|
|
|
|
check := &structs.HealthCheck{
|
|
|
|
Node: a.config.NodeName,
|
|
|
|
CheckID: nodeMaintCheckID,
|
|
|
|
Name: "Node Maintenance Mode",
|
2015-01-21 20:21:57 +00:00
|
|
|
Notes: reason,
|
2015-01-15 19:20:22 +00:00
|
|
|
Status: structs.HealthCritical,
|
|
|
|
}
|
2015-09-10 18:43:59 +00:00
|
|
|
a.AddCheck(check, nil, true, token)
|
2015-01-22 19:14:28 +00:00
|
|
|
a.logger.Printf("[INFO] agent: Node entered maintenance mode")
|
2015-01-15 19:20:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// DisableNodeMaintenance removes a node from maintenance mode
|
|
|
|
func (a *Agent) DisableNodeMaintenance() {
|
2015-01-15 20:20:57 +00:00
|
|
|
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
|
|
|
|
return
|
|
|
|
}
|
2015-01-15 19:20:22 +00:00
|
|
|
a.RemoveCheck(nodeMaintCheckID, true)
|
2015-01-22 19:14:28 +00:00
|
|
|
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
2015-01-15 19:20:22 +00:00
|
|
|
}
|
2015-11-12 17:19:33 +00:00
|
|
|
|
|
|
|
// InjectEndpoint overrides the given endpoint with a substitute one. Note
|
|
|
|
// that not all agent methods use this mechanism, and that is should only
|
|
|
|
// be used for testing.
|
|
|
|
func (a *Agent) InjectEndpoint(endpoint string, handler interface{}) error {
|
|
|
|
if a.server == nil {
|
|
|
|
return fmt.Errorf("agent must be a server")
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := a.server.InjectEndpoint(handler); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
name := reflect.Indirect(reflect.ValueOf(handler)).Type().Name()
|
|
|
|
a.endpoints[endpoint] = name
|
|
|
|
|
|
|
|
a.logger.Printf("[WARN] agent: endpoint injected; this should only be used for testing")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// getEndpoint returns the endpoint name to use for the given endpoint,
|
|
|
|
// which may be overridden.
|
|
|
|
func (a *Agent) getEndpoint(endpoint string) string {
|
|
|
|
if override, ok := a.endpoints[endpoint]; ok {
|
|
|
|
return override
|
|
|
|
}
|
|
|
|
return endpoint
|
|
|
|
}
|