consul/agent/agent.go

2388 lines
69 KiB
Go

package agent
import (
"context"
"crypto/sha512"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"net"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/systemd"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/ipaddr"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/logger"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/consul/watch"
"github.com/hashicorp/go-uuid"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
"github.com/shirou/gopsutil/host"
)
const (
// Path to save agent service definitions
servicesDir = "services"
// Path to save local agent checks
checksDir = "checks"
checkStateDir = "checks/state"
// Default reasons for node/service maintenance mode
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
"but no reason was provided. This is a default message."
defaultServiceMaintReason = "Maintenance mode is enabled for this " +
"service, but no reason was provided. This is a default message."
)
// delegate defines the interface shared by both
// consul.Client and consul.Server.
type delegate interface {
Encrypted() bool
GetLANCoordinate() (lib.CoordinateSet, error)
Leave() error
LANMembers() []serf.Member
LANMembersAllSegments() ([]serf.Member, error)
LANSegmentMembers(segment string) ([]serf.Member, error)
LocalMember() serf.Member
JoinLAN(addrs []string) (n int, err error)
RemoveFailedNode(node string) error
RPC(method string, args interface{}, reply interface{}) error
SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer, replyFn structs.SnapshotReplyFn) error
Shutdown() error
Stats() map[string]map[string]string
}
// notifier is called after a successful JoinLAN.
type notifier interface {
Notify(string) error
}
// The agent is the long running process that is run on every machine.
// It exposes an RPC interface that is used by the CLI to control the
// agent. The agent runs the query interfaces like HTTP, DNS, and RPC.
// However, it can run in either a client, or server mode. In server
// mode, it runs a full Consul server. In client-only mode, it only forwards
// requests to other Consul servers.
type Agent struct {
// config is the agent configuration.
config *config.RuntimeConfig
// Used for writing our logs
logger *log.Logger
// Output sink for logs
LogOutput io.Writer
// Used for streaming logs to
LogWriter *logger.LogWriter
// In-memory sink used for collecting metrics
MemSink *metrics.InmemSink
// delegate is either a *consul.Server or *consul.Client
// depending on the configuration
delegate delegate
// acls is an object that helps manage local ACL enforcement.
acls *aclManager
// state stores a local representation of the node,
// services and checks. Used for anti-entropy.
state *localState
// checkReapAfter maps the check ID to a timeout after which we should
// reap its associated service
checkReapAfter map[types.CheckID]time.Duration
// checkMonitors maps the check ID to an associated monitor
checkMonitors map[types.CheckID]*CheckMonitor
// checkHTTPs maps the check ID to an associated HTTP check
checkHTTPs map[types.CheckID]*CheckHTTP
// checkTCPs maps the check ID to an associated TCP check
checkTCPs map[types.CheckID]*CheckTCP
// checkTTLs maps the check ID to an associated check TTL
checkTTLs map[types.CheckID]*CheckTTL
// checkDockers maps the check ID to an associated Docker Exec based check
checkDockers map[types.CheckID]*CheckDocker
// checkLock protects updates to the check* maps
checkLock sync.Mutex
// dockerClient is the client for performing docker health checks.
dockerClient *DockerClient
// eventCh is used to receive user events
eventCh chan serf.UserEvent
// eventBuf stores the most recent events in a ring buffer
// using eventIndex as the next index to insert into. This
// is guarded by eventLock. When an insert happens, the
// eventNotify group is notified.
eventBuf []*UserEvent
eventIndex int
eventLock sync.RWMutex
eventNotify NotifyGroup
reloadCh chan chan error
shutdown bool
shutdownCh chan struct{}
shutdownLock sync.Mutex
// joinLANNotifier is called after a successful JoinLAN.
joinLANNotifier notifier
// retryJoinCh transports errors from the retry join
// attempts.
retryJoinCh chan error
// endpoints maps unique RPC endpoint names to common ones
// to allow overriding of RPC handlers since the golang
// net/rpc server does not allow this.
endpoints map[string]string
endpointsLock sync.RWMutex
// dnsServer provides the DNS API
dnsServers []*DNSServer
// httpServers provides the HTTP API on various endpoints
httpServers []*HTTPServer
// wgServers is the wait group for all HTTP and DNS servers
wgServers sync.WaitGroup
// watchPlans tracks all the currently-running watch plans for the
// agent.
watchPlans []*watch.Plan
// tokens holds ACL tokens initially from the configuration, but can
// be updated at runtime, so should always be used instead of going to
// the configuration directly.
tokens *token.Store
}
func New(c *config.RuntimeConfig) (*Agent, error) {
if c.Datacenter == "" {
return nil, fmt.Errorf("Must configure a Datacenter")
}
if c.DataDir == "" && !c.DevMode {
return nil, fmt.Errorf("Must configure a DataDir")
}
acls, err := newACLManager(c)
if err != nil {
return nil, err
}
a := &Agent{
config: c,
acls: acls,
checkReapAfter: make(map[types.CheckID]time.Duration),
checkMonitors: make(map[types.CheckID]*CheckMonitor),
checkTTLs: make(map[types.CheckID]*CheckTTL),
checkHTTPs: make(map[types.CheckID]*CheckHTTP),
checkTCPs: make(map[types.CheckID]*CheckTCP),
checkDockers: make(map[types.CheckID]*CheckDocker),
eventCh: make(chan serf.UserEvent, 1024),
eventBuf: make([]*UserEvent, 256),
joinLANNotifier: &systemd.Notifier{},
reloadCh: make(chan chan error),
retryJoinCh: make(chan error),
shutdownCh: make(chan struct{}),
endpoints: make(map[string]string),
tokens: new(token.Store),
}
// Set up the initial state of the token store based on the config.
a.tokens.UpdateUserToken(a.config.ACLToken)
a.tokens.UpdateAgentToken(a.config.ACLAgentToken)
a.tokens.UpdateAgentMasterToken(a.config.ACLAgentMasterToken)
a.tokens.UpdateACLReplicationToken(a.config.ACLReplicationToken)
return a, nil
}
func (a *Agent) Start() error {
c := a.config
logOutput := a.LogOutput
if a.logger == nil {
if logOutput == nil {
logOutput = os.Stderr
}
a.logger = log.New(logOutput, "", log.LstdFlags)
}
// Retrieve or generate the node ID before setting up the rest of the
// agent, which depends on it.
if err := a.setupNodeID(c); err != nil {
return fmt.Errorf("Failed to setup node ID: %v", err)
}
// create the local state
a.state = NewLocalState(c, a.logger, a.tokens)
// create the config for the rpc server/client
consulCfg, err := a.consulConfig()
if err != nil {
return err
}
// link consul client/server with the state
consulCfg.ServerUp = a.state.ConsulServerUp
// Setup either the client or the server.
if c.ServerMode {
server, err := consul.NewServerLogger(consulCfg, a.logger, a.tokens)
if err != nil {
return fmt.Errorf("Failed to start Consul server: %v", err)
}
a.delegate = server
a.state.delegate = server
} else {
client, err := consul.NewClientLogger(consulCfg, a.logger)
if err != nil {
return fmt.Errorf("Failed to start Consul client: %v", err)
}
a.delegate = client
a.state.delegate = client
}
// Load checks/services/metadata.
if err := a.loadServices(c); err != nil {
return err
}
if err := a.loadChecks(c); err != nil {
return err
}
if err := a.loadMetadata(c); err != nil {
return err
}
// Start watching for critical services to deregister, based on their
// checks.
go a.reapServices()
// Start handling events.
go a.handleEvents()
// Start sending network coordinate to the server.
if !c.DisableCoordinates {
go a.sendCoordinate()
}
// Write out the PID file if necessary.
if err := a.storePid(); err != nil {
return err
}
// start DNS servers
if err := a.listenAndServeDNS(); err != nil {
return err
}
// create listeners and unstarted servers
// see comment on listenHTTP why we are doing this
httpln, err := a.listenHTTP()
if err != nil {
return err
}
// start HTTP and HTTPS servers
for _, l := range httpln {
srv := NewHTTPServer(l.Addr(), a)
if err := a.serveHTTP(l, srv); err != nil {
return err
}
a.httpServers = append(a.httpServers, srv)
}
// register watches
if err := a.reloadWatches(a.config); err != nil {
return err
}
// start retry join
go a.retryJoinLAN()
go a.retryJoinWAN()
return nil
}
func (a *Agent) listenAndServeDNS() error {
notif := make(chan net.Addr, len(a.config.DNSAddrs))
for _, addr := range a.config.DNSAddrs {
// create server
s, err := NewDNSServer(a)
if err != nil {
return err
}
a.dnsServers = append(a.dnsServers, s)
// start server
a.wgServers.Add(1)
go func(addr net.Addr) {
defer a.wgServers.Done()
err := s.ListenAndServe(addr.Network(), addr.String(), func() { notif <- addr })
if err != nil && !strings.Contains(err.Error(), "accept") {
a.logger.Printf("[ERR] agent: Error starting DNS server %s (%s): %v", addr.String(), addr.Network(), err)
}
}(addr)
}
// wait for servers to be up
timeout := time.After(time.Second)
for range a.config.DNSAddrs {
select {
case addr := <-notif:
a.logger.Printf("[INFO] agent: Started DNS server %s (%s)", addr.String(), addr.Network())
continue
case <-timeout:
return fmt.Errorf("agent: timeout starting DNS servers")
}
}
return nil
}
// listenHTTP binds listeners to the provided addresses and also returns
// pre-configured HTTP servers which are not yet started. The motivation is
// that in the current startup/shutdown setup we de-couple the listener
// creation from the server startup assuming that if any of the listeners
// cannot be bound we fail immediately and later failures do not occur.
// Therefore, starting a server with a running listener is assumed to not
// produce an error.
//
// The second motivation is that an HTTPS server needs to use the same TLSConfig
// on both the listener and the HTTP server. When listeners and servers are
// created at different times this becomes difficult to handle without keeping
// the TLS configuration somewhere or recreating it.
//
// This approach should ultimately be refactored to the point where we just
// start the server and any error should trigger a proper shutdown of the agent.
func (a *Agent) listenHTTP() ([]net.Listener, error) {
var ln []net.Listener
start := func(proto string, addrs []net.Addr) error {
for _, addr := range addrs {
var l net.Listener
var err error
switch x := addr.(type) {
case *net.UnixAddr:
l, err = a.listenSocket(x.Name)
if err != nil {
return err
}
case *net.TCPAddr:
l, err = net.Listen("tcp", x.String())
if err != nil {
return err
}
l = &tcpKeepAliveListener{l.(*net.TCPListener)}
if proto == "https" {
tlscfg, err := a.config.IncomingHTTPSConfig()
if err != nil {
return err
}
l = tls.NewListener(l, tlscfg)
}
default:
return fmt.Errorf("unsupported address type %T", addr)
}
ln = append(ln, l)
}
return nil
}
if err := start("http", a.config.HTTPAddrs); err != nil {
for _, l := range ln {
l.Close()
}
return nil, err
}
if err := start("https", a.config.HTTPSAddrs); err != nil {
for _, l := range ln {
l.Close()
}
return nil, err
}
return ln, nil
}
// tcpKeepAliveListener sets TCP keep-alive timeouts on accepted
// connections. It's used by NewHttpServer so dead TCP connections
// eventually go away.
type tcpKeepAliveListener struct {
*net.TCPListener
}
func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) {
tc, err := ln.AcceptTCP()
if err != nil {
return
}
tc.SetKeepAlive(true)
tc.SetKeepAlivePeriod(30 * time.Second)
return tc, nil
}
func (a *Agent) listenSocket(path string) (net.Listener, error) {
if _, err := os.Stat(path); !os.IsNotExist(err) {
a.logger.Printf("[WARN] agent: Replacing socket %q", path)
}
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
return nil, fmt.Errorf("error removing socket file: %s", err)
}
l, err := net.Listen("unix", path)
if err != nil {
return nil, err
}
user, group, mode := a.config.UnixSocketUser, a.config.UnixSocketGroup, a.config.UnixSocketMode
if err := setFilePermissions(path, user, group, mode); err != nil {
return nil, fmt.Errorf("Failed setting up socket: %s", err)
}
return l, nil
}
func (a *Agent) serveHTTP(l net.Listener, srv *HTTPServer) error {
// https://github.com/golang/go/issues/20239
//
// In go.8.1 there is a race between Serve and Shutdown. If
// Shutdown is called before the Serve go routine was scheduled then
// the Serve go routine never returns. This deadlocks the agent
// shutdown for some tests since it will wait forever.
//
// Since we need to check for an unexported type (*tls.listener)
// we cannot just perform a type check since the compiler won't let
// us. We might be able to use reflection but the fmt.Sprintf() hack
// works just as well.
srv.proto = "http"
if strings.Contains("*tls.listener", fmt.Sprintf("%T", l)) {
srv.proto = "https"
}
notif := make(chan net.Addr)
a.wgServers.Add(1)
go func() {
defer a.wgServers.Done()
notif <- l.Addr()
err := srv.Serve(l)
if err != nil && err != http.ErrServerClosed {
a.logger.Print(err)
}
}()
select {
case addr := <-notif:
if srv.proto == "https" {
a.logger.Printf("[INFO] agent: Started HTTPS server on %s (%s)", addr.String(), addr.Network())
} else {
a.logger.Printf("[INFO] agent: Started HTTP server on %s (%s)", addr.String(), addr.Network())
}
return nil
case <-time.After(time.Second):
return fmt.Errorf("agent: timeout starting HTTP servers")
}
}
// reloadWatches stops any existing watch plans and attempts to load the given
// set of watches.
func (a *Agent) reloadWatches(cfg *config.RuntimeConfig) error {
// Stop the current watches.
for _, wp := range a.watchPlans {
wp.Stop()
}
a.watchPlans = nil
// Return if there are no watches now.
if len(cfg.Watches) == 0 {
return nil
}
// Watches use the API to talk to this agent, so that must be enabled.
if len(cfg.HTTPAddrs) == 0 && len(cfg.HTTPSAddrs) == 0 {
return fmt.Errorf("watch plans require an HTTP or HTTPS endpoint")
}
// Compile the watches
var watchPlans []*watch.Plan
for _, params := range cfg.Watches {
// Parse the watches, excluding the handler
wp, err := watch.ParseExempt(params, []string{"handler", "args"})
if err != nil {
return fmt.Errorf("Failed to parse watch (%#v): %v", params, err)
}
// Get the handler and subprocess arguments
handler, hasHandler := wp.Exempt["handler"]
args, hasArgs := wp.Exempt["args"]
if hasHandler {
a.logger.Printf("[WARN] agent: The 'handler' field in watches has been deprecated " +
"and replaced with the 'args' field. See https://www.consul.io/docs/agent/watches.html")
}
if _, ok := handler.(string); hasHandler && !ok {
return fmt.Errorf("Watch handler must be a string")
}
if raw, ok := args.([]interface{}); hasArgs && ok {
var parsed []string
for _, arg := range raw {
if v, ok := arg.(string); !ok {
return fmt.Errorf("Watch args must be a list of strings")
} else {
parsed = append(parsed, v)
}
}
wp.Exempt["args"] = parsed
} else if hasArgs && !ok {
return fmt.Errorf("Watch args must be a list of strings")
}
if hasHandler && hasArgs {
return fmt.Errorf("Cannot define both watch handler and args")
}
if !hasHandler && !hasArgs {
return fmt.Errorf("Must define either watch handler or args")
}
// Store the watch plan
watchPlans = append(watchPlans, wp)
}
// Determine the primary http(s) endpoint.
var netaddr net.Addr
if len(cfg.HTTPAddrs) > 0 {
netaddr = cfg.HTTPAddrs[0]
} else {
netaddr = cfg.HTTPSAddrs[0]
}
addr := netaddr.String()
if netaddr.Network() == "unix" {
addr = "unix://" + addr
}
// Fire off a goroutine for each new watch plan.
for _, wp := range watchPlans {
a.watchPlans = append(a.watchPlans, wp)
go func(wp *watch.Plan) {
var handler interface{}
if h, ok := wp.Exempt["handler"]; ok {
handler = h
} else {
handler = wp.Exempt["args"]
}
wp.Handler = makeWatchHandler(a.LogOutput, handler)
wp.LogOutput = a.LogOutput
if err := wp.Run(addr); err != nil {
a.logger.Printf("[ERR] Failed to run watch: %v", err)
}
}(wp)
}
return nil
}
// consulConfig is used to return a consul configuration
func (a *Agent) consulConfig() (*consul.Config, error) {
// Start with the provided config or default config
base := consul.DefaultConfig()
// This is set when the agent starts up
base.NodeID = a.config.NodeID
// Apply dev mode
base.DevMode = a.config.DevMode
// Override with our config
// todo(fs): these are now always set in the runtime config so we can simplify this
// todo(fs): or is there a reason to keep it like that?
base.Datacenter = a.config.Datacenter
base.DataDir = a.config.DataDir
base.NodeName = a.config.NodeName
base.CoordinateUpdateBatchSize = a.config.ConsulCoordinateUpdateBatchSize
base.CoordinateUpdateMaxBatches = a.config.ConsulCoordinateUpdateMaxBatches
base.CoordinateUpdatePeriod = a.config.ConsulCoordinateUpdatePeriod
base.RaftConfig.HeartbeatTimeout = a.config.ConsulRaftHeartbeatTimeout
base.RaftConfig.LeaderLeaseTimeout = a.config.ConsulRaftLeaderLeaseTimeout
base.RaftConfig.ElectionTimeout = a.config.ConsulRaftElectionTimeout
base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrLAN.IP.String()
base.SerfLANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrLAN.Port
base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrLAN.IP.String()
base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port
base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfLANGossipInterval
base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfLANProbeInterval
base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfLANProbeTimeout
base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfLANSuspicionMult
base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String()
base.SerfWANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrWAN.Port
base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrWAN.IP.String()
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port
base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfWANGossipInterval
base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfWANProbeInterval
base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfWANProbeTimeout
base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfWANSuspicionMult
base.RPCAddr = a.config.RPCBindAddr
base.RPCAdvertise = a.config.RPCAdvertiseAddr
if a.config.ReconnectTimeoutLAN != 0 {
base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLAN
}
if a.config.ReconnectTimeoutWAN != 0 {
base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWAN
}
base.Segment = a.config.SegmentName
if len(a.config.Segments) > 0 {
segments, err := a.segmentConfig()
if err != nil {
return nil, err
}
base.Segments = segments
}
if a.config.Bootstrap {
base.Bootstrap = true
}
if a.config.RejoinAfterLeave {
base.RejoinAfterLeave = true
}
if a.config.BootstrapExpect != 0 {
base.BootstrapExpect = a.config.BootstrapExpect
}
if a.config.RPCProtocol > 0 {
base.ProtocolVersion = uint8(a.config.RPCProtocol)
}
if a.config.RaftProtocol != 0 {
base.RaftConfig.ProtocolVersion = raft.ProtocolVersion(a.config.RaftProtocol)
}
if a.config.ACLMasterToken != "" {
base.ACLMasterToken = a.config.ACLMasterToken
}
if a.config.ACLDatacenter != "" {
base.ACLDatacenter = a.config.ACLDatacenter
}
if a.config.ACLTTL != 0 {
base.ACLTTL = a.config.ACLTTL
}
if a.config.ACLDefaultPolicy != "" {
base.ACLDefaultPolicy = a.config.ACLDefaultPolicy
}
if a.config.ACLDownPolicy != "" {
base.ACLDownPolicy = a.config.ACLDownPolicy
}
base.EnableACLReplication = a.config.EnableACLReplication
if a.config.ACLEnforceVersion8 {
base.ACLEnforceVersion8 = a.config.ACLEnforceVersion8
}
if a.config.ACLEnableKeyListPolicy {
base.ACLEnableKeyListPolicy = a.config.ACLEnableKeyListPolicy
}
if a.config.SessionTTLMin != 0 {
base.SessionTTLMin = a.config.SessionTTLMin
}
if a.config.AutopilotCleanupDeadServers {
base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
}
if a.config.AutopilotLastContactThreshold != 0 {
base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
}
if a.config.AutopilotMaxTrailingLogs != 0 {
base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
}
if a.config.AutopilotServerStabilizationTime != 0 {
base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
}
if a.config.NonVotingServer {
base.NonVoter = a.config.NonVotingServer
}
if a.config.AutopilotRedundancyZoneTag != "" {
base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
}
if a.config.AutopilotDisableUpgradeMigration {
base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration
}
if a.config.AutopilotUpgradeVersionTag != "" {
base.AutopilotConfig.UpgradeVersionTag = a.config.AutopilotUpgradeVersionTag
}
// make sure the advertise address is always set
if base.RPCAdvertise == nil {
base.RPCAdvertise = base.RPCAddr
}
// Rate limiting for RPC calls.
if a.config.RPCRateLimit > 0 {
base.RPCRate = a.config.RPCRateLimit
}
if a.config.RPCMaxBurst > 0 {
base.RPCMaxBurst = a.config.RPCMaxBurst
}
// RPC-related performance configs.
if a.config.RPCHoldTimeout > 0 {
base.RPCHoldTimeout = a.config.RPCHoldTimeout
}
if a.config.LeaveDrainTime > 0 {
base.LeaveDrainTime = a.config.LeaveDrainTime
}
// set the src address for outgoing rpc connections
// Use port 0 so that outgoing connections use a random port.
if !ipaddr.IsAny(base.RPCAddr.IP) {
base.RPCSrcAddr = &net.TCPAddr{IP: base.RPCAddr.IP}
}
// Format the build string
revision := a.config.Revision
if len(revision) > 8 {
revision = revision[:8]
}
base.Build = fmt.Sprintf("%s%s:%s", a.config.Version, a.config.VersionPrerelease, revision)
// Copy the TLS configuration
base.VerifyIncoming = a.config.VerifyIncoming || a.config.VerifyIncomingRPC
if a.config.CAPath != "" || a.config.CAFile != "" {
base.UseTLS = true
}
base.VerifyOutgoing = a.config.VerifyOutgoing
base.VerifyServerHostname = a.config.VerifyServerHostname
base.CAFile = a.config.CAFile
base.CAPath = a.config.CAPath
base.CertFile = a.config.CertFile
base.KeyFile = a.config.KeyFile
base.ServerName = a.config.ServerName
base.Domain = a.config.DNSDomain
base.TLSMinVersion = a.config.TLSMinVersion
base.TLSCipherSuites = a.config.TLSCipherSuites
base.TLSPreferServerCipherSuites = a.config.TLSPreferServerCipherSuites
// Setup the user event callback
base.UserEventHandler = func(e serf.UserEvent) {
select {
case a.eventCh <- e:
case <-a.shutdownCh:
}
}
// Setup the loggers
base.LogOutput = a.LogOutput
// This will set up the LAN keyring, as well as the WAN and any segments
// for servers.
if err := a.setupKeyrings(base); err != nil {
return nil, fmt.Errorf("Failed to configure keyring: %v", err)
}
return base, nil
}
// Setup the serf and memberlist config for any defined network segments.
func (a *Agent) segmentConfig() ([]consul.NetworkSegment, error) {
var segments []consul.NetworkSegment
config := a.config
for _, s := range config.Segments {
serfConf := consul.DefaultConfig().SerfLANConfig
serfConf.MemberlistConfig.BindAddr = s.Bind.IP.String()
serfConf.MemberlistConfig.BindPort = s.Bind.Port
serfConf.MemberlistConfig.AdvertiseAddr = s.Advertise.IP.String()
serfConf.MemberlistConfig.AdvertisePort = s.Advertise.Port
if config.ReconnectTimeoutLAN != 0 {
serfConf.ReconnectTimeout = config.ReconnectTimeoutLAN
}
if config.EncryptVerifyIncoming {
serfConf.MemberlistConfig.GossipVerifyIncoming = config.EncryptVerifyIncoming
}
if config.EncryptVerifyOutgoing {
serfConf.MemberlistConfig.GossipVerifyOutgoing = config.EncryptVerifyOutgoing
}
var rpcAddr *net.TCPAddr
if s.RPCListener {
rpcAddr = &net.TCPAddr{
IP: s.Bind.IP,
Port: a.config.ServerPort,
}
}
segments = append(segments, consul.NetworkSegment{
Name: s.Name,
Bind: serfConf.MemberlistConfig.BindAddr,
Advertise: serfConf.MemberlistConfig.AdvertiseAddr,
Port: s.Bind.Port,
RPCAddr: rpcAddr,
SerfConfig: serfConf,
})
}
return segments, nil
}
// makeRandomID will generate a random UUID for a node.
func (a *Agent) makeRandomID() (string, error) {
id, err := uuid.GenerateUUID()
if err != nil {
return "", err
}
a.logger.Printf("[DEBUG] Using random ID %q as node ID", id)
return id, nil
}
// makeNodeID will try to find a host-specific ID, or else will generate a
// random ID. The returned ID will always be formatted as a GUID. We don't tell
// the caller whether this ID is random or stable since the consequences are
// high for us if this changes, so we will persist it either way. This will let
// gopsutil change implementations without affecting in-place upgrades of nodes.
func (a *Agent) makeNodeID() (string, error) {
// If they've disabled host-based IDs then just make a random one.
if a.config.DisableHostNodeID {
return a.makeRandomID()
}
// Try to get a stable ID associated with the host itself.
info, err := host.Info()
if err != nil {
a.logger.Printf("[DEBUG] Couldn't get a unique ID from the host: %v", err)
return a.makeRandomID()
}
// Make sure the host ID parses as a UUID, since we don't have complete
// control over this process.
id := strings.ToLower(info.HostID)
if _, err := uuid.ParseUUID(id); err != nil {
a.logger.Printf("[DEBUG] Unique ID %q from host isn't formatted as a UUID: %v",
id, err)
return a.makeRandomID()
}
// Hash the input to make it well distributed. The reported Host UUID may be
// similar across nodes if they are on a cloud provider or on motherboards
// created from the same batch.
buf := sha512.Sum512([]byte(id))
id = fmt.Sprintf("%08x-%04x-%04x-%04x-%12x",
buf[0:4],
buf[4:6],
buf[6:8],
buf[8:10],
buf[10:16])
a.logger.Printf("[DEBUG] Using unique ID %q from host as node ID", id)
return id, nil
}
// setupNodeID will pull the persisted node ID, if any, or create a random one
// and persist it.
func (a *Agent) setupNodeID(config *config.RuntimeConfig) error {
// If they've configured a node ID manually then just use that, as
// long as it's valid.
if config.NodeID != "" {
config.NodeID = types.NodeID(strings.ToLower(string(config.NodeID)))
if _, err := uuid.ParseUUID(string(config.NodeID)); err != nil {
return err
}
return nil
}
// For dev mode we have no filesystem access so just make one.
if a.config.DevMode {
id, err := a.makeNodeID()
if err != nil {
return err
}
config.NodeID = types.NodeID(id)
return nil
}
// Load saved state, if any. Since a user could edit this, we also
// validate it.
fileID := filepath.Join(config.DataDir, "node-id")
if _, err := os.Stat(fileID); err == nil {
rawID, err := ioutil.ReadFile(fileID)
if err != nil {
return err
}
nodeID := strings.TrimSpace(string(rawID))
nodeID = strings.ToLower(nodeID)
if _, err := uuid.ParseUUID(nodeID); err != nil {
return err
}
config.NodeID = types.NodeID(nodeID)
}
// If we still don't have a valid node ID, make one.
if config.NodeID == "" {
id, err := a.makeNodeID()
if err != nil {
return err
}
if err := lib.EnsurePath(fileID, false); err != nil {
return err
}
if err := ioutil.WriteFile(fileID, []byte(id), 0600); err != nil {
return err
}
config.NodeID = types.NodeID(id)
}
return nil
}
// setupBaseKeyrings configures the LAN and WAN keyrings.
func (a *Agent) setupBaseKeyrings(config *consul.Config) error {
// If the keyring file is disabled then just poke the provided key
// into the in-memory keyring.
if a.config.DisableKeyringFile {
if a.config.EncryptKey == "" {
return nil
}
keys := []string{a.config.EncryptKey}
if err := loadKeyring(config.SerfLANConfig, keys); err != nil {
return err
}
if a.config.ServerMode {
if err := loadKeyring(config.SerfWANConfig, keys); err != nil {
return err
}
}
return nil
}
// Otherwise, we need to deal with the keyring files.
fileLAN := filepath.Join(a.config.DataDir, SerfLANKeyring)
fileWAN := filepath.Join(a.config.DataDir, SerfWANKeyring)
if a.config.EncryptKey == "" {
goto LOAD
}
if _, err := os.Stat(fileLAN); err != nil {
if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil {
return err
}
}
if a.config.ServerMode {
if _, err := os.Stat(fileWAN); err != nil {
if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil {
return err
}
}
}
LOAD:
if _, err := os.Stat(fileLAN); err == nil {
config.SerfLANConfig.KeyringFile = fileLAN
}
if err := loadKeyringFile(config.SerfLANConfig); err != nil {
return err
}
if a.config.ServerMode {
if _, err := os.Stat(fileWAN); err == nil {
config.SerfWANConfig.KeyringFile = fileWAN
}
if err := loadKeyringFile(config.SerfWANConfig); err != nil {
return err
}
}
return nil
}
// setupKeyrings is used to initialize and load keyrings during agent startup.
func (a *Agent) setupKeyrings(config *consul.Config) error {
// First set up the LAN and WAN keyrings.
if err := a.setupBaseKeyrings(config); err != nil {
return err
}
// If there's no LAN keyring then there's nothing else to set up for
// any segments.
lanKeyring := config.SerfLANConfig.MemberlistConfig.Keyring
if lanKeyring == nil {
return nil
}
// Copy the initial state of the LAN keyring into each segment config.
// Segments don't have their own keyring file, they rely on the LAN
// holding the state so things can't get out of sync.
k, pk := lanKeyring.GetKeys(), lanKeyring.GetPrimaryKey()
for _, segment := range config.Segments {
keyring, err := memberlist.NewKeyring(k, pk)
if err != nil {
return err
}
segment.SerfConfig.MemberlistConfig.Keyring = keyring
}
return nil
}
// registerEndpoint registers a handler for the consul RPC server
// under a unique name while making it accessible under the provided
// name. This allows overwriting handlers for the golang net/rpc
// service which does not allow this.
func (a *Agent) registerEndpoint(name string, handler interface{}) error {
srv, ok := a.delegate.(*consul.Server)
if !ok {
panic("agent must be a server")
}
realname := fmt.Sprintf("%s-%d", name, time.Now().UnixNano())
a.endpointsLock.Lock()
a.endpoints[name] = realname
a.endpointsLock.Unlock()
return srv.RegisterEndpoint(realname, handler)
}
// RPC is used to make an RPC call to the Consul servers
// This allows the agent to implement the Consul.Interface
func (a *Agent) RPC(method string, args interface{}, reply interface{}) error {
a.endpointsLock.RLock()
// fast path: only translate if there are overrides
if len(a.endpoints) > 0 {
p := strings.SplitN(method, ".", 2)
if e := a.endpoints[p[0]]; e != "" {
method = e + "." + p[1]
}
}
a.endpointsLock.RUnlock()
return a.delegate.RPC(method, args, reply)
}
// SnapshotRPC performs the requested snapshot RPC against the Consul server in
// a streaming manner. The contents of in will be read and passed along as the
// payload, and the response message will determine the error status, and any
// return payload will be written to out.
func (a *Agent) SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer,
replyFn structs.SnapshotReplyFn) error {
return a.delegate.SnapshotRPC(args, in, out, replyFn)
}
// Leave is used to prepare the agent for a graceful shutdown
func (a *Agent) Leave() error {
return a.delegate.Leave()
}
// ShutdownAgent is used to hard stop the agent. Should be preceded by
// Leave to do it gracefully. Should be followed by ShutdownEndpoints to
// terminate the HTTP and DNS servers as well.
func (a *Agent) ShutdownAgent() error {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
if a.shutdown {
return nil
}
a.logger.Println("[INFO] agent: Requesting shutdown")
// Stop all the checks
a.checkLock.Lock()
defer a.checkLock.Unlock()
for _, chk := range a.checkMonitors {
chk.Stop()
}
for _, chk := range a.checkTTLs {
chk.Stop()
}
for _, chk := range a.checkHTTPs {
chk.Stop()
}
for _, chk := range a.checkTCPs {
chk.Stop()
}
for _, chk := range a.checkDockers {
chk.Stop()
}
var err error
if a.delegate != nil {
err = a.delegate.Shutdown()
if _, ok := a.delegate.(*consul.Server); ok {
a.logger.Print("[INFO] agent: consul server down")
} else {
a.logger.Print("[INFO] agent: consul client down")
}
}
pidErr := a.deletePid()
if pidErr != nil {
a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
}
a.logger.Println("[INFO] agent: shutdown complete")
a.shutdown = true
close(a.shutdownCh)
return err
}
// ShutdownEndpoints terminates the HTTP and DNS servers. Should be
// preceeded by ShutdownAgent.
func (a *Agent) ShutdownEndpoints() {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
if len(a.dnsServers) == 0 || len(a.httpServers) == 0 {
return
}
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
a.dnsServers = nil
for _, srv := range a.httpServers {
a.logger.Printf("[INFO] agent: Stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.addr.String(), srv.addr.Network())
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.addr.String(), srv.addr.Network())
}
}
a.httpServers = nil
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
}
// ReloadCh is used to return a channel that can be
// used for triggering reloads and returning a response.
func (a *Agent) ReloadCh() chan chan error {
return a.reloadCh
}
// RetryJoinCh is a channel that transports errors
// from the retry join process.
func (a *Agent) RetryJoinCh() <-chan error {
return a.retryJoinCh
}
// ShutdownCh is used to return a channel that can be
// selected to wait for the agent to perform a shutdown.
func (a *Agent) ShutdownCh() <-chan struct{} {
return a.shutdownCh
}
// JoinLAN is used to have the agent join a LAN cluster
func (a *Agent) JoinLAN(addrs []string) (n int, err error) {
a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs)
n, err = a.delegate.JoinLAN(addrs)
a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err)
if err == nil && a.joinLANNotifier != nil {
if notifErr := a.joinLANNotifier.Notify(systemd.Ready); notifErr != nil {
a.logger.Printf("[DEBUG] agent: systemd notify failed: %v", notifErr)
}
}
return
}
// JoinWAN is used to have the agent join a WAN cluster
func (a *Agent) JoinWAN(addrs []string) (n int, err error) {
a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs)
if srv, ok := a.delegate.(*consul.Server); ok {
n, err = srv.JoinWAN(addrs)
} else {
err = fmt.Errorf("Must be a server to join WAN cluster")
}
a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err)
return
}
// ForceLeave is used to remove a failed node from the cluster
func (a *Agent) ForceLeave(node string) (err error) {
a.logger.Printf("[INFO] Force leaving node: %v", node)
err = a.delegate.RemoveFailedNode(node)
if err != nil {
a.logger.Printf("[WARN] Failed to remove node: %v", err)
}
return err
}
// LocalMember is used to return the local node
func (a *Agent) LocalMember() serf.Member {
return a.delegate.LocalMember()
}
// LANMembers is used to retrieve the LAN members
func (a *Agent) LANMembers() []serf.Member {
return a.delegate.LANMembers()
}
// WANMembers is used to retrieve the WAN members
func (a *Agent) WANMembers() []serf.Member {
if srv, ok := a.delegate.(*consul.Server); ok {
return srv.WANMembers()
}
return nil
}
// StartSync is called once Services and Checks are registered.
// This is called to prevent a race between clients and the anti-entropy routines
func (a *Agent) StartSync() {
// Start the anti entropy routine
go a.state.antiEntropy(a.shutdownCh)
}
// PauseSync is used to pause anti-entropy while bulk changes are make
func (a *Agent) PauseSync() {
a.state.Pause()
}
// ResumeSync is used to unpause anti-entropy after bulk changes are make
func (a *Agent) ResumeSync() {
a.state.Resume()
}
// GetLANCoordinate returns the coordinates of this node in the local pools
// (assumes coordinates are enabled, so check that before calling).
func (a *Agent) GetLANCoordinate() (lib.CoordinateSet, error) {
return a.delegate.GetLANCoordinate()
}
// sendCoordinate is a long-running loop that periodically sends our coordinate
// to the server. Closing the agent's shutdownChannel will cause this to exit.
func (a *Agent) sendCoordinate() {
OUTER:
for {
rate := a.config.SyncCoordinateRateTarget
min := a.config.SyncCoordinateIntervalMin
intv := lib.RateScaledInterval(rate, min, len(a.LANMembers()))
intv = intv + lib.RandomStagger(intv)
select {
case <-time.After(intv):
members := a.LANMembers()
grok, err := consul.CanServersUnderstandProtocol(members, 3)
if err != nil {
a.logger.Printf("[ERR] agent: Failed to check servers: %s", err)
continue
}
if !grok {
a.logger.Printf("[DEBUG] agent: Skipping coordinate updates until servers are upgraded")
continue
}
cs, err := a.GetLANCoordinate()
if err != nil {
a.logger.Printf("[ERR] agent: Failed to get coordinate: %s", err)
continue
}
for segment, coord := range cs {
req := structs.CoordinateUpdateRequest{
Datacenter: a.config.Datacenter,
Node: a.config.NodeName,
Segment: segment,
Coord: coord,
WriteRequest: structs.WriteRequest{Token: a.tokens.AgentToken()},
}
var reply struct{}
if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
if acl.IsErrPermissionDenied(err) {
a.logger.Printf("[WARN] agent: Coordinate update blocked by ACLs")
} else {
a.logger.Printf("[ERR] agent: Coordinate update error: %v", err)
}
continue OUTER
}
}
case <-a.shutdownCh:
return
}
}
}
// reapServicesInternal does a single pass, looking for services to reap.
func (a *Agent) reapServicesInternal() {
reaped := make(map[string]struct{})
for checkID, check := range a.state.CriticalChecks() {
// There's nothing to do if there's no service.
if check.Check.ServiceID == "" {
continue
}
// There might be multiple checks for one service, so
// we don't need to reap multiple times.
serviceID := check.Check.ServiceID
if _, ok := reaped[serviceID]; ok {
continue
}
// See if there's a timeout.
a.checkLock.Lock()
timeout, ok := a.checkReapAfter[checkID]
a.checkLock.Unlock()
// Reap, if necessary. We keep track of which service
// this is so that we won't try to remove it again.
if ok && check.CriticalFor > timeout {
reaped[serviceID] = struct{}{}
a.RemoveService(serviceID, true)
a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service",
checkID, serviceID)
}
}
}
// reapServices is a long running goroutine that looks for checks that have been
// critical too long and dregisters their associated services.
func (a *Agent) reapServices() {
for {
select {
case <-time.After(a.config.CheckReapInterval):
a.reapServicesInternal()
case <-a.shutdownCh:
return
}
}
}
// persistedService is used to wrap a service definition and bundle it
// with an ACL token so we can restore both at a later agent start.
type persistedService struct {
Token string
Service *structs.NodeService
}
// persistService saves a service definition to a JSON file in the data dir
func (a *Agent) persistService(service *structs.NodeService) error {
svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID))
wrapped := persistedService{
Token: a.state.ServiceToken(service.ID),
Service: service,
}
encoded, err := json.Marshal(wrapped)
if err != nil {
return err
}
return writeFileAtomic(svcPath, encoded)
}
// purgeService removes a persisted service definition file from the data dir
func (a *Agent) purgeService(serviceID string) error {
svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID))
if _, err := os.Stat(svcPath); err == nil {
return os.Remove(svcPath)
}
return nil
}
// persistCheck saves a check definition to the local agent's state directory
func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *structs.CheckType) error {
checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID))
// Create the persisted check
wrapped := persistedCheck{
Check: check,
ChkType: chkType,
Token: a.state.CheckToken(check.CheckID),
}
encoded, err := json.Marshal(wrapped)
if err != nil {
return err
}
return writeFileAtomic(checkPath, encoded)
}
// purgeCheck removes a persisted check definition file from the data dir
func (a *Agent) purgeCheck(checkID types.CheckID) error {
checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID))
if _, err := os.Stat(checkPath); err == nil {
return os.Remove(checkPath)
}
return nil
}
// writeFileAtomic writes the given contents to a temporary file in the same
// directory, does an fsync and then renames the file to its real path
func writeFileAtomic(path string, contents []byte) error {
uuid, err := uuid.GenerateUUID()
if err != nil {
return err
}
tempPath := fmt.Sprintf("%s-%s.tmp", path, uuid)
if err := os.MkdirAll(filepath.Dir(path), 0700); err != nil {
return err
}
fh, err := os.OpenFile(tempPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
if err != nil {
return err
}
if _, err := fh.Write(contents); err != nil {
fh.Close()
os.Remove(tempPath)
return err
}
if err := fh.Sync(); err != nil {
fh.Close()
os.Remove(tempPath)
return err
}
if err := fh.Close(); err != nil {
os.Remove(tempPath)
return err
}
if err := os.Rename(tempPath, path); err != nil {
os.Remove(tempPath)
return err
}
return nil
}
// AddService is used to add a service entry.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string) error {
if service.Service == "" {
return fmt.Errorf("Service name missing")
}
if service.ID == "" && service.Service != "" {
service.ID = service.Service
}
for _, check := range chkTypes {
if !check.Valid() {
return fmt.Errorf("Check type is not valid")
}
}
// Warn if the service name is incompatible with DNS
if InvalidDnsRe.MatchString(service.Service) {
a.logger.Printf("[WARN] Service name %q will not be discoverable "+
"via DNS due to invalid characters. Valid characters include "+
"all alpha-numerics and dashes.", service.Service)
}
// Warn if any tags are incompatible with DNS
for _, tag := range service.Tags {
if InvalidDnsRe.MatchString(tag) {
a.logger.Printf("[DEBUG] Service tag %q will not be discoverable "+
"via DNS due to invalid characters. Valid characters include "+
"all alpha-numerics and dashes.", tag)
}
}
// Pause the service syncs during modification
a.PauseSync()
defer a.ResumeSync()
// Take a snapshot of the current state of checks (if any), and
// restore them before resuming anti-entropy.
snap := a.snapshotCheckState()
defer a.restoreCheckState(snap)
// Add the service
a.state.AddService(service, token)
// Persist the service to a file
if persist && !a.config.DevMode {
if err := a.persistService(service); err != nil {
return err
}
}
// Create an associated health check
for i, chkType := range chkTypes {
checkID := string(chkType.CheckID)
if checkID == "" {
checkID = fmt.Sprintf("service:%s", service.ID)
if len(chkTypes) > 1 {
checkID += fmt.Sprintf(":%d", i+1)
}
}
name := chkType.Name
if name == "" {
name = fmt.Sprintf("Service '%s' check", service.Service)
}
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: types.CheckID(checkID),
Name: name,
Status: api.HealthCritical,
Notes: chkType.Notes,
ServiceID: service.ID,
ServiceName: service.Service,
}
if chkType.Status != "" {
check.Status = chkType.Status
}
if err := a.AddCheck(check, chkType, persist, token); err != nil {
return err
}
}
return nil
}
// RemoveService is used to remove a service entry.
// The agent will make a best effort to ensure it is deregistered
func (a *Agent) RemoveService(serviceID string, persist bool) error {
// Validate ServiceID
if serviceID == "" {
return fmt.Errorf("ServiceID missing")
}
// Remove service immediately
if err := a.state.RemoveService(serviceID); err != nil {
a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err)
return nil
}
// Remove the service from the data dir
if persist {
if err := a.purgeService(serviceID); err != nil {
return err
}
}
// Deregister any associated health checks
for checkID, health := range a.state.Checks() {
if health.ServiceID != serviceID {
continue
}
if err := a.RemoveCheck(checkID, persist); err != nil {
return err
}
}
log.Printf("[DEBUG] agent: removed service %q", serviceID)
return nil
}
// AddCheck is used to add a health check to the agent.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered. The Check may include a CheckType which
// is used to automatically update the check status
func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string) error {
if check.CheckID == "" {
return fmt.Errorf("CheckID missing")
}
if chkType != nil {
if !chkType.Valid() {
return fmt.Errorf("Check type is not valid")
}
if chkType.IsScript() && !a.config.EnableScriptChecks {
return fmt.Errorf("Scripts are disabled on this agent; to enable, configure 'enable_script_checks' to true")
}
}
if check.ServiceID != "" {
svc, ok := a.state.Services()[check.ServiceID]
if !ok {
return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
}
check.ServiceName = svc.Service
}
a.checkLock.Lock()
defer a.checkLock.Unlock()
// Check if already registered
if chkType != nil {
switch {
case chkType.IsTTL():
if existing, ok := a.checkTTLs[check.CheckID]; ok {
existing.Stop()
delete(a.checkTTLs, check.CheckID)
}
ttl := &CheckTTL{
Notify: a.state,
CheckID: check.CheckID,
TTL: chkType.TTL,
Logger: a.logger,
}
// Restore persisted state, if any
if err := a.loadCheckState(check); err != nil {
a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
check.CheckID, err)
}
ttl.Start()
a.checkTTLs[check.CheckID] = ttl
case chkType.IsHTTP():
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
existing.Stop()
delete(a.checkHTTPs, check.CheckID)
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}
http := &CheckHTTP{
Notify: a.state,
CheckID: check.CheckID,
HTTP: chkType.HTTP,
Header: chkType.Header,
Method: chkType.Method,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
TLSSkipVerify: chkType.TLSSkipVerify,
}
http.Start()
a.checkHTTPs[check.CheckID] = http
case chkType.IsTCP():
if existing, ok := a.checkTCPs[check.CheckID]; ok {
existing.Stop()
delete(a.checkTCPs, check.CheckID)
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}
tcp := &CheckTCP{
Notify: a.state,
CheckID: check.CheckID,
TCP: chkType.TCP,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
}
tcp.Start()
a.checkTCPs[check.CheckID] = tcp
case chkType.IsDocker():
if existing, ok := a.checkDockers[check.CheckID]; ok {
existing.Stop()
delete(a.checkDockers, check.CheckID)
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}
if chkType.Script != "" {
a.logger.Printf("[WARN] agent: check %q has the 'script' field, which has been deprecated "+
"and replaced with the 'args' field. See https://www.consul.io/docs/agent/checks.html",
check.CheckID)
}
if a.dockerClient == nil {
dc, err := NewDockerClient(os.Getenv("DOCKER_HOST"), CheckBufSize)
if err != nil {
a.logger.Printf("[ERR] agent: error creating docker client: %s", err)
return err
}
a.logger.Printf("[DEBUG] agent: created docker client for %s", dc.host)
a.dockerClient = dc
}
dockerCheck := &CheckDocker{
Notify: a.state,
CheckID: check.CheckID,
DockerContainerID: chkType.DockerContainerID,
Shell: chkType.Shell,
Script: chkType.Script,
ScriptArgs: chkType.ScriptArgs,
Interval: chkType.Interval,
Logger: a.logger,
client: a.dockerClient,
}
dockerCheck.Start()
a.checkDockers[check.CheckID] = dockerCheck
case chkType.IsMonitor():
if existing, ok := a.checkMonitors[check.CheckID]; ok {
existing.Stop()
delete(a.checkMonitors, check.CheckID)
}
if chkType.Interval < MinInterval {
a.logger.Printf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval)
chkType.Interval = MinInterval
}
if chkType.Script != "" {
a.logger.Printf("[WARN] agent: check %q has the 'script' field, which has been deprecated "+
"and replaced with the 'args' field. See https://www.consul.io/docs/agent/checks.html",
check.CheckID)
}
monitor := &CheckMonitor{
Notify: a.state,
CheckID: check.CheckID,
Script: chkType.Script,
ScriptArgs: chkType.ScriptArgs,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
}
monitor.Start()
a.checkMonitors[check.CheckID] = monitor
default:
return fmt.Errorf("Check type is not valid")
}
if chkType.DeregisterCriticalServiceAfter > 0 {
timeout := chkType.DeregisterCriticalServiceAfter
if timeout < a.config.CheckDeregisterIntervalMin {
timeout = a.config.CheckDeregisterIntervalMin
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v",
check.CheckID, a.config.CheckDeregisterIntervalMin))
}
a.checkReapAfter[check.CheckID] = timeout
} else {
delete(a.checkReapAfter, check.CheckID)
}
}
// Add to the local state for anti-entropy
err := a.state.AddCheck(check, token)
if err != nil {
a.cancelCheckMonitors(check.CheckID)
return err
}
// Persist the check
if persist && !a.config.DevMode {
return a.persistCheck(check, chkType)
}
return nil
}
// RemoveCheck is used to remove a health check.
// The agent will make a best effort to ensure it is deregistered
func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
// Validate CheckID
if checkID == "" {
return fmt.Errorf("CheckID missing")
}
// Add to the local state for anti-entropy
a.state.RemoveCheck(checkID)
a.checkLock.Lock()
defer a.checkLock.Unlock()
a.cancelCheckMonitors(checkID)
if persist {
if err := a.purgeCheck(checkID); err != nil {
return err
}
if err := a.purgeCheckState(checkID); err != nil {
return err
}
}
a.logger.Printf("[DEBUG] agent: removed check %q", checkID)
return nil
}
func (a *Agent) cancelCheckMonitors(checkID types.CheckID) {
// Stop any monitors
delete(a.checkReapAfter, checkID)
if check, ok := a.checkMonitors[checkID]; ok {
check.Stop()
delete(a.checkMonitors, checkID)
}
if check, ok := a.checkHTTPs[checkID]; ok {
check.Stop()
delete(a.checkHTTPs, checkID)
}
if check, ok := a.checkTCPs[checkID]; ok {
check.Stop()
delete(a.checkTCPs, checkID)
}
if check, ok := a.checkTTLs[checkID]; ok {
check.Stop()
delete(a.checkTTLs, checkID)
}
if check, ok := a.checkDockers[checkID]; ok {
check.Stop()
delete(a.checkDockers, checkID)
}
}
// updateTTLCheck is used to update the status of a TTL check via the Agent API.
func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) error {
a.checkLock.Lock()
defer a.checkLock.Unlock()
// Grab the TTL check.
check, ok := a.checkTTLs[checkID]
if !ok {
return fmt.Errorf("CheckID %q does not have associated TTL", checkID)
}
// Set the status through CheckTTL to reset the TTL.
check.SetStatus(status, output)
// We don't write any files in dev mode so bail here.
if a.config.DevMode {
return nil
}
// Persist the state so the TTL check can come up in a good state after
// an agent restart, especially with long TTL values.
if err := a.persistCheckState(check, status, output); err != nil {
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
}
return nil
}
// persistCheckState is used to record the check status into the data dir.
// This allows the state to be restored on a later agent start. Currently
// only useful for TTL based checks.
func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
// Create the persisted state
state := persistedCheckState{
CheckID: check.CheckID,
Status: status,
Output: output,
Expires: time.Now().Add(check.TTL).Unix(),
}
// Encode the state
buf, err := json.Marshal(state)
if err != nil {
return err
}
// Create the state dir if it doesn't exist
dir := filepath.Join(a.config.DataDir, checkStateDir)
if err := os.MkdirAll(dir, 0700); err != nil {
return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
}
// Write the state to the file
file := filepath.Join(dir, checkIDHash(check.CheckID))
// Create temp file in same dir, to make more likely atomic
tempFile := file + ".tmp"
// persistCheckState is called frequently, so don't use writeFileAtomic to avoid calling fsync here
if err := ioutil.WriteFile(tempFile, buf, 0600); err != nil {
return fmt.Errorf("failed writing temp file %q: %s", tempFile, err)
}
if err := os.Rename(tempFile, file); err != nil {
return fmt.Errorf("failed to rename temp file from %q to %q: %s", tempFile, file, err)
}
return nil
}
// loadCheckState is used to restore the persisted state of a check.
func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
// Try to read the persisted state for this check
file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID))
buf, err := ioutil.ReadFile(file)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return fmt.Errorf("failed reading file %q: %s", file, err)
}
// Decode the state data
var p persistedCheckState
if err := json.Unmarshal(buf, &p); err != nil {
a.logger.Printf("[ERR] agent: failed decoding check state: %s", err)
return a.purgeCheckState(check.CheckID)
}
// Check if the state has expired
if time.Now().Unix() >= p.Expires {
a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
return a.purgeCheckState(check.CheckID)
}
// Restore the fields from the state
check.Output = p.Output
check.Status = p.Status
return nil
}
// purgeCheckState is used to purge the state of a check from the data dir
func (a *Agent) purgeCheckState(checkID types.CheckID) error {
file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID))
err := os.Remove(file)
if os.IsNotExist(err) {
return nil
}
return err
}
func (a *Agent) GossipEncrypted() bool {
return a.delegate.Encrypted()
}
// Stats is used to get various debugging state from the sub-systems
func (a *Agent) Stats() map[string]map[string]string {
toString := func(v uint64) string {
return strconv.FormatUint(v, 10)
}
stats := a.delegate.Stats()
stats["agent"] = map[string]string{
"check_monitors": toString(uint64(len(a.checkMonitors))),
"check_ttls": toString(uint64(len(a.checkTTLs))),
"checks": toString(uint64(len(a.state.checks))),
"services": toString(uint64(len(a.state.services))),
}
revision := a.config.Revision
if len(revision) > 8 {
revision = revision[:8]
}
stats["build"] = map[string]string{
"revision": revision,
"version": a.config.Version,
"prerelease": a.config.VersionPrerelease,
}
return stats
}
// storePid is used to write out our PID to a file if necessary
func (a *Agent) storePid() error {
// Quit fast if no pidfile
pidPath := a.config.PidFile
if pidPath == "" {
return nil
}
// Open the PID file
pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666)
if err != nil {
return fmt.Errorf("Could not open pid file: %v", err)
}
defer pidFile.Close()
// Write out the PID
pid := os.Getpid()
_, err = pidFile.WriteString(fmt.Sprintf("%d", pid))
if err != nil {
return fmt.Errorf("Could not write to pid file: %s", err)
}
return nil
}
// deletePid is used to delete our PID on exit
func (a *Agent) deletePid() error {
// Quit fast if no pidfile
pidPath := a.config.PidFile
if pidPath == "" {
return nil
}
stat, err := os.Stat(pidPath)
if err != nil {
return fmt.Errorf("Could not remove pid file: %s", err)
}
if stat.IsDir() {
return fmt.Errorf("Specified pid file path is directory")
}
err = os.Remove(pidPath)
if err != nil {
return fmt.Errorf("Could not remove pid file: %s", err)
}
return nil
}
// loadServices will load service definitions from configuration and persisted
// definitions on disk, and load them into the local agent.
func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
// Register the services from config
for _, service := range conf.Services {
ns := service.NodeService()
chkTypes := service.CheckTypes()
if err := a.AddService(ns, chkTypes, false, service.Token); err != nil {
return fmt.Errorf("Failed to register service '%s': %v", service.ID, err)
}
}
// Load any persisted services
svcDir := filepath.Join(a.config.DataDir, servicesDir)
files, err := ioutil.ReadDir(svcDir)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err)
}
for _, fi := range files {
// Skip all dirs
if fi.IsDir() {
continue
}
// Skip all partially written temporary files
if strings.HasSuffix(fi.Name(), "tmp") {
a.logger.Printf("[WARN] Ignoring temporary service file %v", fi.Name())
continue
}
// Open the file for reading
file := filepath.Join(svcDir, fi.Name())
fh, err := os.Open(file)
if err != nil {
return fmt.Errorf("failed opening service file %q: %s", file, err)
}
// Read the contents into a buffer
buf, err := ioutil.ReadAll(fh)
fh.Close()
if err != nil {
return fmt.Errorf("failed reading service file %q: %s", file, err)
}
// Try decoding the service definition
var p persistedService
if err := json.Unmarshal(buf, &p); err != nil {
// Backwards-compatibility for pre-0.5.1 persisted services
if err := json.Unmarshal(buf, &p.Service); err != nil {
return fmt.Errorf("failed decoding service file %q: %s", file, err)
}
}
serviceID := p.Service.ID
if _, ok := a.state.services[serviceID]; ok {
// Purge previously persisted service. This allows config to be
// preferred over services persisted from the API.
a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q",
serviceID, file)
if err := a.purgeService(serviceID); err != nil {
return fmt.Errorf("failed purging service %q: %s", serviceID, err)
}
} else {
a.logger.Printf("[DEBUG] agent: restored service definition %q from %q",
serviceID, file)
if err := a.AddService(p.Service, nil, false, p.Token); err != nil {
return fmt.Errorf("failed adding service %q: %s", serviceID, err)
}
}
}
return nil
}
// unloadServices will deregister all services other than the 'consul' service
// known to the local agent.
func (a *Agent) unloadServices() error {
for _, service := range a.state.Services() {
if err := a.RemoveService(service.ID, false); err != nil {
return fmt.Errorf("Failed deregistering service '%s': %v", service.ID, err)
}
}
return nil
}
// loadChecks loads check definitions and/or persisted check definitions from
// disk and re-registers them with the local agent.
func (a *Agent) loadChecks(conf *config.RuntimeConfig) error {
// Register the checks from config
for _, check := range conf.Checks {
health := check.HealthCheck(conf.NodeName)
chkType := check.CheckType()
if err := a.AddCheck(health, chkType, false, check.Token); err != nil {
return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check)
}
}
// Load any persisted checks
checkDir := filepath.Join(a.config.DataDir, checksDir)
files, err := ioutil.ReadDir(checkDir)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err)
}
for _, fi := range files {
// Ignore dirs - we only care about the check definition files
if fi.IsDir() {
continue
}
// Open the file for reading
file := filepath.Join(checkDir, fi.Name())
fh, err := os.Open(file)
if err != nil {
return fmt.Errorf("Failed opening check file %q: %s", file, err)
}
// Read the contents into a buffer
buf, err := ioutil.ReadAll(fh)
fh.Close()
if err != nil {
return fmt.Errorf("failed reading check file %q: %s", file, err)
}
// Decode the check
var p persistedCheck
if err := json.Unmarshal(buf, &p); err != nil {
return fmt.Errorf("Failed decoding check file %q: %s", file, err)
}
checkID := p.Check.CheckID
if _, ok := a.state.checks[checkID]; ok {
// Purge previously persisted check. This allows config to be
// preferred over persisted checks from the API.
a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q",
checkID, file)
if err := a.purgeCheck(checkID); err != nil {
return fmt.Errorf("Failed purging check %q: %s", checkID, err)
}
} else {
// Default check to critical to avoid placing potentially unhealthy
// services into the active pool
p.Check.Status = api.HealthCritical
if err := a.AddCheck(p.Check, p.ChkType, false, p.Token); err != nil {
// Purge the check if it is unable to be restored.
a.logger.Printf("[WARN] agent: Failed to restore check %q: %s",
checkID, err)
if err := a.purgeCheck(checkID); err != nil {
return fmt.Errorf("Failed purging check %q: %s", checkID, err)
}
}
a.logger.Printf("[DEBUG] agent: restored health check %q from %q",
p.Check.CheckID, file)
}
}
return nil
}
// unloadChecks will deregister all checks known to the local agent.
func (a *Agent) unloadChecks() error {
for _, check := range a.state.Checks() {
if err := a.RemoveCheck(check.CheckID, false); err != nil {
return fmt.Errorf("Failed deregistering check '%s': %s", check.CheckID, err)
}
}
return nil
}
// snapshotCheckState is used to snapshot the current state of the health
// checks. This is done before we reload our checks, so that we can properly
// restore into the same state.
func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
return a.state.Checks()
}
// restoreCheckState is used to reset the health state based on a snapshot.
// This is done after we finish the reload to avoid any unnecessary flaps
// in health state and potential session invalidations.
func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
for id, check := range snap {
a.state.UpdateCheck(id, check.Status, check.Output)
}
}
// loadMetadata loads node metadata fields from the agent config and
// updates them on the local agent.
func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error {
a.state.Lock()
defer a.state.Unlock()
for key, value := range conf.NodeMeta {
a.state.metadata[key] = value
}
a.state.metadata[structs.MetaSegmentKey] = conf.SegmentName
a.state.changeMade()
return nil
}
// unloadMetadata resets the local metadata state
func (a *Agent) unloadMetadata() {
a.state.Lock()
defer a.state.Unlock()
a.state.metadata = make(map[string]string)
}
// serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) types.CheckID {
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
}
// EnableServiceMaintenance will register a false health check against the given
// service ID with critical status. This will exclude the service from queries.
func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error {
service, ok := a.state.Services()[serviceID]
if !ok {
return fmt.Errorf("No service registered with ID %q", serviceID)
}
// Check if maintenance mode is not already enabled
checkID := serviceMaintCheckID(serviceID)
if _, ok := a.state.Checks()[checkID]; ok {
return nil
}
// Use default notes if no reason provided
if reason == "" {
reason = defaultServiceMaintReason
}
// Create and register the critical health check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: checkID,
Name: "Service Maintenance Mode",
Notes: reason,
ServiceID: service.ID,
ServiceName: service.Service,
Status: api.HealthCritical,
}
a.AddCheck(check, nil, true, token)
a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID)
return nil
}
// DisableServiceMaintenance will deregister the fake maintenance mode check
// if the service has been marked as in maintenance.
func (a *Agent) DisableServiceMaintenance(serviceID string) error {
if _, ok := a.state.Services()[serviceID]; !ok {
return fmt.Errorf("No service registered with ID %q", serviceID)
}
// Check if maintenance mode is enabled
checkID := serviceMaintCheckID(serviceID)
if _, ok := a.state.Checks()[checkID]; !ok {
return nil
}
// Deregister the maintenance check
a.RemoveCheck(checkID, true)
a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID)
return nil
}
// EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
return
}
// Use a default notes value
if reason == "" {
reason = defaultNodeMaintReason
}
// Create and register the node maintenance check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: structs.NodeMaint,
Name: "Node Maintenance Mode",
Notes: reason,
Status: api.HealthCritical,
}
a.AddCheck(check, nil, true, token)
a.logger.Printf("[INFO] agent: Node entered maintenance mode")
}
// DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
return
}
a.RemoveCheck(structs.NodeMaint, true)
a.logger.Printf("[INFO] agent: Node left maintenance mode")
}
func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
// Bulk update the services and checks
a.PauseSync()
defer a.ResumeSync()
// Snapshot the current state, and restore it afterwards
snap := a.snapshotCheckState()
defer a.restoreCheckState(snap)
// First unload all checks, services, and metadata. This lets us begin the reload
// with a clean slate.
if err := a.unloadServices(); err != nil {
return fmt.Errorf("Failed unloading services: %s", err)
}
if err := a.unloadChecks(); err != nil {
return fmt.Errorf("Failed unloading checks: %s", err)
}
a.unloadMetadata()
// Reload service/check definitions and metadata.
if err := a.loadServices(newCfg); err != nil {
return fmt.Errorf("Failed reloading services: %s", err)
}
if err := a.loadChecks(newCfg); err != nil {
return fmt.Errorf("Failed reloading checks: %s", err)
}
if err := a.loadMetadata(newCfg); err != nil {
return fmt.Errorf("Failed reloading metadata: %s", err)
}
if err := a.reloadWatches(newCfg); err != nil {
return fmt.Errorf("Failed reloading watches: %v", err)
}
// Update filtered metrics
metrics.UpdateFilter(newCfg.TelemetryAllowedPrefixes, newCfg.TelemetryBlockedPrefixes)
return nil
}