235 lines
7.8 KiB
Go

package autopilot
import (
"context"
"sync"
"time"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/raft"
)
const (
// These constants were take from what exists in Consul at the time of module extraction.
DefaultUpdateInterval = 2 * time.Second
DefaultReconcileInterval = 10 * time.Second
)
// Option is an option to be used when creating a new Autopilot instance
type Option func(*Autopilot)
// WithUpdateInterval returns an Option to set the Autopilot instance's
// update interval.
func WithUpdateInterval(t time.Duration) Option {
if t == 0 {
t = DefaultUpdateInterval
}
return func(a *Autopilot) {
a.updateInterval = t
}
}
// WithReconcileInterval returns an Option to set the Autopilot instance's
// reconcile interval.
func WithReconcileInterval(t time.Duration) Option {
if t == 0 {
t = DefaultReconcileInterval
}
return func(a *Autopilot) {
a.reconcileInterval = t
}
}
// WithLogger returns an Option to set the Autopilot instance's logger
func WithLogger(logger hclog.Logger) Option {
if logger == nil {
logger = hclog.Default()
}
return func(a *Autopilot) {
a.logger = logger.Named("autopilot")
}
}
// withTimeProvider returns an Option which overrides and Autopilot instance's
// time provider with the given one. This should only be used in tests
// as a means of making some time.Time values in an autopilot state deterministic.
// For real uses the default runtimeTimeProvider should be used.
func withTimeProvider(provider timeProvider) Option {
return func(a *Autopilot) {
a.time = provider
}
}
// WithPromoter returns an option to set the Promoter type that Autpilot will
// use. When the option is not given the default StablePromoter from this package
// will be used.
func WithPromoter(promoter Promoter) Option {
if promoter == nil {
promoter = DefaultPromoter()
}
return func(a *Autopilot) {
a.promoter = promoter
}
}
// ExecutionStatus represents the current status of the autopilot background go routines
type ExecutionStatus string
const (
NotRunning ExecutionStatus = "not-running"
Running ExecutionStatus = "running"
ShuttingDown ExecutionStatus = "shutting-down"
)
type execInfo struct {
// status is the current state of autopilot executation
status ExecutionStatus
// shutdown is a function that can be execute to shutdown a running
// autopilot's go routines.
shutdown context.CancelFunc
// done is a chan that will be closed when the running autopilot go
// routines have exited. Technically closing it is the very last
// thing done in the go routine but at that point enough state has
// been cleaned up that we would then allow it to be started
// immediately afterward
done chan struct{}
}
// Autopilot is the type to manage a running Raft instance.
//
// Each Raft node in the cluster will have a corresponding Autopilot instance but
// only 1 Autopilot instance should run at a time in the cluster. So when a node
// gains Raft leadership the corresponding Autopilot instance should have it's
// Start method called. Then if leadership is lost that node should call the
// Stop method on the Autopilot instance.
type Autopilot struct {
logger hclog.Logger
// delegate is used to get information about the system such as Raft server
// states, known servers etc.
delegate ApplicationIntegration
// promoter is used to calculate promotions, demotions and leadership transfers
// given a particular autopilot State. The interface also contains methods
// for filling in parts of the autopilot state that the core module doesn't
// control such as the Ext fields on the Server and State types.
promoter Promoter
// raft is an interface that implements all the parts of the Raft library interface
// that we use. It is an interface to allow for mocking raft during testing.
raft Raft
// time is an interface with a single method for getting the current time - `Now`.
// In some tests this will be the MockTimeProvider which allows tests to be more
// deterministic but for running systems this should not be overrided from the
// default which is the runtimeTimeProvider and is a small shim around calling
// time.Now.
time timeProvider
// reconcileInterval is how long between rounds of performing promotions, demotions
// and leadership transfers.
reconcileInterval time.Duration
// updateInterval is the time between the periodic state updates. These periodic
// state updates take in known servers from the delegate, request Raft stats be
// fetched and pull in other inputs such as the Raft configuration to create
// an updated view of the Autopilot State.
updateInterval time.Duration
// state is the structure that autopilot uses to make decisions about what to do.
// This field should be considered immutable and no modifications to an existing
// state should be made but instead a new state is created and set to this field
// while holding the stateLock.
state *State
// stateLock is meant to only protect the state field. This just prevents
// the periodic state update and consumers requesting the autopilot state from
// racing.
stateLock sync.RWMutex
// startTime is recorded so that we can make better determinations about server
// stability during the initial period of time after autopilot first starts.
// If autopilot has just started the default behavior to check if a server is
// stable will not work as it will ensure the server has been healthy for
// the configured server stabilization time. If that configure time is longer
// than the amount of time autopilot has been running you can run into issues
// with leadership flapping during some scenarios where a cluster is being
// brought up.
startTime time.Time
// removeDeadCh is used to trigger the running autopilot go routines to
// find and remove any dead/failed servers
removeDeadCh chan struct{}
// reconcileCh is used to trigger an immediate round of reconciliation.
reconcileCh chan struct{}
// leaderLock implements a cancellable mutex that will be used to ensure
// that only one autopilot go routine is the "leader". The leader is
// the go routine that is currently responsible for updating the
// autopilot state and performing raft promotions/demotions.
leaderLock *mutex
// execution is the information about the most recent autopilot execution.
// Start will initialize this with the most recent execution and it will
// be updated by Stop and by the go routines being executed when they are
// finished.
execution *execInfo
// execLock protects access to the execution field
execLock sync.Mutex
}
// New will create a new Autopilot instance utilizing the given Raft and Delegate.
// If the WithPromoter option is not provided the default StablePromoter will
// be used.
func New(raft Raft, delegate ApplicationIntegration, options ...Option) *Autopilot {
a := &Autopilot{
raft: raft,
delegate: delegate,
state: &State{},
promoter: DefaultPromoter(),
logger: hclog.Default().Named("autopilot"),
// should this be buffered?
removeDeadCh: make(chan struct{}, 1),
reconcileInterval: DefaultReconcileInterval,
updateInterval: DefaultUpdateInterval,
time: &runtimeTimeProvider{},
leaderLock: newMutex(),
}
for _, opt := range options {
opt(a)
}
return a
}
// RemoveDeadServers will trigger an immediate removal of dead/failed servers.
func (a *Autopilot) RemoveDeadServers() {
select {
case a.removeDeadCh <- struct{}{}:
default:
}
}
// GetState retrieves the current autopilot State
func (a *Autopilot) GetState() *State {
a.stateLock.Lock()
defer a.stateLock.Unlock()
return a.state
}
// GetServerHealth returns the latest ServerHealth for a given server.
// The returned struct should not be modified or else it will im
func (a *Autopilot) GetServerHealth(id raft.ServerID) *ServerHealth {
state := a.GetState()
srv, ok := state.Servers[id]
if ok {
return &srv.Health
}
return nil
}