mirror of https://github.com/status-im/consul.git
Merge remote-tracking branch 'upstream/master'
This commit is contained in: commit fe61650ef0
@@ -32,6 +32,10 @@ FEATURES:
  quorum. This version also provides a foundation for new features that will
  appear in future Consul versions once the remainder of the v2 library is
  complete. [GH-2222]
* Added a new `consul operator` command, HTTP endpoint, and associated ACL to
  allow Consul operators to view and update the Raft configuration. This allows
  a stale server to be removed without requiring downtime or use of the
  peers.json recovery file. [GH-2312]
* Extended the [`translate_wan_addrs`](https://www.consul.io/docs/agent/options.html#translate_wan_addrs)
  config option to also translate node addresses in HTTP responses, making it easy
  to use this feature from non-DNS clients. [GH-2118]
@@ -54,6 +58,9 @@ BACKWARDS INCOMPATIBILITIES:
* `skip_leave_on_interrupt`'s default behavior now depends on whether the
  agent is acting as a server or a client. When Consul is started as a server
  the default is `true`; when started as a client it is `false`. [GH-1909]
* `allow_stale` for DNS queries now defaults to `true`, allowing for better
  utilization of available Consul servers and higher throughput at the expense
  of weaker consistency. [GH-2315]
* HTTP check output is truncated to 4k, similar to script check output. [GH-1952]
* Consul's Go API client will now send ACL tokens using HTTP headers instead of
  query parameters, requiring Consul 0.6.0 or later. [GH-2233]
@@ -25,8 +25,7 @@ Consul provides several key features:
* **Multi-Datacenter** - Consul is built to be datacenter aware, and can
  support any number of regions without complex configuration.

-Consul runs on Linux, Mac OS X, and Windows. It is recommended to run the
-Consul servers only on Linux, however.
+Consul runs on Linux, Mac OS X, FreeBSD, Solaris, and Windows.

## Quick Start
@@ -56,7 +55,7 @@ $ bin/consul
...
```

-*note: `make` will also place a copy of the binary in the first part of your $GOPATH*
+*Note: `make` will also place a copy of the binary in the first part of your `$GOPATH`.*

You can run tests by typing `make test`.
@@ -85,4 +84,4 @@ with MinGW.
## Vendoring

Consul currently uses [govendor](https://github.com/kardianos/govendor) for
vendoring.
acl/acl.go

@@ -73,6 +73,14 @@ type ACL interface {
    // KeyringWrite determines if the keyring can be manipulated
    KeyringWrite() bool

    // OperatorRead determines if the read-only Consul operator functions
    // can be used.
    OperatorRead() bool

    // OperatorWrite determines if the state-changing Consul operator
    // functions can be used.
    OperatorWrite() bool

    // ACLList checks for permission to list all the ACLs
    ACLList() bool
@@ -132,6 +140,14 @@ func (s *StaticACL) KeyringWrite() bool {
    return s.defaultAllow
}

func (s *StaticACL) OperatorRead() bool {
    return s.defaultAllow
}

func (s *StaticACL) OperatorWrite() bool {
    return s.defaultAllow
}

func (s *StaticACL) ACLList() bool {
    return s.allowManage
}
@@ -188,10 +204,13 @@ type PolicyACL struct {
    // preparedQueryRules contains the prepared query policies
    preparedQueryRules *radix.Tree

-   // keyringRules contains the keyring policies. The keyring has
+   // keyringRule contains the keyring policies. The keyring has
    // a very simple yes/no without prefix matching, so here we
    // don't need to use a radix tree.
    keyringRule string

    // operatorRule contains the operator policies.
    operatorRule string
}

// New is used to construct a policy based ACL from a set of policies
@@ -228,6 +247,9 @@ func New(parent ACL, policy *Policy) (*PolicyACL, error) {
    // Load the keyring policy
    p.keyringRule = policy.Keyring

    // Load the operator policy
    p.operatorRule = policy.Operator

    return p, nil
}
@@ -422,6 +444,27 @@ func (p *PolicyACL) KeyringWrite() bool {
    return p.parent.KeyringWrite()
}

// OperatorRead determines if the read-only operator functions are allowed.
func (p *PolicyACL) OperatorRead() bool {
    switch p.operatorRule {
    case PolicyRead, PolicyWrite:
        return true
    case PolicyDeny:
        return false
    default:
        return p.parent.OperatorRead()
    }
}

// OperatorWrite determines if the state-changing operator functions are
// allowed.
func (p *PolicyACL) OperatorWrite() bool {
    if p.operatorRule == PolicyWrite {
        return true
    }
    return p.parent.OperatorWrite()
}

// ACLList checks if listing of ACLs is allowed
func (p *PolicyACL) ACLList() bool {
    return p.parent.ACLList()
@@ -65,6 +65,12 @@ func TestStaticACL(t *testing.T) {
    if !all.KeyringWrite() {
        t.Fatalf("should allow")
    }
    if !all.OperatorRead() {
        t.Fatalf("should allow")
    }
    if !all.OperatorWrite() {
        t.Fatalf("should allow")
    }
    if all.ACLList() {
        t.Fatalf("should not allow")
    }
@@ -108,6 +114,12 @@ func TestStaticACL(t *testing.T) {
    if none.KeyringWrite() {
        t.Fatalf("should not allow")
    }
    if none.OperatorRead() {
        t.Fatalf("should not allow")
    }
    if none.OperatorWrite() {
        t.Fatalf("should not allow")
    }
    if none.ACLList() {
        t.Fatalf("should not allow")
    }
@@ -145,6 +157,12 @@ func TestStaticACL(t *testing.T) {
    if !manage.KeyringWrite() {
        t.Fatalf("should allow")
    }
    if !manage.OperatorRead() {
        t.Fatalf("should allow")
    }
    if !manage.OperatorWrite() {
        t.Fatalf("should allow")
    }
    if !manage.ACLList() {
        t.Fatalf("should allow")
    }
@@ -480,19 +498,18 @@ func TestPolicyACL_Parent(t *testing.T) {
}

func TestPolicyACL_Keyring(t *testing.T) {
-   // Test keyring ACLs
    type keyringcase struct {
        inp   string
        read  bool
        write bool
    }
-   keyringcases := []keyringcase{
+   cases := []keyringcase{
        {"", false, false},
        {PolicyRead, true, false},
        {PolicyWrite, true, true},
        {PolicyDeny, false, false},
    }
-   for _, c := range keyringcases {
+   for _, c := range cases {
        acl, err := New(DenyAll(), &Policy{Keyring: c.inp})
        if err != nil {
            t.Fatalf("bad: %s", err)
@@ -505,3 +522,29 @@ func TestPolicyACL_Keyring(t *testing.T) {
        }
    }
}

func TestPolicyACL_Operator(t *testing.T) {
    type operatorcase struct {
        inp   string
        read  bool
        write bool
    }
    cases := []operatorcase{
        {"", false, false},
        {PolicyRead, true, false},
        {PolicyWrite, true, true},
        {PolicyDeny, false, false},
    }
    for _, c := range cases {
        acl, err := New(DenyAll(), &Policy{Operator: c.inp})
        if err != nil {
            t.Fatalf("bad: %s", err)
        }
        if acl.OperatorRead() != c.read {
            t.Fatalf("bad: %#v", c)
        }
        if acl.OperatorWrite() != c.write {
            t.Fatalf("bad: %#v", c)
        }
    }
}
@@ -21,6 +21,7 @@ type Policy struct {
    Events          []*EventPolicy         `hcl:"event,expand"`
    PreparedQueries []*PreparedQueryPolicy `hcl:"query,expand"`
    Keyring         string                 `hcl:"keyring"`
    Operator        string                 `hcl:"operator"`
}

// KeyPolicy represents a policy for a key
@@ -125,5 +126,10 @@ func Parse(rules string) (*Policy, error) {
        return nil, fmt.Errorf("Invalid keyring policy: %#v", p.Keyring)
    }

    // Validate the operator policy - this one is allowed to be empty
    if p.Operator != "" && !isPolicyValid(p.Operator) {
        return nil, fmt.Errorf("Invalid operator policy: %#v", p.Operator)
    }

    return p, nil
}
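To see how the parsing and enforcement halves of the new `operator` rule fit together, here is a hedged sketch; the `main` wrapper and the sample rule string are illustrative assumptions, while `Parse`, `New`, `DenyAll`, and the `Operator*` methods are the ones touched by this change:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/consul/acl"
)

func main() {
	// Parse a policy that grants read-only access to operator endpoints.
	policy, err := acl.Parse(`operator = "read"`)
	if err != nil {
		log.Fatal(err)
	}

	// Build an ACL from the policy, falling back to deny-all for
	// anything the policy doesn't cover.
	a, err := acl.New(acl.DenyAll(), policy)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(a.OperatorRead())  // true
	fmt.Println(a.OperatorWrite()) // false
}
```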
@ -45,6 +45,7 @@ query "bar" {
|
|||
policy = "deny"
|
||||
}
|
||||
keyring = "deny"
|
||||
operator = "deny"
|
||||
`
|
||||
exp := &Policy{
|
||||
Keys: []*KeyPolicy{
|
||||
|
@ -103,7 +104,8 @@ keyring = "deny"
|
|||
Policy: PolicyDeny,
|
||||
},
|
||||
},
|
||||
Keyring: PolicyDeny,
|
||||
Keyring: PolicyDeny,
|
||||
Operator: PolicyDeny,
|
||||
}
|
||||
|
||||
out, err := Parse(inp)
|
||||
|
@@ -162,7 +164,8 @@ func TestACLPolicy_Parse_JSON(t *testing.T) {
        "policy": "deny"
    }
},
-"keyring": "deny"
+"keyring": "deny",
+"operator": "deny"
}`
    exp := &Policy{
        Keys: []*KeyPolicy{
@@ -221,7 +224,8 @@ func TestACLPolicy_Parse_JSON(t *testing.T) {
                Policy: PolicyDeny,
            },
        },
-       Keyring: PolicyDeny,
+       Keyring:  PolicyDeny,
+       Operator: PolicyDeny,
    }

    out, err := Parse(inp)
@ -252,6 +256,24 @@ keyring = ""
|
|||
}
|
||||
}
|
||||
|
||||
func TestACLPolicy_Operator_Empty(t *testing.T) {
|
||||
inp := `
|
||||
operator = ""
|
||||
`
|
||||
exp := &Policy{
|
||||
Operator: "",
|
||||
}
|
||||
|
||||
out, err := Parse(inp)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(out, exp) {
|
||||
t.Fatalf("bad: %#v %#v", out, exp)
|
||||
}
|
||||
}
|
||||
|
||||
func TestACLPolicy_Bad_Policy(t *testing.T) {
|
||||
cases := []string{
|
||||
`key "" { policy = "nope" }`,
|
||||
|
@@ -259,6 +281,7 @@ func TestACLPolicy_Bad_Policy(t *testing.T) {
        `event "" { policy = "nope" }`,
        `query "" { policy = "nope" }`,
        `keyring = "nope"`,
        `operator = "nope"`,
    }
    for _, c := range cases {
        _, err := Parse(c)
@@ -0,0 +1,81 @@
package api

// Operator can be used to perform low-level operator tasks for Consul.
type Operator struct {
    c *Client
}

// Operator returns a handle to the operator endpoints.
func (c *Client) Operator() *Operator {
    return &Operator{c}
}

// RaftServer has information about a server in the Raft configuration.
type RaftServer struct {
    // ID is the unique ID for the server. These are currently the same
    // as the address, but they will be changed to a real GUID in a future
    // release of Consul.
    ID string

    // Node is the node name of the server, as known by Consul, or this
    // will be set to "(unknown)" otherwise.
    Node string

    // Address is the IP:port of the server, used for Raft communications.
    Address string

    // Leader is true if this server is the current cluster leader.
    Leader bool

    // Voter is true if this server has a vote in the cluster. This might
    // be false if the server is staging and still coming online, or if
    // it's a non-voting server, which will be added in a future release of
    // Consul.
    Voter bool
}

// RaftConfiguration is returned when querying for the current Raft configuration.
type RaftConfiguration struct {
    // Servers has the list of servers in the Raft configuration.
    Servers []*RaftServer

    // Index has the Raft index of this configuration.
    Index uint64
}

// RaftGetConfiguration is used to query the current Raft peer set.
func (op *Operator) RaftGetConfiguration(q *QueryOptions) (*RaftConfiguration, error) {
    r := op.c.newRequest("GET", "/v1/operator/raft/configuration")
    r.setQueryOptions(q)
    _, resp, err := requireOK(op.c.doRequest(r))
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    var out RaftConfiguration
    if err := decodeBody(resp, &out); err != nil {
        return nil, err
    }
    return &out, nil
}

// RaftRemovePeerByAddress is used to kick a stale peer (one that is in the Raft
// quorum but no longer known to Serf or the catalog) by address in the form of
// "IP:port".
func (op *Operator) RaftRemovePeerByAddress(address string, q *WriteOptions) error {
    r := op.c.newRequest("DELETE", "/v1/operator/raft/peer")
    r.setWriteOptions(q)

    // TODO (slackpad) Currently we made address a query parameter. Once
    // IDs are in place this will be DELETE /v1/operator/raft/peer/<id>.
    r.params.Set("address", string(address))

    _, resp, err := requireOK(op.c.doRequest(r))
    if err != nil {
        return err
    }

    resp.Body.Close()
    return nil
}
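For orientation, here is a hedged usage sketch of the new client surface; the `main` wrapper and the peer address are illustrative assumptions, while the `Operator` methods are exactly the ones added above:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/consul/api"
)

func main() {
	// Assumes a Consul agent is reachable at the default 127.0.0.1:8500.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	operator := client.Operator()

	// Inspect the current Raft configuration.
	config, err := operator.RaftGetConfiguration(nil)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range config.Servers {
		fmt.Printf("%s %s leader=%v voter=%v\n", s.Node, s.Address, s.Leader, s.Voter)
	}

	// Kick a stale peer by its Raft address (hypothetical address).
	if err := operator.RaftRemovePeerByAddress("10.0.0.9:8300", nil); err != nil {
		log.Fatal(err)
	}
}
```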
@@ -0,0 +1,38 @@
package api

import (
    "strings"
    "testing"
)

func TestOperator_RaftGetConfiguration(t *testing.T) {
    t.Parallel()
    c, s := makeClient(t)
    defer s.Stop()

    operator := c.Operator()
    out, err := operator.RaftGetConfiguration(nil)
    if err != nil {
        t.Fatalf("err: %v", err)
    }
    if len(out.Servers) != 1 ||
        !out.Servers[0].Leader ||
        !out.Servers[0].Voter {
        t.Fatalf("bad: %v", out)
    }
}

func TestOperator_RaftRemovePeerByAddress(t *testing.T) {
    t.Parallel()
    c, s := makeClient(t)
    defer s.Stop()

    // If we get this error, it proves we sent the address all the way
    // through.
    operator := c.Operator()
    err := operator.RaftRemovePeerByAddress("nope", nil)
    if err == nil || !strings.Contains(err.Error(),
        "address \"nope\" was not found in the Raft configuration") {
        t.Fatalf("err: %v", err)
    }
}
@@ -68,7 +68,7 @@ type DNSConfig struct {
    // data. This gives horizontal read scalability since
    // any Consul server can service the query instead of
    // only the leader.
-   AllowStale bool `mapstructure:"allow_stale"`
+   AllowStale *bool `mapstructure:"allow_stale"`

    // EnableTruncate is used to enable setting the truncate
    // flag for UDP DNS queries. This allows unmodified
@@ -651,6 +651,7 @@ func DefaultConfig() *Config {
        Server: 8300,
    },
    DNSConfig: DNSConfig{
        AllowStale:      Bool(true),
        UDPAnswerLimit:  3,
        MaxStale:        5 * time.Second,
        RecursorTimeout: 2 * time.Second,
@@ -1351,8 +1352,8 @@ func MergeConfig(a, b *Config) *Config {
            result.DNSConfig.ServiceTTL[service] = dur
        }
    }
-   if b.DNSConfig.AllowStale {
-       result.DNSConfig.AllowStale = true
+   if b.DNSConfig.AllowStale != nil {
+       result.DNSConfig.AllowStale = b.DNSConfig.AllowStale
    }
    if b.DNSConfig.UDPAnswerLimit != 0 {
        result.DNSConfig.UDPAnswerLimit = b.DNSConfig.UDPAnswerLimit
@@ -544,13 +544,13 @@ func TestDecodeConfig(t *testing.T) {
    }

    // DNS node ttl, max stale
-   input = `{"dns_config": {"allow_stale": true, "enable_truncate": false, "max_stale": "15s", "node_ttl": "5s", "only_passing": true, "udp_answer_limit": 6, "recursor_timeout": "7s"}}`
+   input = `{"dns_config": {"allow_stale": false, "enable_truncate": false, "max_stale": "15s", "node_ttl": "5s", "only_passing": true, "udp_answer_limit": 6, "recursor_timeout": "7s"}}`
    config, err = DecodeConfig(bytes.NewReader([]byte(input)))
    if err != nil {
        t.Fatalf("err: %s", err)
    }

-   if !config.DNSConfig.AllowStale {
+   if *config.DNSConfig.AllowStale {
        t.Fatalf("bad: %#v", config)
    }
    if config.DNSConfig.EnableTruncate {
@@ -1411,7 +1411,7 @@ func TestMergeConfig(t *testing.T) {
    DataDir:      "/tmp/bar",
    DNSRecursors: []string{"127.0.0.2:1001"},
    DNSConfig: DNSConfig{
-       AllowStale:         false,
+       AllowStale:         Bool(false),
        EnableTruncate:     true,
        DisableCompression: true,
        MaxStale:           30 * time.Second,
@@ -198,7 +198,7 @@ func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) {
    Datacenter: datacenter,
    QueryOptions: structs.QueryOptions{
        Token:      d.agent.config.ACLToken,
-       AllowStale: d.config.AllowStale,
+       AllowStale: *d.config.AllowStale,
    },
}
var out structs.IndexedNodes
@@ -384,7 +384,7 @@ func (d *DNSServer) nodeLookup(network, datacenter, node string, req, resp *dns.Msg) {
    Node: node,
    QueryOptions: structs.QueryOptions{
        Token:      d.agent.config.ACLToken,
-       AllowStale: d.config.AllowStale,
+       AllowStale: *d.config.AllowStale,
    },
}
var out structs.IndexedNodeServices
@@ -584,7 +584,7 @@ func (d *DNSServer) serviceLookup(network, datacenter, service, tag string, req, resp *dns.Msg) {
    TagFilter: tag != "",
    QueryOptions: structs.QueryOptions{
        Token:      d.agent.config.ACLToken,
-       AllowStale: d.config.AllowStale,
+       AllowStale: *d.config.AllowStale,
    },
}
var out structs.IndexedCheckServiceNodes
@@ -658,7 +658,7 @@ func (d *DNSServer) preparedQueryLookup(network, datacenter, query string, req, resp *dns.Msg) {
    QueryIDOrName: query,
    QueryOptions: structs.QueryOptions{
        Token:      d.agent.config.ACLToken,
-       AllowStale: d.config.AllowStale,
+       AllowStale: *d.config.AllowStale,
    },

    // Always pass the local agent through. In the DNS interface, there
@@ -2349,7 +2349,7 @@ func TestDNS_NodeLookup_TTL(t *testing.T) {
    c.DNSRecursor = recursor.Addr
}, func(c *DNSConfig) {
    c.NodeTTL = 10 * time.Second
-   c.AllowStale = true
+   *c.AllowStale = true
    c.MaxStale = time.Second
})
defer os.RemoveAll(dir)
@@ -2469,7 +2469,7 @@ func TestDNS_ServiceLookup_TTL(t *testing.T) {
        "db": 10 * time.Second,
        "*":  5 * time.Second,
    }
-   c.AllowStale = true
+   *c.AllowStale = true
    c.MaxStale = time.Second
}
dir, srv := makeDNSServerConfig(t, nil, confFn)
@@ -2572,7 +2572,7 @@ func TestDNS_PreparedQuery_TTL(t *testing.T) {
        "db": 10 * time.Second,
        "*":  5 * time.Second,
    }
-   c.AllowStale = true
+   *c.AllowStale = true
    c.MaxStale = time.Second
}
dir, srv := makeDNSServerConfig(t, nil, confFn)
@@ -3233,7 +3233,7 @@ func TestDNS_NonExistingLookupEmptyAorAAAA(t *testing.T) {

func TestDNS_PreparedQuery_AllowStale(t *testing.T) {
    confFn := func(c *DNSConfig) {
-       c.AllowStale = true
+       *c.AllowStale = true
        c.MaxStale = time.Second
    }
    dir, srv := makeDNSServerConfig(t, nil, confFn)
@@ -230,6 +230,9 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
    s.handleFuncMetrics("/v1/status/leader", s.wrap(s.StatusLeader))
    s.handleFuncMetrics("/v1/status/peers", s.wrap(s.StatusPeers))

    s.handleFuncMetrics("/v1/operator/raft/configuration", s.wrap(s.OperatorRaftConfiguration))
    s.handleFuncMetrics("/v1/operator/raft/peer", s.wrap(s.OperatorRaftPeer))

    s.handleFuncMetrics("/v1/catalog/register", s.wrap(s.CatalogRegister))
    s.handleFuncMetrics("/v1/catalog/deregister", s.wrap(s.CatalogDeregister))
    s.handleFuncMetrics("/v1/catalog/datacenters", s.wrap(s.CatalogDatacenters))
@@ -0,0 +1,57 @@
package agent

import (
    "net/http"

    "github.com/hashicorp/consul/consul/structs"
    "github.com/hashicorp/raft"
)

// OperatorRaftConfiguration is used to inspect the current Raft configuration.
// This supports the stale query mode in case the cluster doesn't have a leader.
func (s *HTTPServer) OperatorRaftConfiguration(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
    if req.Method != "GET" {
        resp.WriteHeader(http.StatusMethodNotAllowed)
        return nil, nil
    }

    var args structs.DCSpecificRequest
    if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
        return nil, nil
    }

    var reply structs.RaftConfigurationResponse
    if err := s.agent.RPC("Operator.RaftGetConfiguration", &args, &reply); err != nil {
        return nil, err
    }

    return reply, nil
}

// OperatorRaftPeer supports actions on Raft peers. Currently we only support
// removing peers by address.
func (s *HTTPServer) OperatorRaftPeer(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
    if req.Method != "DELETE" {
        resp.WriteHeader(http.StatusMethodNotAllowed)
        return nil, nil
    }

    var args structs.RaftPeerByAddressRequest
    s.parseDC(req, &args.Datacenter)
    s.parseToken(req, &args.Token)

    params := req.URL.Query()
    if _, ok := params["address"]; ok {
        args.Address = raft.ServerAddress(params.Get("address"))
    } else {
        resp.WriteHeader(http.StatusBadRequest)
        resp.Write([]byte("Must specify ?address with IP:port of peer to remove"))
        return nil, nil
    }

    var reply struct{}
    if err := s.agent.RPC("Operator.RaftRemovePeerByAddress", &args, &reply); err != nil {
        return nil, err
    }
    return nil, nil
}
@@ -0,0 +1,58 @@
package agent

import (
    "bytes"
    "net/http"
    "net/http/httptest"
    "strings"
    "testing"

    "github.com/hashicorp/consul/consul/structs"
)

func TestOperator_OperatorRaftConfiguration(t *testing.T) {
    httpTest(t, func(srv *HTTPServer) {
        body := bytes.NewBuffer(nil)
        req, err := http.NewRequest("GET", "/v1/operator/raft/configuration", body)
        if err != nil {
            t.Fatalf("err: %v", err)
        }

        resp := httptest.NewRecorder()
        obj, err := srv.OperatorRaftConfiguration(resp, req)
        if err != nil {
            t.Fatalf("err: %v", err)
        }
        if resp.Code != 200 {
            t.Fatalf("bad code: %d", resp.Code)
        }
        out, ok := obj.(structs.RaftConfigurationResponse)
        if !ok {
            t.Fatalf("unexpected: %T", obj)
        }
        if len(out.Servers) != 1 ||
            !out.Servers[0].Leader ||
            !out.Servers[0].Voter {
            t.Fatalf("bad: %v", out)
        }
    })
}

func TestOperator_OperatorRaftPeer(t *testing.T) {
    httpTest(t, func(srv *HTTPServer) {
        body := bytes.NewBuffer(nil)
        req, err := http.NewRequest("DELETE", "/v1/operator/raft/peer?address=nope", body)
        if err != nil {
            t.Fatalf("err: %v", err)
        }

        // If we get this error, it proves we sent the address all the
        // way through.
        resp := httptest.NewRecorder()
        _, err = srv.OperatorRaftPeer(resp, req)
        if err == nil || !strings.Contains(err.Error(),
            "address \"nope\" was not found in the Raft configuration") {
            t.Fatalf("err: %v", err)
        }
    })
}
@@ -0,0 +1,173 @@
package command

import (
    "flag"
    "fmt"
    "strings"

    "github.com/hashicorp/consul/api"
    "github.com/mitchellh/cli"
    "github.com/ryanuber/columnize"
)

// OperatorCommand is used to provide various low-level tools for Consul
// operators.
type OperatorCommand struct {
    Ui cli.Ui
}

func (c *OperatorCommand) Help() string {
    helpText := `
Usage: consul operator <subcommand> [common options] [action] [options]

  Provides cluster-level tools for Consul operators, such as interacting with
  the Raft subsystem. NOTE: Use this command with extreme caution, as improper
  use could lead to a Consul outage and even loss of data.

  If ACLs are enabled then a token with operator privileges may be required in
  order to use this command. Requests are forwarded internally to the leader
  if required, so this can be run from any Consul node in a cluster.

  Run consul operator <subcommand> with no arguments for help on that
  subcommand.

Common Options:

  -http-addr=127.0.0.1:8500  HTTP address of the Consul agent.
  -token=""                  ACL token to use. Defaults to that of agent.

Subcommands:

  raft  View and modify Consul's Raft configuration.
`
    return strings.TrimSpace(helpText)
}

func (c *OperatorCommand) Run(args []string) int {
    if len(args) < 1 {
        c.Ui.Error("A subcommand must be specified")
        c.Ui.Error("")
        c.Ui.Error(c.Help())
        return 1
    }

    var err error
    subcommand := args[0]
    switch subcommand {
    case "raft":
        err = c.raft(args[1:])
    default:
        err = fmt.Errorf("unknown subcommand %q", subcommand)
    }

    if err != nil {
        c.Ui.Error(fmt.Sprintf("Operator %q subcommand failed: %v", subcommand, err))
        return 1
    }
    return 0
}

// Synopsis returns a one-line description of this command.
func (c *OperatorCommand) Synopsis() string {
    return "Provides cluster-level tools for Consul operators"
}

const raftHelp = `
Raft Subcommand Actions:

  raft -list-peers -stale=[true|false]

    Displays the current Raft peer configuration.

    The -stale argument defaults to "false" which means the leader provides the
    result. If the cluster is in an outage state without a leader, you may need
    to set -stale to "true" to get the configuration from a non-leader server.

  raft -remove-peer -address="IP:port"

    Removes the Consul server with the given -address from the Raft configuration.

    There are rare cases where a peer may be left behind in the Raft quorum even
    though the server is no longer present and known to the cluster. This
    command can be used to remove the failed server so that it no longer
    affects the Raft quorum. If the server still shows in the output of the
    "consul members" command, it is preferable to clean up by simply running
    "consul force-leave" instead of this command.
`

// raft handles the raft subcommands.
func (c *OperatorCommand) raft(args []string) error {
    cmdFlags := flag.NewFlagSet("raft", flag.ContinueOnError)
    cmdFlags.Usage = func() { c.Ui.Output(c.Help()) }

    // Parse verb arguments.
    var listPeers, removePeer bool
    cmdFlags.BoolVar(&listPeers, "list-peers", false, "")
    cmdFlags.BoolVar(&removePeer, "remove-peer", false, "")

    // Parse other arguments.
    var stale bool
    var address, token string
    cmdFlags.StringVar(&address, "address", "", "")
    cmdFlags.BoolVar(&stale, "stale", false, "")
    cmdFlags.StringVar(&token, "token", "", "")
    httpAddr := HTTPAddrFlag(cmdFlags)
    if err := cmdFlags.Parse(args); err != nil {
        return err
    }

    // Set up a client.
    conf := api.DefaultConfig()
    conf.Address = *httpAddr
    client, err := api.NewClient(conf)
    if err != nil {
        return fmt.Errorf("error connecting to Consul agent: %s", err)
    }
    operator := client.Operator()

    // Dispatch based on the verb argument.
    if listPeers {
        // Fetch the current configuration.
        q := &api.QueryOptions{
            AllowStale: stale,
            Token:      token,
        }
        reply, err := operator.RaftGetConfiguration(q)
        if err != nil {
            return err
        }

        // Format it as a nice table.
        result := []string{"Node|ID|Address|State|Voter"}
        for _, s := range reply.Servers {
            state := "follower"
            if s.Leader {
                state = "leader"
            }
            result = append(result, fmt.Sprintf("%s|%s|%s|%s|%v",
                s.Node, s.ID, s.Address, state, s.Voter))
        }
        c.Ui.Output(columnize.SimpleFormat(result))
    } else if removePeer {
        // TODO (slackpad) Once we expose IDs, add support for removing
        // peers by ID as well.
        if len(address) == 0 {
            return fmt.Errorf("an address is required for the peer to remove")
        }

        // Try to kick the peer.
        w := &api.WriteOptions{
            Token: token,
        }
        if err := operator.RaftRemovePeerByAddress(address, w); err != nil {
            return err
        }
        c.Ui.Output(fmt.Sprintf("Removed peer with address %q", address))
    } else {
        c.Ui.Output(c.Help())
        c.Ui.Output("")
        c.Ui.Output(strings.TrimSpace(raftHelp))
    }

    return nil
}
@@ -0,0 +1,52 @@
package command

import (
    "strings"
    "testing"

    "github.com/mitchellh/cli"
)

func TestOperator_Implements(t *testing.T) {
    var _ cli.Command = &OperatorCommand{}
}

func TestOperator_Raft_ListPeers(t *testing.T) {
    a1 := testAgent(t)
    defer a1.Shutdown()
    waitForLeader(t, a1.httpAddr)

    ui := new(cli.MockUi)
    c := &OperatorCommand{Ui: ui}
    args := []string{"raft", "-http-addr=" + a1.httpAddr, "-list-peers"}

    code := c.Run(args)
    if code != 0 {
        t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
    }
    output := strings.TrimSpace(ui.OutputWriter.String())
    if !strings.Contains(output, "leader") {
        t.Fatalf("bad: %s", output)
    }
}

func TestOperator_Raft_RemovePeer(t *testing.T) {
    a1 := testAgent(t)
    defer a1.Shutdown()
    waitForLeader(t, a1.httpAddr)

    ui := new(cli.MockUi)
    c := &OperatorCommand{Ui: ui}
    args := []string{"raft", "-http-addr=" + a1.httpAddr, "-remove-peer", "-address=nope"}

    code := c.Run(args)
    if code != 1 {
        t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
    }

    // If we get this error, it proves we sent the address all the way through.
    output := strings.TrimSpace(ui.ErrorWriter.String())
    if !strings.Contains(output, "address \"nope\" was not found in the Raft configuration") {
        t.Fatalf("bad: %s", output)
    }
}
@@ -103,6 +103,12 @@ func init() {
        }, nil
    },

    "operator": func() (cli.Command, error) {
        return &command.OperatorCommand{
            Ui: ui,
        }, nil
    },

    "info": func() (cli.Command, error) {
        return &command.InfoCommand{
            Ui: ui,
@@ -0,0 +1,127 @@
package consul

import (
    "fmt"
    "net"

    "github.com/hashicorp/consul/consul/agent"
    "github.com/hashicorp/consul/consul/structs"
    "github.com/hashicorp/raft"
    "github.com/hashicorp/serf/serf"
)

// Operator endpoint is used to perform low-level operator tasks for Consul.
type Operator struct {
    srv *Server
}

// RaftGetConfiguration is used to retrieve the current Raft configuration.
func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply *structs.RaftConfigurationResponse) error {
    if done, err := op.srv.forward("Operator.RaftGetConfiguration", args, args, reply); done {
        return err
    }

    // This action requires operator read access.
    acl, err := op.srv.resolveToken(args.Token)
    if err != nil {
        return err
    }
    if acl != nil && !acl.OperatorRead() {
        return permissionDeniedErr
    }

    // We can't fetch the leader and the configuration atomically with
    // the current Raft API.
    future := op.srv.raft.GetConfiguration()
    if err := future.Error(); err != nil {
        return err
    }

    // Index the Consul information about the servers.
    serverMap := make(map[raft.ServerAddress]serf.Member)
    for _, member := range op.srv.serfLAN.Members() {
        valid, parts := agent.IsConsulServer(member)
        if !valid {
            continue
        }

        addr := (&net.TCPAddr{IP: member.Addr, Port: parts.Port}).String()
        serverMap[raft.ServerAddress(addr)] = member
    }

    // Fill out the reply.
    leader := op.srv.raft.Leader()
    reply.Index = future.Index()
    for _, server := range future.Configuration().Servers {
        node := "(unknown)"
        if member, ok := serverMap[server.Address]; ok {
            node = member.Name
        }

        entry := &structs.RaftServer{
            ID:      server.ID,
            Node:    node,
            Address: server.Address,
            Leader:  server.Address == leader,
            Voter:   server.Suffrage == raft.Voter,
        }
        reply.Servers = append(reply.Servers, entry)
    }
    return nil
}

// RaftRemovePeerByAddress is used to kick a stale peer (one that is in the Raft
// quorum but no longer known to Serf or the catalog) by address in the form of
// "IP:port". The reply argument is not used, but is required to fulfill the RPC
// interface.
func (op *Operator) RaftRemovePeerByAddress(args *structs.RaftPeerByAddressRequest, reply *struct{}) error {
    if done, err := op.srv.forward("Operator.RaftRemovePeerByAddress", args, args, reply); done {
        return err
    }

    // This is a super dangerous operation that requires operator write
    // access.
    acl, err := op.srv.resolveToken(args.Token)
    if err != nil {
        return err
    }
    if acl != nil && !acl.OperatorWrite() {
        return permissionDeniedErr
    }

    // Since this is an operation designed for humans to use, we will return
    // an error if the supplied address isn't among the peers since it's
    // likely they screwed up.
    {
        future := op.srv.raft.GetConfiguration()
        if err := future.Error(); err != nil {
            return err
        }
        for _, s := range future.Configuration().Servers {
            if s.Address == args.Address {
                goto REMOVE
            }
        }
        return fmt.Errorf("address %q was not found in the Raft configuration",
            args.Address)
    }

REMOVE:
    // The Raft library itself will prevent various forms of foot-shooting,
    // like making a configuration with no voters. Some consideration was
    // given here to adding more checks, but it was decided to make this as
    // low-level and direct as possible. We've got ACL coverage to lock this
    // down, and if you are an operator, it's assumed you know what you are
    // doing if you are calling this. If you remove a peer that's known to
    // Serf, for example, it will come back when the leader does a reconcile
    // pass.
    future := op.srv.raft.RemovePeer(args.Address)
    if err := future.Error(); err != nil {
        op.srv.logger.Printf("[WARN] consul.operator: Failed to remove Raft peer %q: %v",
            args.Address, err)
        return err
    }

    op.srv.logger.Printf("[WARN] consul.operator: Removed Raft peer %q", args.Address)
    return nil
}
@@ -0,0 +1,245 @@
package consul

import (
    "fmt"
    "os"
    "reflect"
    "strings"
    "testing"

    "github.com/hashicorp/consul/consul/structs"
    "github.com/hashicorp/consul/testutil"
    "github.com/hashicorp/net-rpc-msgpackrpc"
    "github.com/hashicorp/raft"
)

func TestOperator_RaftGetConfiguration(t *testing.T) {
    dir1, s1 := testServer(t)
    defer os.RemoveAll(dir1)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    defer codec.Close()

    testutil.WaitForLeader(t, s1.RPC, "dc1")

    arg := structs.DCSpecificRequest{
        Datacenter: "dc1",
    }
    var reply structs.RaftConfigurationResponse
    if err := msgpackrpc.CallWithCodec(codec, "Operator.RaftGetConfiguration", &arg, &reply); err != nil {
        t.Fatalf("err: %v", err)
    }

    future := s1.raft.GetConfiguration()
    if err := future.Error(); err != nil {
        t.Fatalf("err: %v", err)
    }
    if len(future.Configuration().Servers) != 1 {
        t.Fatalf("bad: %v", future.Configuration().Servers)
    }
    me := future.Configuration().Servers[0]
    expected := structs.RaftConfigurationResponse{
        Servers: []*structs.RaftServer{
            &structs.RaftServer{
                ID:      me.ID,
                Node:    s1.config.NodeName,
                Address: me.Address,
                Leader:  true,
                Voter:   true,
            },
        },
        Index: future.Index(),
    }
    if !reflect.DeepEqual(reply, expected) {
        t.Fatalf("bad: %v", reply)
    }
}

func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) {
    dir1, s1 := testServerWithConfig(t, func(c *Config) {
        c.ACLDatacenter = "dc1"
        c.ACLMasterToken = "root"
        c.ACLDefaultPolicy = "deny"
    })
    defer os.RemoveAll(dir1)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    defer codec.Close()

    testutil.WaitForLeader(t, s1.RPC, "dc1")

    // Make a request with no token to make sure it gets denied.
    arg := structs.DCSpecificRequest{
        Datacenter: "dc1",
    }
    var reply structs.RaftConfigurationResponse
    err := msgpackrpc.CallWithCodec(codec, "Operator.RaftGetConfiguration", &arg, &reply)
    if err == nil || !strings.Contains(err.Error(), permissionDenied) {
        t.Fatalf("err: %v", err)
    }

    // Create an ACL with operator read permissions.
    var token string
    {
        var rules = `
operator = "read"
`

        req := structs.ACLRequest{
            Datacenter: "dc1",
            Op:         structs.ACLSet,
            ACL: structs.ACL{
                Name:  "User token",
                Type:  structs.ACLTypeClient,
                Rules: rules,
            },
            WriteRequest: structs.WriteRequest{Token: "root"},
        }
        if err := msgpackrpc.CallWithCodec(codec, "ACL.Apply", &req, &token); err != nil {
            t.Fatalf("err: %v", err)
        }
    }

    // Now it should go through.
    arg.Token = token
    if err := msgpackrpc.CallWithCodec(codec, "Operator.RaftGetConfiguration", &arg, &reply); err != nil {
        t.Fatalf("err: %v", err)
    }

    future := s1.raft.GetConfiguration()
    if err := future.Error(); err != nil {
        t.Fatalf("err: %v", err)
    }
    if len(future.Configuration().Servers) != 1 {
        t.Fatalf("bad: %v", future.Configuration().Servers)
    }
    me := future.Configuration().Servers[0]
    expected := structs.RaftConfigurationResponse{
        Servers: []*structs.RaftServer{
            &structs.RaftServer{
                ID:      me.ID,
                Node:    s1.config.NodeName,
                Address: me.Address,
                Leader:  true,
                Voter:   true,
            },
        },
        Index: future.Index(),
    }
    if !reflect.DeepEqual(reply, expected) {
        t.Fatalf("bad: %v", reply)
    }
}

func TestOperator_RaftRemovePeerByAddress(t *testing.T) {
    dir1, s1 := testServer(t)
    defer os.RemoveAll(dir1)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    defer codec.Close()

    testutil.WaitForLeader(t, s1.RPC, "dc1")

    // Try to remove a peer that's not there.
    arg := structs.RaftPeerByAddressRequest{
        Datacenter: "dc1",
        Address:    raft.ServerAddress(fmt.Sprintf("127.0.0.1:%d", getPort())),
    }
    var reply struct{}
    err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByAddress", &arg, &reply)
    if err == nil || !strings.Contains(err.Error(), "not found in the Raft configuration") {
        t.Fatalf("err: %v", err)
    }

    // Add it manually to Raft.
    {
        future := s1.raft.AddPeer(arg.Address)
        if err := future.Error(); err != nil {
            t.Fatalf("err: %v", err)
        }
    }

    // Make sure it's there.
    {
        future := s1.raft.GetConfiguration()
        if err := future.Error(); err != nil {
            t.Fatalf("err: %v", err)
        }
        configuration := future.Configuration()
        if len(configuration.Servers) != 2 {
            t.Fatalf("bad: %v", configuration)
        }
    }

    // Remove it, now it should go through.
    if err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByAddress", &arg, &reply); err != nil {
        t.Fatalf("err: %v", err)
    }

    // Make sure it's not there.
    {
        future := s1.raft.GetConfiguration()
        if err := future.Error(); err != nil {
            t.Fatalf("err: %v", err)
        }
        configuration := future.Configuration()
        if len(configuration.Servers) != 1 {
            t.Fatalf("bad: %v", configuration)
        }
    }
}

func TestOperator_RaftRemovePeerByAddress_ACLDeny(t *testing.T) {
    dir1, s1 := testServerWithConfig(t, func(c *Config) {
        c.ACLDatacenter = "dc1"
        c.ACLMasterToken = "root"
        c.ACLDefaultPolicy = "deny"
    })
    defer os.RemoveAll(dir1)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    defer codec.Close()

    testutil.WaitForLeader(t, s1.RPC, "dc1")

    // Make a request with no token to make sure it gets denied.
    arg := structs.RaftPeerByAddressRequest{
        Datacenter: "dc1",
        Address:    raft.ServerAddress(s1.config.RPCAddr.String()),
    }
    var reply struct{}
    err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByAddress", &arg, &reply)
    if err == nil || !strings.Contains(err.Error(), permissionDenied) {
        t.Fatalf("err: %v", err)
    }

    // Create an ACL with operator write permissions.
    var token string
    {
        var rules = `
operator = "write"
`

        req := structs.ACLRequest{
            Datacenter: "dc1",
            Op:         structs.ACLSet,
            ACL: structs.ACL{
                Name:  "User token",
                Type:  structs.ACLTypeClient,
                Rules: rules,
            },
            WriteRequest: structs.WriteRequest{Token: "root"},
        }
        if err := msgpackrpc.CallWithCodec(codec, "ACL.Apply", &req, &token); err != nil {
            t.Fatalf("err: %v", err)
        }
    }

    // Now it should kick back for being an invalid config, which means it
    // tried to do the operation.
    arg.Token = token
    err = msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByAddress", &arg, &reply)
    if err == nil || !strings.Contains(err.Error(), "at least one voter") {
        t.Fatalf("err: %v", err)
    }
}
@@ -162,15 +162,16 @@ type Server struct {

// Holds the RPC endpoints
type endpoints struct {
-   Catalog  *Catalog
-   Health   *Health
-   Status   *Status
-   KVS      *KVS
-   Session  *Session
-   Internal *Internal
    ACL           *ACL
+   Catalog       *Catalog
    Coordinate    *Coordinate
+   Health        *Health
+   Internal      *Internal
+   KVS           *KVS
+   Operator      *Operator
    PreparedQuery *PreparedQuery
+   Session       *Session
+   Status        *Status
    Txn           *Txn
}
@@ -496,27 +497,29 @@ func (s *Server) setupRaft() error {

// setupRPC is used to setup the RPC listener
func (s *Server) setupRPC(tlsWrap tlsutil.DCWrapper) error {
    // Create endpoints
-   s.endpoints.Status = &Status{s}
-   s.endpoints.Catalog = &Catalog{s}
-   s.endpoints.Health = &Health{s}
-   s.endpoints.KVS = &KVS{s}
-   s.endpoints.Session = &Session{s}
-   s.endpoints.Internal = &Internal{s}
    s.endpoints.ACL = &ACL{s}
+   s.endpoints.Catalog = &Catalog{s}
    s.endpoints.Coordinate = NewCoordinate(s)
+   s.endpoints.Health = &Health{s}
+   s.endpoints.Internal = &Internal{s}
+   s.endpoints.KVS = &KVS{s}
+   s.endpoints.Operator = &Operator{s}
    s.endpoints.PreparedQuery = &PreparedQuery{s}
+   s.endpoints.Session = &Session{s}
+   s.endpoints.Status = &Status{s}
    s.endpoints.Txn = &Txn{s}

    // Register the handlers
-   s.rpcServer.Register(s.endpoints.Status)
-   s.rpcServer.Register(s.endpoints.Catalog)
-   s.rpcServer.Register(s.endpoints.Health)
-   s.rpcServer.Register(s.endpoints.KVS)
-   s.rpcServer.Register(s.endpoints.Session)
-   s.rpcServer.Register(s.endpoints.Internal)
    s.rpcServer.Register(s.endpoints.ACL)
+   s.rpcServer.Register(s.endpoints.Catalog)
    s.rpcServer.Register(s.endpoints.Coordinate)
+   s.rpcServer.Register(s.endpoints.Health)
+   s.rpcServer.Register(s.endpoints.Internal)
+   s.rpcServer.Register(s.endpoints.KVS)
+   s.rpcServer.Register(s.endpoints.Operator)
    s.rpcServer.Register(s.endpoints.PreparedQuery)
+   s.rpcServer.Register(s.endpoints.Session)
+   s.rpcServer.Register(s.endpoints.Status)
    s.rpcServer.Register(s.endpoints.Txn)

    list, err := net.ListenTCP("tcp", s.config.RPCAddr)
@@ -1248,7 +1248,13 @@ func (s *StateStore) parseCheckServiceNodes(
        return 0, nil, err
    }

-   var results structs.CheckServiceNodes
+   // Special-case the zero return value to nil, since this ends up in
+   // external APIs.
+   if len(services) == 0 {
+       return idx, nil, nil
+   }
+
+   results := make(structs.CheckServiceNodes, 0, len(services))
    for _, sn := range services {
        // Retrieve the node.
        n, err := tx.First("nodes", "id", sn.Node)
@@ -0,0 +1,57 @@
package structs

import (
    "github.com/hashicorp/raft"
)

// RaftServer has information about a server in the Raft configuration.
type RaftServer struct {
    // ID is the unique ID for the server. These are currently the same
    // as the address, but they will be changed to a real GUID in a future
    // release of Consul.
    ID raft.ServerID

    // Node is the node name of the server, as known by Consul, or this
    // will be set to "(unknown)" otherwise.
    Node string

    // Address is the IP:port of the server, used for Raft communications.
    Address raft.ServerAddress

    // Leader is true if this server is the current cluster leader.
    Leader bool

    // Voter is true if this server has a vote in the cluster. This might
    // be false if the server is staging and still coming online, or if
    // it's a non-voting server, which will be added in a future release of
    // Consul.
    Voter bool
}

// RaftConfigurationResponse is returned when querying for the current Raft
// configuration.
type RaftConfigurationResponse struct {
    // Servers has the list of servers in the Raft configuration.
    Servers []*RaftServer

    // Index has the Raft index of this configuration.
    Index uint64
}

// RaftPeerByAddressRequest is used by the Operator endpoint to apply a Raft
// operation on a specific Raft peer by address in the form of "IP:port".
type RaftPeerByAddressRequest struct {
    // Datacenter is the target this request is intended for.
    Datacenter string

    // Address is the peer to remove, in the form "IP:port".
    Address raft.ServerAddress

    // WriteRequest holds the ACL token to go along with this request.
    WriteRequest
}

// RequestDatacenter returns the datacenter for a given request.
func (op *RaftPeerByAddressRequest) RequestDatacenter() string {
    return op.Datacenter
}
@@ -0,0 +1,132 @@
---
layout: "docs"
page_title: "Operator (HTTP)"
sidebar_current: "docs-agent-http-operator"
description: >
  The operator endpoint provides cluster-level tools for Consul operators.
---

# Operator HTTP Endpoint

The Operator endpoint provides cluster-level tools for Consul operators, such
as interacting with the Raft subsystem. This was added in Consul 0.7.

~> Use this interface with extreme caution, as improper use could lead to a Consul
   outage and even loss of data.

If ACLs are enabled then a token with operator privileges may be required in
order to use this interface. See the [ACL](/docs/internals/acl.html#operator)
internals guide for more information.

See the [Outage Recovery](/docs/guides/outage.html) guide for some examples of how
these capabilities are used. For a CLI to perform these operations manually, please
see the documentation for the [`consul operator`](/docs/commands/operator.html)
command.

The following endpoints are supported:

* [`/v1/operator/raft/configuration`](#raft-configuration): Inspects the Raft configuration
* [`/v1/operator/raft/peer`](#raft-peer): Operates on Raft peers

Not all endpoints support blocking queries and all consistency modes;
see details in the sections below.

The operator endpoints support the use of ACL Tokens. See the
[ACL](/docs/internals/acl.html#operator) internals guide for more information.

### <a name="raft-configuration"></a> /v1/operator/raft/configuration

The Raft configuration endpoint supports the `GET` method.

#### GET Method

When using the `GET` method, the request will be forwarded to the cluster
leader to retrieve its latest Raft peer configuration.

If the cluster doesn't currently have a leader, an error will be returned. You
can use the "?stale" query parameter to read the Raft configuration from any
of the Consul servers.

By default, the datacenter of the agent is queried; however, the `dc` can be
provided using the "?dc=" query parameter.

If ACLs are enabled, the client will need to supply an ACL Token with
[`operator`](/docs/internals/acl.html#operator) read privileges.

A JSON body is returned that looks like this:

```javascript
{
  "Servers": [
    {
      "ID": "127.0.0.1:8300",
      "Node": "alice",
      "Address": "127.0.0.1:8300",
      "Leader": true,
      "Voter": true
    },
    {
      "ID": "127.0.0.2:8300",
      "Node": "bob",
      "Address": "127.0.0.2:8300",
      "Leader": false,
      "Voter": true
    },
    {
      "ID": "127.0.0.3:8300",
      "Node": "carol",
      "Address": "127.0.0.3:8300",
      "Leader": false,
      "Voter": true
    }
  ],
  "Index": 22
}
```

The `Servers` array has information about the servers in the Raft peer
configuration:

`ID` is the ID of the server. This is the same as the `Address` in Consul 0.7
but may be upgraded to a GUID in a future version of Consul.

`Node` is the node name of the server, as known to Consul, or "(unknown)" if
the node is stale and not known.

`Address` is the IP:port for the server.

`Leader` is either "true" or "false" depending on the server's role in the
Raft configuration.

`Voter` is "true" or "false", indicating if the server has a vote in the Raft
configuration. Future versions of Consul may add support for non-voting servers.

The `Index` value is the Raft index corresponding to this configuration. Note that
the latest configuration may not yet be committed if changes are in flight.
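For illustration, here is a hedged Go sketch of calling this endpoint directly; the agent address is assumed, and the local response struct below is a convenience that mirrors the documented JSON fields rather than a type from this change:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// Mirrors the documented response fields.
type raftConfiguration struct {
	Servers []struct {
		ID, Node, Address string
		Leader, Voter     bool
	}
	Index uint64
}

func main() {
	// "?stale" lets any server answer if the cluster has no leader.
	resp, err := http.Get("http://127.0.0.1:8500/v1/operator/raft/configuration?stale")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out raftConfiguration
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}
	for _, s := range out.Servers {
		fmt.Printf("%s %s leader=%v voter=%v\n", s.Node, s.Address, s.Leader, s.Voter)
	}
}
```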
### <a name="raft-peer"></a> /v1/operator/raft/peer

The Raft peer endpoint supports the `DELETE` method.

#### DELETE Method

Using the `DELETE` method, this endpoint will remove the Consul server with
the given address from the Raft configuration.

There are rare cases where a peer may be left behind in the Raft configuration
even though the server is no longer present and known to the cluster. This
endpoint can be used to remove the failed server so that it no longer
affects the Raft quorum.

An "?address=" query parameter is required and should be set to the
"IP:port" for the server to remove. The port number is usually 8300, unless
configured otherwise. Nothing is required in the body of the request.

By default, the datacenter of the agent is targeted; however, the `dc` can be
provided using the "?dc=" query parameter.

If ACLs are enabled, the client will need to supply an ACL Token with
[`operator`](/docs/internals/acl.html#operator) write privileges.

The return code will indicate success or failure.
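A corresponding hedged Go sketch for this endpoint; the agent address and the peer address shown are illustrative assumptions:

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	// The address value below is an illustrative stale peer.
	url := "http://127.0.0.1:8500/v1/operator/raft/peer?address=10.0.0.9:8300"
	req, err := http.NewRequest("DELETE", url, nil)
	if err != nil {
		log.Fatal(err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// The status code indicates success or failure.
	fmt.Println(resp.Status)
}
```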
@@ -485,8 +485,9 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been assigned.
* <a name="allow_stale"></a><a href="#allow_stale">`allow_stale`</a> - Enables a stale query
  for DNS information. This allows any Consul server, rather than only the leader, to service
  the request. The advantage of this is you get linear read scalability with Consul servers.
- By default, this is false, meaning all requests are serviced by the leader, providing stronger
- consistency but less throughput and higher latency.
+ In versions of Consul prior to 0.7, this defaulted to false, meaning all requests are serviced
+ by the leader, providing stronger consistency but less throughput and higher latency. In Consul
+ 0.7 and later, this defaults to true for better utilization of available servers.

* <a name="max_stale"></a><a href="#max_stale">`max_stale`</a> When [`allow_stale`](#allow_stale)
  is specified, this is used to limit how
@@ -38,6 +38,7 @@ Available commands are:
    lock       Execute a command holding a lock
    members    Lists the members of a Consul cluster
    monitor    Stream logs from a Consul agent
    operator   Provides cluster-level tools for Consul operators
    reload     Triggers the agent to reload configuration files
    rtt        Estimates network round trip time between nodes
    version    Prints the Consul version
@ -0,0 +1,102 @@
|
|||
---
|
||||
layout: "docs"
|
||||
page_title: "Commands: Operator"
|
||||
sidebar_current: "docs-commands-operator"
|
||||
description: >
|
||||
The operator command provides cluster-level tools for Consul operators.
|
||||
---
|
||||
|
||||
# Consul Operator
|
||||
|
||||
Command: `consul operator`
|
||||
|
||||
The `operator` command provides cluster-level tools for Consul operators, such
|
||||
as interacting with the Raft subsystem. This was added in Consul 0.7.
|
||||
|
||||
~> Use this command with extreme caution, as improper use could lead to a Consul
|
||||
outage and even loss of data.
|
||||
|
||||
If ACLs are enabled then a token with operator privileges may required in
|
||||
order to use this command. Requests are forwarded internally to the leader
|
||||
if required, so this can be run from any Consul node in a cluster. See the
|
||||
[ACL](/docs/internals/acl.html#operator) internals guide for more information.
|
||||
|
||||
See the [Outage Recovery](/docs/guides/outage.html) guide for some examples of how
|
||||
this command is used. For an API to perform these operations programatically,
|
||||
please see the documentation for the [Operator](/docs/agent/http/operator.html)
|
||||
endpoint.

## Usage

Usage: `consul operator <subcommand> [common options] [action] [options]`

Run `consul operator <subcommand>` with no arguments for help on that
subcommand. The following subcommands are available:

* `raft` - View and modify Consul's Raft configuration.

Options common to all subcommands include:

* `-http-addr` - Address to the HTTP server of the agent you want to contact
  to send this command. If this isn't specified, the command will contact
  "127.0.0.1:8500", which is the default HTTP address of a Consul agent.

* `-token` - ACL token to use. Defaults to that of the agent.
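
For illustration (the agent address and token below are placeholders), the common
options are passed alongside a subcommand like so:

```text
$ consul operator raft -list-peers -http-addr=10.0.1.5:8500 -token=<acl token>
```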

## Raft Operations

The `raft` subcommand is used to view and modify Consul's Raft configuration.
Two actions are available, as detailed in this section.

<a name="raft-list-peers"></a>
#### Display Peer Configuration

This action displays the current Raft peer configuration.

Usage: `raft -list-peers -stale=[true|false]`

* `-stale` - Optional and defaults to "false", which means the leader provides
  the result. If the cluster is in an outage state without a leader, you may need
  to set this to "true" to get the configuration from a non-leader server.

The output looks like this:

```
Node   ID              Address         State     Voter
alice  127.0.0.1:8300  127.0.0.1:8300  follower  true
bob    127.0.0.2:8300  127.0.0.2:8300  leader    true
carol  127.0.0.3:8300  127.0.0.3:8300  follower  true
```

`Node` is the node name of the server, as known to Consul, or "(unknown)" if
the node is stale and not known.

`ID` is the ID of the server. This is the same as the `Address` in Consul 0.7
but may be upgraded to a GUID in a future version of Consul.

`Address` is the IP:port for the server.

`State` is either "follower" or "leader" depending on the server's role in the
Raft configuration.

`Voter` is "true" or "false", indicating if the server has a vote in the Raft
configuration. Future versions of Consul may add support for non-voting servers.
<a name="raft-remove-peer"></a>
|
||||
#### Remove a Peer
|
||||
This command removes Consul server with given address from the Raft configuration.
|
||||
|
||||
There are rare cases where a peer may be left behind in the Raft configuration
|
||||
even though the server is no longer present and known to the cluster. This command
|
||||
can be used to remove the failed server so that it is no longer affects the
|
||||
Raft quorum. If the server still shows in the output of the
|
||||
[`consul members`](/docs/commands/members.html) command, it is preferable to
|
||||
clean up by simply running
|
||||
[`consul force-leave`](http://localhost:4567/docs/commands/force-leave.html)
|
||||
instead of this command.
|
||||
|
||||
Usage: `raft -remove-peer -address="IP:port"`
|
||||
|
||||
* `-address` - "IP:port" for the server to remove. The port number is usually
|
||||
8300, unless configured otherwise.
|
||||
|
||||
The return code will indicate success or failure.
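
For example, removing a stale peer (the address here is hypothetical) might look
like:

```text
$ consul operator raft -remove-peer -address="10.0.1.9:8300"
```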
@ -38,20 +38,72 @@ comes online as agents perform [anti-entropy](/docs/internals/anti-entropy.html)
## Failure of a Server in a Multi-Server Cluster

If you think the failed server is recoverable, the easiest option is to bring
it back online and have it rejoin the cluster, returning the cluster to a fully
healthy state. Similarly, even if you need to rebuild a new Consul server to
replace the failed node, you may wish to do that immediately. Keep in mind that
the rebuilt server needs to have the same IP as the failed server. Again, once
this server is online, the cluster will return to a fully healthy state.
it back online and have it rejoin the cluster with the same IP address, returning
the cluster to a fully healthy state. Similarly, even if you need to rebuild a
new Consul server to replace the failed node, you may wish to do that immediately.
Keep in mind that the rebuilt server needs to have the same IP address as the failed
server. Again, once this server is online and has rejoined, the cluster will return
to a fully healthy state.

Both of these strategies involve a potentially lengthy time to reboot or rebuild
a failed server. If this is impractical or if building a new server with the same
IP isn't an option, you need to remove the failed server. Usually, you can issue
a [`force-leave`](/docs/commands/force-leave.html) command to remove the failed
a [`consul force-leave`](/docs/commands/force-leave.html) command to remove the failed
server if it's still a member of the cluster.

If the `force-leave` isn't able to remove the server, you can remove it manually
using the `raft/peers.json` recovery file on all remaining servers.
If [`consul force-leave`](/docs/commands/force-leave.html) isn't able to remove the
server, you have two methods available to remove it, depending on your version of Consul:

* In Consul 0.7 and later, you can use the [`consul operator`](/docs/commands/operator.html#raft-remove-peer)
  command to remove the stale peer server on the fly with no downtime.

* In versions of Consul prior to 0.7, you can manually remove the stale peer
  server using the `raft/peers.json` recovery file on all remaining servers. See
  the [section below](#peers.json) for details on this procedure. This process
  requires Consul downtime to complete.

In Consul 0.7 and later, you can use the [`consul operator`](/docs/commands/operator.html#raft-list-peers)
command to inspect the Raft configuration:

```
$ consul operator raft -list-peers
Node   ID             Address        State     Voter
alice  10.0.1.8:8300  10.0.1.8:8300  follower  true
bob    10.0.1.6:8300  10.0.1.6:8300  leader    true
carol  10.0.1.7:8300  10.0.1.7:8300  follower  true
```

## Failure of Multiple Servers in a Multi-Server Cluster

In the event that multiple servers are lost, causing a loss of quorum and a
complete outage, partial recovery is possible using data on the remaining
servers in the cluster. There may be data loss in this situation because multiple
servers were lost, so information about what's committed could be incomplete.
The recovery process implicitly commits all outstanding Raft log entries, so
it's also possible to commit data that was uncommitted before the failure.

See the [section below](#peers.json) for details of the recovery procedure. You
simply include the remaining servers in the `raft/peers.json` recovery file.
The cluster should be able to elect a leader once the remaining servers are all
restarted with an identical `raft/peers.json` configuration.

Any new servers you introduce later can be fresh with totally clean data directories
and joined using Consul's `join` command.

In extreme cases, it should be possible to recover with just a single remaining
server by starting that single server with itself as the only peer in the
`raft/peers.json` recovery file.
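
As a sketch (the data directory path here is hypothetical; the file format is
covered in the section below), a single-server recovery file would contain only
that server's own address:

```text
$ cat /opt/consul/data/raft/peers.json
[
  "10.0.1.8:8300"
]
```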

Note that prior to Consul 0.7 it wasn't always possible to recover from certain
types of outages with `raft/peers.json` because this file was ingested before any Raft
log entries were played back. In Consul 0.7 and later, the `raft/peers.json`
recovery file is final, and a snapshot is taken after it is ingested, so you are
guaranteed to start with your recovered configuration. This does implicitly commit
all Raft log entries, so it should only be used to recover from an outage, but it
should allow recovery from any situation where there's some cluster data available.

<a name="peers.json"></a>
## Manual Recovery Using peers.json

To begin, stop all remaining servers. You can attempt a graceful leave,
but it will not work in most cases. Do not worry if the leave exits with an
@ -70,11 +122,6 @@ implicitly committed, so this should only be used after an outage where no
other option is available to recover a lost server. Make sure you don't have
any automated processes that will put the peers file in place on a periodic basis,
for example.
When the final version of Consul 0.7 ships, it should include a command to
remove a dead peer without having to stop servers and edit the `raft/peers.json`
recovery file.

The next step is to go to the [`-data-dir`](/docs/agent/options.html#_data_dir)
of each Consul server. Inside that directory, there will be a `raft/`
@ -83,9 +130,9 @@ something like:

```javascript
[
  "10.0.1.8:8300",
  "10.0.1.6:8300",
  "10.0.1.7:8300"
]
```
@ -126,56 +173,13 @@ nodes should claim leadership and emit a log like:
```
[INFO] consul: cluster leadership acquired
```

Additionally, the [`info`](/docs/commands/info.html) command can be a useful
debugging tool:
In Consul 0.7 and later, you can use the [`consul operator`](/docs/commands/operator.html#raft-list-peers)
command to inspect the Raft configuration:

```text
$ consul info
...
raft:
	applied_index = 47244
	commit_index = 47244
	fsm_pending = 0
	last_log_index = 47244
	last_log_term = 21
	last_snapshot_index = 40966
	last_snapshot_term = 20
	num_peers = 2
	state = Leader
	term = 21
...
```

You should verify that one server claims to be the `Leader` and all the
others should be in the `Follower` state. All the nodes should agree on the
peer count as well. This count is (N-1), since a server does not count itself
as a peer.

## Failure of Multiple Servers in a Multi-Server Cluster

In the event that multiple servers are lost, causing a loss of quorum and a
complete outage, partial recovery is possible using data on the remaining
servers in the cluster. There may be data loss in this situation because multiple
servers were lost, so information about what's committed could be incomplete.
The recovery process implicitly commits all outstanding Raft log entries, so
it's also possible to commit data that was uncommitted before the failure.

The procedure is the same as for the single-server case above; you simply include
the remaining servers in the `raft/peers.json` recovery file. The cluster
should be able to elect a leader once the remaining servers are all restarted with
an identical `raft/peers.json` configuration.

Any new servers you introduce later can be fresh with totally clean data directories
and joined using Consul's `join` command.

In extreme cases, it should be possible to recover with just a single remaining
server by starting that single server with itself as the only peer in the
`raft/peers.json` recovery file.

Note that prior to Consul 0.7 it wasn't always possible to recover from certain
types of outages with `raft/peers.json` because this file was ingested before any Raft
log entries were played back. In Consul 0.7 and later, the `raft/peers.json`
recovery file is final, and a snapshot is taken after it is ingested, so you are
guaranteed to start with your recovered configuration. This does implicitly commit
all Raft log entries, so it should only be used to recover from an outage, but it
should allow recovery from any situation where there's some cluster data available.

```
$ consul operator raft -list-peers
Node   ID             Address        State     Voter
alice  10.0.1.8:8300  10.0.1.8:8300  follower  true
bob    10.0.1.6:8300  10.0.1.6:8300  leader    true
carol  10.0.1.7:8300  10.0.1.7:8300  follower  true
```
@ -78,8 +78,9 @@ or add more powerful servers.

* For DNS-heavy workloads, configuring all Consul agents in a cluster with the
  [`allow_stale`](/docs/agent/options.html#allow_stale) configuration option will allow reads to
  scale across all Consul servers, not just the leader. See [Stale Reads](/docs/guides/dns-cache.html#stale)
  in the [DNS Caching](/docs/guides/dns-cache.html) guide for more details. It's also good to set
  scale across all Consul servers, not just the leader. Consul 0.7 and later enables stale reads
  for DNS by default. See [Stale Reads](/docs/guides/dns-cache.html#stale) in the
  [DNS Caching](/docs/guides/dns-cache.html) guide for more details. It's also good to set
  reasonable, non-zero [DNS TTL values](/docs/guides/dns-cache.html#ttl) if your clients will
  respect them.
@ -210,6 +210,9 @@ query "" {

# Read-only mode for the encryption keyring by default (list only)
keyring = "read"

# Read-only mode for Consul operator interfaces (list only)
operator = "read"
```

This is equivalent to the following JSON input:
@ -248,13 +251,14 @@ This is equivalent to the following JSON input:
      "policy": "read"
    }
  },
  "keyring": "read"
  "keyring": "read",
  "operator": "read"
}
```

## Building ACL Policies

#### Blacklist mode and `consul exec`
#### Blacklist Mode and `consul exec`

If you set [`acl_default_policy`](/docs/agent/options.html#acl_default_policy)
to `deny`, the `anonymous` token won't have permission to read the default
@ -279,7 +283,7 @@ Alternatively, you can, of course, add an explicit
[`acl_token`](/docs/agent/options.html#acl_token) to each agent, giving it access
to that prefix.

#### Blacklist mode and Service Discovery
#### Blacklist Mode and Service Discovery

If your [`acl_default_policy`](/docs/agent/options.html#acl_default_policy) is
set to `deny`, the `anonymous` token will be unable to read any service
@ -327,12 +331,12 @@ event "" {
As always, the more secure way to handle user events is to explicitly grant
access to each API token based on the events they should be able to fire.

#### Blacklist mode and Prepared Queries
#### Blacklist Mode and Prepared Queries

After Consul 0.6.3, significant changes were made to ACLs for prepared queries,
including a new `query` ACL policy. See [Prepared Query ACLs](#prepared_query_acls) below for more details.

#### Blacklist mode and Keyring Operations
#### Blacklist Mode and Keyring Operations

Consul 0.6 and later supports securing the encryption keyring operations using
ACLs. Encryption is an optional component of the gossip layer. More information
@ -353,6 +357,28 @@ Encryption keyring operations are sensitive and should be properly secured. It
is recommended that instead of configuring a wide-open policy like above, a
per-token policy is applied to maximize security.

<a name="operator"></a>
#### Blacklist Mode and Consul Operator Actions

Consul 0.7 added special Consul operator actions which are protected by a new
`operator` ACL policy. The operator actions cover:

* [Operator HTTP endpoint](/docs/agent/http/operator.html)
* [Operator CLI command](/docs/commands/operator.html)

If your [`acl_default_policy`](/docs/agent/options.html#acl_default_policy) is
set to `deny`, then the `anonymous` token will not have access to Consul operator
actions. Granting `read` access allows reading information for diagnostic purposes
without making any changes to state. Granting `write` access allows reading
information and changing state. Here's an example policy:

```
operator = "write"
```

~> Grant `write` access to operator actions with extreme caution, as improper use
could lead to a Consul outage and even loss of data.
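
As a sketch (the token name and management token below are placeholders), one way
to attach such a rule to a token is the ACL create endpoint (`/v1/acl/create`):

```text
$ curl -X PUT \
    -H "X-Consul-Token: <management token>" \
    -d '{"Name": "ops-team", "Type": "client", "Rules": "operator = \"write\""}' \
    http://127.0.0.1:8500/v1/acl/create
```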

#### Services and Checks with ACLs

Consul allows configuring ACL policies which may control access to service and
@ -19,9 +19,22 @@ standard upgrade flow.
Consul version 0.7 is a very large release with many important changes. Changes
to be aware of during an upgrade are categorized below.

#### Performance Tuning and New Defaults
#### Defaults Changed for Better Performance

Consul 0.7 introduced support for tuning Raft performance using a new
Consul 0.7 now defaults the DNS configuration to allow stale queries, with
[`allow_stale`](/docs/agent/options.html#allow_stale) set to true for better utilization
of available servers. If you want to retain the previous behavior, set the following
configuration:

```javascript
{
  "dns_config": {
    "allow_stale": false
  }
}
```

Consul 0.7 also introduced support for tuning Raft performance using a new
[performance configuration block](/docs/agent/options.html#performance). Also,
the default Raft timing is set to a lower-performance mode suitable for
[minimal Consul servers](/docs/guides/performance.html#minumum).
@ -40,7 +53,7 @@ to all Consul servers when upgrading:

See the [Server Performance](/docs/guides/performance.html) guide for more details.

#### Default Configuration Changes
#### Servers No Longer Default to Leave on Interrupt

The default behavior of [`skip_leave_on_interrupt`](/docs/agent/options.html#skip_leave_on_interrupt)
is now dependent on whether or not the agent is acting as a server or client. When Consul is started as a
@ -118,6 +118,10 @@
    <a href="/docs/commands/monitor.html">monitor</a>
  </li>

  <li<%= sidebar_current("docs-commands-operator") %>>
    <a href="/docs/commands/operator.html">operator</a>
  </li>

  <li<%= sidebar_current("docs-commands-info") %>>
    <a href="/docs/commands/info.html">info</a>
  </li>
@ -178,6 +182,10 @@
    <a href="/docs/agent/http/coordinate.html">Network Coordinates</a>
  </li>

  <li<%= sidebar_current("docs-agent-http-operator") %>>
    <a href="/docs/agent/http/operator.html">Operator</a>
  </li>

  <li<%= sidebar_current("docs-agent-http-query") %>>
    <a href="/docs/agent/http/query.html">Prepared Queries</a>
  </li>