diff --git a/.circleci/config.yml b/.circleci/config.yml index a189602a28..af7087d271 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -77,16 +77,9 @@ jobs: - run: mkdir -p $TEST_RESULTS_DIR - run: sudo apt-get update && sudo apt-get install -y rsyslog - run: sudo service rsyslog start - # Use CircleCI test splitting by classname. Since there are no classes in go, - # we fake it by taking everything after github.com/hashicorp/consul/ and setting - # it as the classname. - - # This loop writes go test results to .xml per go package - run: | - for pkg in $(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess |circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do - reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g") - gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg - done + PACKAGE_NAMES=$(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess | circleci tests split --split-by=timings --timings-type=classname) + gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS -p 3 $PACKAGE_NAMES - store_test_results: path: /tmp/test-results @@ -108,18 +101,11 @@ jobs: - attach_workspace: at: /go/bin - run: mkdir -p $TEST_RESULTS_DIR - # Use CircleCI test splitting by classname. Since there are no classes in go, - # we fake it by taking everything after github.com/hashicorp/consul/ and setting - # it as the classname. - - # This loop writes go test results to .xml per go package - run: working_directory: api command: | - for pkg in $(go list ./... | circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do - reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g") - gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg - done + PACKAGE_NAMES=$(go list ./... | circleci tests split --split-by=timings --timings-type=classname) + gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS $PACKAGE_NAMES - store_test_results: path: /tmp/test-results @@ -476,7 +462,25 @@ jobs: git_merge_branch="ci/master-merge-$(date +%Y%m%d%H%M%S)" git checkout -b "${git_merge_branch}" latest_oss_commit="$(git rev-parse origin/master)" - git merge -m "Merge Consul OSS branch 'master' at commit ${latest_oss_commit}" "${latest_oss_commit}" + + if ! 
errors=$(git merge -m "Merge Consul OSS branch 'master' at commit ${latest_oss_commit}" "${latest_oss_commit}"); then + printf "oss/master merge into ${CIRCLE_BRANCH} failed because git was unable to auto-merge!\n${errors}" + curl -X POST -H 'Content-type: application/json' \ + --data \ + "{ \ + \"attachments\": [ \ + { \ + \"fallback\": \"master merge into ${CIRCLE_BRANCH} failed because git was unable to auto-merge!\", \ + \"text\": \"Nightly *master* merge into *${CIRCLE_BRANCH}* failed!\n\nBuild Log: ${CIRCLE_BUILD_URL}\n\nGit was unable to auto-merge due to possible merge conflict.\n\n*Errors:*\n${errors}\", \ + \"footer\": \"${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}\", \ + \"ts\": \"$(date +%s)\", \ + \"color\": \"danger\" \ + } \ + ] \ + }" ${CONSUL_SLACK_WEBHOOK_URL} + exit 1 + fi + git push origin "${git_merge_branch}" sleep 15 # Wait for merge branch to start CircleCI pipeline @@ -568,28 +572,26 @@ workflows: branches: only: - release/1-6 - build-distros: + go-tests: jobs: - lint-consul-retry - - go-fmt-and-vet: + - go-fmt-and-vet + - dev-build: requires: - lint-consul-retry - - build-386: &require-go-fmt-vet - requires: - go-fmt-and-vet - - build-amd64: *require-go-fmt-vet - - build-arm-arm64: *require-go-fmt-vet - test-integrations: - jobs: - - dev-build - go-test: &go-test requires: - dev-build - filters: - branches: - ignore: - - /^pull\/.*$/ # only run go tests on non forks - go-test-api: *go-test + build-distros: + jobs: + - build-386 + - build-amd64 + - build-arm-arm64 + test-integrations: + jobs: + - dev-build - dev-upload-s3: requires: - dev-build diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 60cd57f2dd..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -language: go - -go: - # Please keep this in-sync with the go version we build against in - # build-support/docker/Build-Go.dockerfile. - - "1.12.1" - -branches: - only: - - master - - release/1-6 - -matrix: - include: - - env: GOTEST_PKGS="./api" - - env: GOTEST_PKGS="./agent" - - env: GOTEST_PKGS="./agent/consul" - - env: GOTEST_PKGS_EXCLUDE="./api|./agent|./agent/consul" - -script: - - make test-ci - -sudo: false diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bf6c12969..3cc2bdd57c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ FEATURES: IMPROVEMENTS: +* raft: allow trailing logs to be configured as an escape hatch for extreme load that prevents followers from catching up with the leader [[GH-6186](https://github.com/hashicorp/consul/pull/6186)] +* agent: added a configurable limit for log files to be rotated [[GH-5831](https://github.com/hashicorp/consul/pull/5831)] * agent: health checks: change long timeout behavior to use to user-configured `timeout` value [[GH-6094](https://github.com/hashicorp/consul/pull/6094)] * api: Update filtering language to include substring and regular expression matching on string values [[GH-6190](https://github.com/hashicorp/consul/pull/6190)] * api: Display allowed HTTP CIDR information nicely [[GH-6029](https://github.com/hashicorp/consul/pull/6029)] @@ -93,9 +95,9 @@ SECURITY: [CVE-2019-9901](https://github.com/envoyproxy/envoy/issues/6435). Both are related to HTTP request parsing and so only affect Consul Connect users if they have configured HTTP routing rules via the ["escape - hatch"](#custom-configuration). We recommend Envoy 1.9.1 be used. 
- Note that while we officially deprecate support for older version of Envoy in 1.5.0, - we recommend using Envoy 1.9.1 with all previous versions of Consul Connect too + hatch"](#custom-configuration). We recommend Envoy 1.9.1 be used. + Note that while we officially deprecate support for older versions of Envoy in 1.5.0, + we recommend using Envoy 1.9.1 with all previous versions of Consul Connect too (back to 1.3.0 where Envoy support was introduced). BREAKING CHANGES: @@ -246,7 +248,7 @@ BUG FIXES: * cli: display messages from serf in cli [[GH-5236](https://github.com/hashicorp/consul/pull/5236)] * connect: Fixed an issue where a blank CA config could be written to a snapshot when Connect was disabled. [[GH-4954](https://github.com/hashicorp/consul/pull/4954)] * connect: Fixed a bug with the create and modify indices of leaf certificates not being incremented properly. [[GH-4463](https://github.com/hashicorp/consul/issues/4463)] -* connect: Fixed an issue where certificates could leak and remain in client memory forever [[GH-5091](https://github.com/hashicorp/consul/pull/5091)] +* connect: Fixed an issue where certificates could leak and remain in client memory forever [[GH-5091](https://github.com/hashicorp/consul/pull/5091)] * connect: (Consul Enterprise) When requesting to sign intermediates the primary dc is now used * connect: added tls config for vault connect ca provider [[GH-5125](https://github.com/hashicorp/consul/issues/5125)] * connect: Fix a panic on 32 bit systems for unaligned 64 bit atomic operations. [[GH-5128](https://github.com/hashicorp/consul/issues/5128)] diff --git a/agent/agent.go b/agent/agent.go index f5f20cc8f2..7abe95749d 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1162,6 +1162,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) { if a.config.RaftSnapshotInterval != 0 { base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval } + if a.config.RaftTrailingLogs != 0 { + base.RaftConfig.TrailingLogs = uint64(a.config.RaftTrailingLogs) + } if a.config.ACLMasterToken != "" { base.ACLMasterToken = a.config.ACLMasterToken } diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 617b858390..06e0dbf73d 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -141,7 +141,7 @@ func (s *HTTPServer) AgentReload(resp http.ResponseWriter, req *http.Request) (i } // Trigger the reload - errCh := make(chan error, 0) + errCh := make(chan error) select { case <-s.agent.shutdownCh: return nil, fmt.Errorf("Agent was shutdown before reload could be completed") diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 9dac1ec873..c861bfa578 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -564,7 +564,7 @@ func TestAgent_Service(t *testing.T) { } start := time.Now() obj, err := a.srv.AgentService(resp, req) - elapsed := time.Now().Sub(start) + elapsed := time.Since(start) if tt.wantErr != "" { require.Error(err) @@ -5350,7 +5350,7 @@ func TestAgentConnectProxyConfig_Blocking(t *testing.T) { } start := time.Now() obj, err := a.srv.AgentConnectProxyConfig(resp, req) - elapsed := time.Now().Sub(start) + elapsed := time.Since(start) if tt.wantErr { require.Error(err) diff --git a/agent/agent_test.go b/agent/agent_test.go index 9265c52ae1..d3e84f6e09 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -3925,7 +3925,7 @@ func TestAgent_ReloadConfigTLSConfigFailure(t *testing.T) { require.Len(t, tlsConf.RootCAs.Subjects(), 1) } -func TestAgent_consulConfig(t *testing.T) { +func 
TestAgent_consulConfig_AutoEncryptAllowTLS(t *testing.T) { t.Parallel() dataDir := testutil.TempDir(t, "agent") // we manage the data dir defer os.RemoveAll(dataDir) @@ -3941,3 +3941,13 @@ func TestAgent_consulConfig(t *testing.T) { defer a.Shutdown() require.True(t, a.consulConfig().AutoEncryptAllowTLS) } + +func TestAgent_consulConfig_RaftTrailingLogs(t *testing.T) { + t.Parallel() + hcl := ` + raft_trailing_logs = 812345 + ` + a := NewTestAgent(t, t.Name(), hcl) + defer a.Shutdown() + require.Equal(t, uint64(812345), a.consulConfig().RaftConfig.TrailingLogs) +} diff --git a/agent/cache/cache.go b/agent/cache/cache.go index ee032bbb99..7892272451 100644 --- a/agent/cache/cache.go +++ b/agent/cache/cache.go @@ -703,7 +703,7 @@ func (c *Cache) runExpiryLoop() { c.entriesLock.RLock() if len(c.entriesExpiryHeap.Entries) > 0 { entry = c.entriesExpiryHeap.Entries[0] - expiryTimer = time.NewTimer(entry.Expires.Sub(time.Now())) + expiryTimer = time.NewTimer(time.Until(entry.Expires)) expiryCh = expiryTimer.C } c.entriesLock.RUnlock() diff --git a/agent/catalog_endpoint.go b/agent/catalog_endpoint.go index cf40a9f0cf..f8da6136e6 100644 --- a/agent/catalog_endpoint.go +++ b/agent/catalog_endpoint.go @@ -189,7 +189,7 @@ func (s *HTTPServer) CatalogServices(resp http.ResponseWriter, req *http.Request // Use empty map instead of nil if out.Services == nil { - out.Services = make(structs.Services, 0) + out.Services = make(structs.Services) } metrics.IncrCounterWithLabels([]string{"client", "api", "success", "catalog_services"}, 1, []metrics.Label{{Name: "node", Value: s.nodeName()}}) diff --git a/agent/config/builder.go b/agent/config/builder.go index 4ff46d1290..42619d5525 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -862,6 +862,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { RaftProtocol: b.intVal(c.RaftProtocol), RaftSnapshotThreshold: b.intVal(c.RaftSnapshotThreshold), RaftSnapshotInterval: b.durationVal("raft_snapshot_interval", c.RaftSnapshotInterval), + RaftTrailingLogs: b.intVal(c.RaftTrailingLogs), ReconnectTimeoutLAN: b.durationVal("reconnect_timeout", c.ReconnectTimeoutLAN), ReconnectTimeoutWAN: b.durationVal("reconnect_timeout_wan", c.ReconnectTimeoutWAN), RejoinAfterLeave: b.boolVal(c.RejoinAfterLeave), diff --git a/agent/config/config.go b/agent/config/config.go index 91470370de..0f244e3530 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -241,6 +241,7 @@ type Config struct { RaftProtocol *int `json:"raft_protocol,omitempty" hcl:"raft_protocol" mapstructure:"raft_protocol"` RaftSnapshotThreshold *int `json:"raft_snapshot_threshold,omitempty" hcl:"raft_snapshot_threshold" mapstructure:"raft_snapshot_threshold"` RaftSnapshotInterval *string `json:"raft_snapshot_interval,omitempty" hcl:"raft_snapshot_interval" mapstructure:"raft_snapshot_interval"` + RaftTrailingLogs *int `json:"raft_trailing_logs,omitempty" hcl:"raft_trailing_logs" mapstructure:"raft_trailing_logs"` ReconnectTimeoutLAN *string `json:"reconnect_timeout,omitempty" hcl:"reconnect_timeout" mapstructure:"reconnect_timeout"` ReconnectTimeoutWAN *string `json:"reconnect_timeout_wan,omitempty" hcl:"reconnect_timeout_wan" mapstructure:"reconnect_timeout_wan"` RejoinAfterLeave *bool `json:"rejoin_after_leave,omitempty" hcl:"rejoin_after_leave" mapstructure:"rejoin_after_leave"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 86e7d0e52c..1862a15e3b 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -965,6 +965,22 @@ type RuntimeConfig 
struct { // hcl: raft_snapshot_threshold = int RaftSnapshotInterval time.Duration + // RaftTrailingLogs sets the number of log entries that will be left in the + // log store after a snapshot. This must be large enough that a follower can + // transfer and restore an entire snapshot of the state before this many new + // entries have been appended. In the vast majority of cases the default is plenty, + // but if there is a sustained high write throughput coupled with a huge + // multi-gigabyte snapshot, setting this higher may be necessary to allow + // followers time to reload from snapshot without becoming unhealthy. If it's + // too low then followers are unable to ever recover from a restart and will + // enter a loop of constantly downloading full snapshots and never catching + // up. If you need to change this, you should reconsider your usage of Consul + // as it is not designed to store multiple-gigabyte data sets with high write + // throughput. Defaults to 10000. + // + // hcl: raft_trailing_logs = int + RaftTrailingLogs int + // ReconnectTimeoutLAN specifies the amount of time to wait to reconnect with // another agent before deciding it's permanently gone. This can be used to // control the time it takes to reap failed nodes from the cluster. diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index e592a5c7bc..e380a51b55 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -3767,6 +3767,7 @@ func TestFullConfig(t *testing.T) { "raft_protocol": 19016, "raft_snapshot_threshold": 16384, "raft_snapshot_interval": "30s", + "raft_trailing_logs": 83749, "reconnect_timeout": "23739s", "reconnect_timeout_wan": "26694s", "recursors": [ "63.38.39.58", "92.49.18.18" ], @@ -4371,6 +4372,7 @@ func TestFullConfig(t *testing.T) { raft_protocol = 19016 raft_snapshot_threshold = 16384 raft_snapshot_interval = "30s" + raft_trailing_logs = 83749 reconnect_timeout = "23739s" reconnect_timeout_wan = "26694s" recursors = [ "63.38.39.58", "92.49.18.18" ] @@ -5043,6 +5045,7 @@ func TestFullConfig(t *testing.T) { RaftProtocol: 19016, RaftSnapshotThreshold: 16384, RaftSnapshotInterval: 30 * time.Second, + RaftTrailingLogs: 83749, ReconnectTimeoutLAN: 23739 * time.Second, ReconnectTimeoutWAN: 26694 * time.Second, RejoinAfterLeave: true, @@ -5901,6 +5904,7 @@ func TestSanitize(t *testing.T) { "RaftProtocol": 0, "RaftSnapshotInterval": "0s", "RaftSnapshotThreshold": 0, + "RaftTrailingLogs": 0, "ReconnectTimeoutLAN": "0s", "ReconnectTimeoutWAN": "0s", "RejoinAfterLeave": false, diff --git a/agent/connect/ca/provider_consul_test.go b/agent/connect/ca/provider_consul_test.go index 479d460e6a..6ec9b56752 100644 --- a/agent/connect/ca/provider_consul_test.go +++ b/agent/connect/ca/provider_consul_test.go @@ -178,7 +178,7 @@ func TestConsulCAProvider_SignLeaf(t *testing.T) { require.Equal(parsed.SerialNumber.Uint64(), uint64(2)) // Ensure the cert is valid now and expires within the correct limit. 
- require.True(parsed.NotAfter.Sub(time.Now()) < 3*24*time.Hour) + require.True(time.Until(parsed.NotAfter) < 3*24*time.Hour) require.True(parsed.NotBefore.Before(time.Now())) } diff --git a/agent/connect/ca/provider_vault_test.go b/agent/connect/ca/provider_vault_test.go index 10be1befb4..b0ddaa4112 100644 --- a/agent/connect/ca/provider_vault_test.go +++ b/agent/connect/ca/provider_vault_test.go @@ -186,7 +186,7 @@ func TestVaultCAProvider_SignLeaf(t *testing.T) { require.NotEqual(firstSerial, parsed.SerialNumber.Uint64()) // Ensure the cert is valid now and expires within the correct limit. - require.True(parsed.NotAfter.Sub(time.Now()) < time.Hour) + require.True(time.Until(parsed.NotAfter) < time.Hour) require.True(parsed.NotBefore.Before(time.Now())) } } diff --git a/agent/consul/acl_replication_types.go b/agent/consul/acl_replication_types.go index 3009bec983..e0222e1443 100644 --- a/agent/consul/acl_replication_types.go +++ b/agent/consul/acl_replication_types.go @@ -316,7 +316,7 @@ func (r *aclRoleReplicator) FetchUpdated(srv *Server, updates []string) (int, er delete(keep, role.ID) } missing := make([]string, 0, len(keep)) - for id, _ := range keep { + for id := range keep { missing = append(missing, id) } return 0, fmt.Errorf("role replication trying to replicated uncached roles with IDs: %v", missing) diff --git a/agent/consul/kvs_endpoint_test.go b/agent/consul/kvs_endpoint_test.go index a91985c725..e3ba380df5 100644 --- a/agent/consul/kvs_endpoint_test.go +++ b/agent/consul/kvs_endpoint_test.go @@ -596,11 +596,7 @@ key "zip" { t.Fatalf("err: %v", err) } - actualKeys = []string{} - - for _, key := range keyList.Keys { - actualKeys = append(actualKeys, key) - } + actualKeys = keyList.Keys verify.Values(t, "", actualKeys, expectedKeys) diff --git a/agent/consul/leader.go b/agent/consul/leader.go index 93a1e09477..4e66b39d0d 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -1192,7 +1192,7 @@ func (s *Server) pruneCARoots() error { var newRoots structs.CARoots for _, r := range roots { - if !r.Active && !r.RotatedOutAt.IsZero() && time.Now().Sub(r.RotatedOutAt) > common.LeafCertTTL*2 { + if !r.Active && !r.RotatedOutAt.IsZero() && time.Since(r.RotatedOutAt) > common.LeafCertTTL*2 { s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID) continue } diff --git a/agent/consul/server_lookup.go b/agent/consul/server_lookup.go index e163856d71..f40b573770 100644 --- a/agent/consul/server_lookup.go +++ b/agent/consul/server_lookup.go @@ -51,8 +51,7 @@ func (sl *ServerLookup) ServerAddr(id raft.ServerID) (raft.ServerAddress, error) func (sl *ServerLookup) Server(addr raft.ServerAddress) *metadata.Server { sl.lock.RLock() defer sl.lock.RUnlock() - svr, _ := sl.addressToServer[addr] - return svr + return sl.addressToServer[addr] } func (sl *ServerLookup) Servers() []*metadata.Server { diff --git a/agent/consul/server_test.go b/agent/consul/server_test.go index 32eadb232c..1aeb1a5900 100644 --- a/agent/consul/server_test.go +++ b/agent/consul/server_test.go @@ -183,6 +183,10 @@ func newServer(c *Config) (*Server, error) { oldNotify() } } + // Restore old notify to guard against re-closing `up` on a retry + defer func() { + c.NotifyListen = oldNotify + }() // start server w := c.LogOutput @@ -820,7 +824,6 @@ func TestServer_BadExpect(t *testing.T) { type fakeGlobalResp struct{} func (r *fakeGlobalResp) Add(interface{}) { - return } func (r *fakeGlobalResp) New() interface{} { diff --git a/agent/consul/state/acl_test.go b/agent/consul/state/acl_test.go 
index 58c69f7433..563f6bde08 100644 --- a/agent/consul/state/acl_test.go +++ b/agent/consul/state/acl_test.go @@ -3824,11 +3824,11 @@ func stripIrrelevantTokenFields(token *structs.ACLToken) *structs.ACLToken { // When comparing the tokens disregard the policy link names. This // data is not cleanly updated in a variety of scenarios and should not // be relied upon. - for i, _ := range tokenCopy.Policies { + for i := range tokenCopy.Policies { tokenCopy.Policies[i].Name = "" } // Also do the same for Role links. - for i, _ := range tokenCopy.Roles { + for i := range tokenCopy.Roles { tokenCopy.Roles[i].Name = "" } // The raft indexes won't match either because the requester will not diff --git a/agent/http.go b/agent/http.go index 9025cc088e..bd876f46b4 100644 --- a/agent/http.go +++ b/agent/http.go @@ -293,7 +293,7 @@ func (s *HTTPServer) handler(enableDebug bool) http.Handler { mux.HandleFunc("/", s.Index) for pattern, fn := range endpoints { thisFn := fn - methods, _ := allowedMethods[pattern] + methods := allowedMethods[pattern] bound := func(resp http.ResponseWriter, req *http.Request) (interface{}, error) { return thisFn(s, resp, req) } diff --git a/agent/router/manager.go b/agent/router/manager.go index 081893c5e9..ae764087c4 100644 --- a/agent/router/manager.go +++ b/agent/router/manager.go @@ -342,8 +342,6 @@ func (m *Manager) RebalanceServers() { // continue to use the existing connection until the next // rebalance occurs. } - - return } // reconcileServerList returns true when the first server in serverList diff --git a/agent/ui_endpoint_test.go b/agent/ui_endpoint_test.go index 940e3ec36b..51d19a0bcf 100644 --- a/agent/ui_endpoint_test.go +++ b/agent/ui_endpoint_test.go @@ -60,7 +60,7 @@ func TestUiIndex(t *testing.T) { // Verify the body out := bytes.NewBuffer(nil) io.Copy(out, resp.Body) - if string(out.Bytes()) != "test" { + if out.String() != "test" { t.Fatalf("bad: %s", out.Bytes()) } } diff --git a/command/debug/debug.go b/command/debug/debug.go index 36a8ae3662..a9697b624a 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -302,7 +302,7 @@ func (c *cmd) captureStatic() error { var errors error // Collect the named outputs here - outputs := make(map[string]interface{}, 0) + outputs := make(map[string]interface{}) // Capture host information if c.configuredTarget("host") { diff --git a/lib/telemetry.go b/lib/telemetry.go index a335d6cc85..d815f4b587 100644 --- a/lib/telemetry.go +++ b/lib/telemetry.go @@ -225,7 +225,7 @@ func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) { continue } case reflect.Bool: - if f.Bool() != false { + if f.Bool() { continue } default: diff --git a/sdk/testutil/retry/retry.go b/sdk/testutil/retry/retry.go index 2ef3c4c0eb..53c05a2b05 100644 --- a/sdk/testutil/retry/retry.go +++ b/sdk/testutil/retry/retry.go @@ -110,7 +110,7 @@ func dedup(a []string) string { delete(m, s) } } - return string(b.Bytes()) + return b.String() } func run(r Retryer, t Failer, f func(r *R)) { diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md index 2dd58ac5e6..4ba9508178 100644 --- a/website/source/docs/agent/options.html.md +++ b/website/source/docs/agent/options.html.md @@ -407,21 +407,6 @@ will exit with an error at startup. [Raft Protocol Version Compatibility](/docs/upgrade-specific.html#raft-protocol-version-compatibility) for more details. -* `-raft-snapshot-threshold` - This controls the - minimum number of raft commit entries between snapshots that are saved to disk. 
This is a low-level parameter that should rarely need to be changed. Very busy clusters experiencing excessive disk IO may increase this value to reduce disk IO, and minimize - the chances of all servers taking snapshots at the same time. Increasing this trades off disk IO for disk space since the log will - grow much larger and the space in the raft.db file can't be reclaimed till the next snapshot. Servers may take longer to recover from - crashes or failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this - defaults to 16384, and in prior versions it was set to 8192. - -* `-raft-snapshot-interval` - This controls how often servers - check if they need to save a snapshot to disk. his is a low-level parameter that should rarely need to be changed. Very busy clusters - experiencing excessive disk IO may increase this value to reduce disk IO, and minimize the chances of all servers taking snapshots at the same time. - Increasing this trades off disk IO for disk space since the log will grow much larger and the space in the raft.db file can't be reclaimed - till the next snapshot. Servers may take longer to recover from crashes or failover if this is increased significantly as more logs - will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`. - -* `-recursor` - Specifies the address of an upstream DNS server. This option may be provided multiple times, and is functionally equivalent to the [`recursors` configuration option](#recursors). @@ -1431,11 +1416,46 @@ default will automatically work with some tooling. * `raft_protocol` Equivalent to the [`-raft-protocol` command-line flag](#_raft_protocol). -* `raft_snapshot_threshold` Equivalent to the - [`-raft-snapshot-threshold` command-line flag](#_raft_snapshot_threshold). + +* `raft_snapshot_threshold` This controls + the minimum number of raft commit entries between snapshots that are saved to + disk. This is a low-level parameter that should rarely need to be changed. + Very busy clusters experiencing excessive disk IO may increase this value to + reduce disk IO, and minimize the chances of all servers taking snapshots at + the same time. Increasing this trades off disk IO for disk space since the log + will grow much larger and the space in the raft.db file can't be reclaimed + till the next snapshot. Servers may take longer to recover from crashes or + failover if this is increased significantly as more logs will need to be + replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior + versions it was set to 8192. -* `raft_snapshot_interval` Equivalent to the - [`-raft-snapshot-interval` command-line flag](#_raft_snapshot_interval). +* `raft_snapshot_interval` This controls how + often servers check if they need to save a snapshot to disk. This is a + low-level parameter that should rarely need to be changed. Very busy clusters + experiencing excessive disk IO may increase this value to reduce disk IO, and + minimize the chances of all servers taking snapshots at the same time. + Increasing this trades off disk IO for disk space since the log will grow much + larger and the space in the raft.db file can't be reclaimed till the next + snapshot. Servers may take longer to recover from crashes or failover if this + is increased significantly as more logs will need to be replayed. In Consul + 1.1.0 and later this defaults to `30s`, and in prior versions it was set to + `5s`. 
+ +* `raft_trailing_logs` - This controls how many + log entries are left in the log store on disk after a snapshot is made. This + should only be adjusted when followers cannot catch up to the leader due to a + very large snapshot size and high write throughput causing log truncation + before a snapshot can be fully installed. If you need to use this to recover + a cluster, consider reducing write throughput or the amount of data stored on + Consul as it is likely under a load it is not designed to handle. The default + value is 10000, which is suitable for all normal workloads. Added in Consul + 1.5.3. * `reap` This controls Consul's automatic reaping of child processes, which is useful if Consul is running as PID 1 in a Docker container. If this isn't specified, then Consul will diff --git a/website/source/docs/guides/kubernetes-production-deploy.md b/website/source/docs/guides/kubernetes-production-deploy.md new file mode 100644 index 0000000000..f90454ca37 --- /dev/null +++ b/website/source/docs/guides/kubernetes-production-deploy.md @@ -0,0 +1,286 @@ +--- +name: "Consul-Kubernetes Deployment Guide" +content_length: 14 +id: kubernetes-production-deploy +layout: content_layout +products_used: + - Consul +description: This guide covers the necessary steps to install and configure a new Consul cluster on Kubernetes. +level: Advanced +--- + + +This guide covers the necessary steps to install and configure a new Consul +cluster on Kubernetes, as defined in the [Consul Reference Architecture +guide](/consul/day-1-operations/kubernetes-reference#consul-datacenter-deployed-in-kubernetes). +By the end of this guide, you will be able to identify the installation +prerequisites, customize the Helm chart to fit your environment requirements, +and interact with your new Consul cluster. + +~> You should have the following configured before starting this guide: Helm +installed and configured locally, tiller running in the Kubernetes cluster, and +the Kubernetes CLI configured. + +## Configure Kubernetes Permissions to Deploy Consul + +Before deploying Consul, you will need to create a new Kubernetes service +account with the correct permissions and authenticate it on the command +line. You will need Kubernetes operator permissions to create and modify +policies, deploy services, access the Kubernetes dashboard, create secrets, and +create RBAC objects. You can find documentation for RBAC and service accounts +for the following cloud providers. + +- [AKS](https://docs.microsoft.com/en-us/azure/aks/kubernetes-service-principal) +- [EKS](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) +- [GCP](https://console.cloud.google.com/iam-admin/serviceaccounts) + +Note, Consul can be deployed on any properly configured Kubernetes cluster in +the cloud or on premises. + +Once you have a service account, you will also need to add a permission to +deploy the Helm chart. This is done by creating a `clusterrolebinding`. + +```sh +$ kubectl create clusterrolebinding kubernetes-dashboard -n kube-system --clusterrole=cluster-admin --serviceaccount=kube-system:kubernetes-dashboard +``` + +Finally, you may need to create Kubernetes secrets to store Consul data. You +can reference these secrets in the customized Helm chart values file. + +- If you have purchased Enterprise Consul, the enterprise license file should be +used with the official image, `hashicorp/consul-enterprise:1.5.0-ent`. 
+ +- Enable +[encryption](https://www.consul.io/docs/agent/encryption.html#gossip-encryption) to secure gossip traffic within the Consul cluster. + + +~> Note, depending on your environment, the previous secrets may not be +necessary. + +## Configure Helm Chart + +Now that you have prepared your Kubernetes cluster, you can customize the Helm +chart. First, you will need to download the latest official Helm chart. + +```sh +$ git clone https://github.com/hashicorp/consul-helm.git +``` + +The `consul-helm` directory will contain a `values.yaml` file with example +parameters. You can update this file to customize your Consul deployment. Below +we detail some of the parameters you should customize and provide an example +file; however, you should consider your particular production needs when +configuring your chart. + +### Global Values + +The global values will affect all the other parameters in the chart. + +To enable all of the Consul components in the Helm chart, set `enabled` to +`true`. This means servers, clients, Consul DNS, and the Consul UI will be +installed with their defaults. You should also set the following global +parameters based on your specific environment requirements. + +- `image` is the name and tag of the Consul Docker image. +- `imagek8s` is the name and tag of the Docker image for the consul-k8s binary. +- `datacenter` is the name of your Consul datacenter. +- `domain` is the domain Consul uses for DNS queries. + +For security, set the `bootstrapACLs` parameter to `true`. This will enable +Kubernetes to initially set up Consul's [ACL +system](https://www.consul.io/docs/acl/acl-system.html). + +Read the Consul Helm chart documentation to review all the [global +parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-global). + +### Consul UI + +To enable the Consul web UI, update the `ui` section of your values file and set +`enabled` to `true`. + +Note, you can also set up a [load balancer +resource](https://github.com/hashicorp/demo-consul-101/tree/master/k8s#implement-load-balancer) +or other service type in Kubernetes to make it easier to access the UI. + +### Consul Servers + +For production deployments, you will need to deploy [3 or 5 Consul +servers](https://www.consul.io/docs/internals/consensus.html#deployment-table) +for quorum and failure tolerance. For most deployments, 3 servers are adequate. + +In the server section, set both `replicas` and `bootstrapExpect` to 3. This will +deploy three servers and cause Consul to wait to perform leader election until +all three are healthy. The `resources` will depend on your environment; in the +example at the end of the guide, the resources are set for a large environment. + +#### Affinity + +To ensure the Consul servers are placed on different Kubernetes nodes, you will +need to configure affinity. Otherwise, the failure of one Kubernetes node could +cause the loss of multiple Consul servers, and result in quorum loss. By +default, the example `values.yaml` has affinity configured correctly. + +#### Enterprise License + +If you have an [Enterprise +license](https://www.hashicorp.com/products/consul/enterprise), you should +reference the Kubernetes secret in the `enterpriseLicense` parameter. + +Read the Consul Helm chart documentation to review all the [server +parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-server). + +### Consul Clients + +A Consul client is deployed on every Kubernetes node, so you do not need to +specify the number of clients for your deployments.
You will need to specify +resources and enable gRPC. The resources in the example at the end of this guide +should be +sufficient for most production scenarios since Consul clients are designed for +horizontal scalability. Enabling `grpc` enables the gRPC listener on port 8502 +and exposes it to the host. It is required to use Consul Connect. + +Read the Consul Helm chart documentation to review all the [client +parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-client). + +### Consul Connect Injection Security + +Even though you enabled Consul server communication over Connect in the server section, you will also +need to enable `connectInject` by setting `enabled` to `true`. In the +`connectInject` section you will also configure security features. Enabling the +`default` parameter will allow the injector to automatically inject the Connect +sidecar into all pods. If you would prefer to manually annotate which pods to inject, you +can set this to `false`. Setting the `aclBindingRuleSelector` parameter to +`serviceaccount.name!=default` ensures that new services do not all receive the +same token if you are only using a default service account. This setting is +only necessary if you have enabled ACLs in the global section. + +Read more about the [Connect Inject +parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-connectinject). + +## Complete Example + +Your finished values file should resemble the following example. For more +complete descriptions of all the available parameters, see the `values.yaml` +file provided with the Helm chart and the [reference +documentation](https://www.consul.io/docs/platform/k8s/helm.html). + +```yaml +# Configure global settings in this section. +global: + # Enable all the components within this chart by default. + enabled: true + # Specify the Consul and consul-k8s images to use + image: "consul:1.5.0" + imagek8s: "hashicorp/consul-k8s:0.8.1" + domain: consul + datacenter: primarydc + # Bootstrap ACLs within Consul. This is highly recommended. + bootstrapACLs: true + # Gossip encryption + gossipEncryption: + secretName: "encrypt-key" + secretKey: "key" +# Configure your Consul servers in this section. +server: + enabled: true + connect: true + # Specify three servers that wait till all are healthy to bootstrap the Consul cluster. + replicas: 3 + bootstrapExpect: 3 + # Specify the resources that servers request for placement. These values will serve a large environment. + resources: | + requests: + memory: "32Gi" + cpu: "4" + disk: "50Gi" + limits: + memory: "32Gi" + cpu: "4" + disk: "50Gi" + # If using Enterprise, reference the Kubernetes secret that holds your license here + enterpriseLicense: + secretName: "consul-license" + secretKey: "key" + # Prevent Consul servers from co-location on Kubernetes nodes. + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: {{ template "consul.name" . }} + release: "{{ .Release.Name }}" + component: server + topologyKey: kubernetes.io/hostname +# Configure Consul clients in this section +client: + enabled: true + # Specify the resources that clients request for deployment. + resources: | + requests: + memory: "8Gi" + cpu: "2" + disk: "15Gi" + limits: + memory: "8Gi" + cpu: "2" + disk: "15Gi" + grpc: true +# Enable and configure the Consul UI. 
+ui: + enabled: true +# Configure security for Consul Connect pod injection +connectInject: + enabled: true + default: true + namespaceSelector: "my-namespace" + aclBindingRuleSelector: "serviceaccount.name!=default" +``` + +## Deploy Consul + +Now that you have customized the `values.yaml` file, you can deploy Consul with +Helm. This should only take a few minutes. The Consul pods should appear in the +Kubernetes dashboard immediately, and you can monitor the deployment process +there. + +```sh +$ helm install ./consul-helm -f values.yaml +``` + +To check the deployment process on the command line, you can use `kubectl`. + +```sh +$ kubectl get pods +``` + +## Summary + +In this guide, you configured Consul, using the Helm chart, for a production +environment. This involved ensuring that your cluster had a properly +distributed server cluster, specifying enough resources for your agents, +securing the cluster with ACLs and gossip encryption, and enabling other Consul +functionality including Connect and the Consul UI. + +Now you can interact with your Consul cluster through the UI or CLI. + +If you exposed the UI using a load balancer, it will be available at the +`LoadBalancer Ingress` IP address and `Port` that is output from the following +command. Note, you will need to replace _consul server_ with the server name +from your cluster. + +```sh +$ kubectl describe services consul-server +``` + +To access the Consul CLI, open a terminal session in one of the Consul pods +using the Kubernetes CLI. Replace `<pod-name>` with a pod name from +`kubectl get pods`. + +```sh +$ kubectl exec -it <pod-name> /bin/ash +``` + +To learn more about how to interact with your Consul cluster or use it for +service discovery, configuration, or segmentation, try one of Learn’s +[Operations or Development tracks](/consul/#advanced). Follow the [Security and +Networking track](/consul/?track=security-networking#security-networking) to +learn more about securing your Consul cluster. 
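+As a final sanity check, you can verify cluster membership from that CLI
+session, or directly with `kubectl exec` as sketched below. The pod name shown
+is only an example based on the chart's default naming; substitute a pod name
+reported by `kubectl get pods`.
+
+```sh
+# "consul-server-0" is an example pod name; replace it with a real pod name
+# from `kubectl get pods`. `consul members` lists every server and client
+# agent that has joined the datacenter.
+$ kubectl exec -it consul-server-0 -- consul members
+```
+
+A healthy deployment will list three servers plus one client per Kubernetes
+node, all with status `alive`.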