resolve circleci config conflicts

2019-07-23 20:18:36 -04:00 · 2019-07-23 20:18:36 -04:00 · ef6b80bab2
parent 768d038d6f d86efb83e5
commit ef6b80bab2
29 changed files with 423 additions and 105 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -77,16 +77,9 @@ jobs:
      - run: mkdir -p $TEST_RESULTS_DIR
      - run: sudo apt-get update && sudo apt-get install -y rsyslog
      - run: sudo service rsyslog start
-      # Use CircleCI test splitting by classname. Since there are no classes in go,
-      # we fake it by taking everything after github.com/hashicorp/consul/ and setting
-      # it as the classname.
-
-      # This loop writes go test results to <reportname>.xml per go package
      - run: |
-          for pkg in $(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess |circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do
-            reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g")
-            gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg
-          done
+          PACKAGE_NAMES=$(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess | circleci tests split --split-by=timings --timings-type=classname)
+          gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS -p 3 $PACKAGE_NAMES

      - store_test_results:
          path: /tmp/test-results
@ -108,18 +101,11 @@ jobs:
      - attach_workspace:
          at: /go/bin
      - run: mkdir -p $TEST_RESULTS_DIR
-      # Use CircleCI test splitting by classname. Since there are no classes in go,
-      # we fake it by taking everything after github.com/hashicorp/consul/ and setting
-      # it as the classname.
-
-      # This loop writes go test results to <reportname>.xml per go package
      - run:
          working_directory: api
          command: |
-            for pkg in $(go list ./... | circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do
-              reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g")
-              gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg
-            done
+            PACKAGE_NAMES=$(go list ./... | circleci tests split --split-by=timings --timings-type=classname)
+            gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS $PACKAGE_NAMES

      - store_test_results:
          path: /tmp/test-results
@ -476,7 +462,25 @@ jobs:
            git_merge_branch="ci/master-merge-$(date +%Y%m%d%H%M%S)"
            git checkout -b "${git_merge_branch}"
            latest_oss_commit="$(git rev-parse origin/master)"
-            git merge -m "Merge Consul OSS branch 'master' at commit ${latest_oss_commit}" "${latest_oss_commit}"
+
+            if ! errors=$(git merge -m "Merge Consul OSS branch 'master' at commit ${latest_oss_commit}" "${latest_oss_commit}"); then
+              printf "oss/master merge into ${CIRCLE_BRANCH} failed because git was unable to auto-merge!\n${errors}"
+              curl -X POST -H 'Content-type: application/json' \
+              --data \
+              "{ \
+                \"attachments\": [ \
+                  { \
+                    \"fallback\": \"master merge into ${CIRCLE_BRANCH} failed because git was unable to auto-merge!\", \
+                    \"text\": \"Nightly *master* merge into *${CIRCLE_BRANCH}* failed!\n\nBuild Log: ${CIRCLE_BUILD_URL}\n\nGit was unable to auto-merge due to possible merge conflict.\n\n*Errors:*\n${errors}\", \
+                    \"footer\": \"${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}\", \
+                    \"ts\": \"$(date +%s)\", \
+                    \"color\": \"danger\" \
+                  } \
+                ] \
+              }" ${CONSUL_SLACK_WEBHOOK_URL}
+              exit 1
+            fi
+
            git push origin "${git_merge_branch}"
            sleep 15  # Wait for merge branch to start CircleCI pipeline

@ -568,28 +572,26 @@ workflows:
            branches:
              only:
                - release/1-6
-  build-distros:
+  go-tests:
    jobs:
      - lint-consul-retry
-      - go-fmt-and-vet:
+      - go-fmt-and-vet
+      - dev-build:
          requires:
            - lint-consul-retry
-      - build-386: &require-go-fmt-vet
-          requires:
            - go-fmt-and-vet
-      - build-amd64: *require-go-fmt-vet
-      - build-arm-arm64: *require-go-fmt-vet
-  test-integrations:
-    jobs:
-      - dev-build
      - go-test: &go-test
          requires:
            - dev-build
-          filters:
-            branches:
-              ignore:
-                - /^pull\/.*$/ # only run go tests on non forks
      - go-test-api: *go-test
+  build-distros:
+    jobs:
+      - build-386
+      - build-amd64
+      - build-arm-arm64
+  test-integrations:
+    jobs:
+      - dev-build
      - dev-upload-s3:
          requires:
            - dev-build
--- a/.travis.yml
+++ b/.travis.yml
@ -1,23 +0,0 @@
-language: go
-
-go:
-  # Please keep this in-sync with the go version we build against in
-  # build-support/docker/Build-Go.dockerfile.
-  - "1.12.1"
-
-branches:
-  only:
-    - master
-    - release/1-6
-
-matrix:
-  include:
-    - env: GOTEST_PKGS="./api"
-    - env: GOTEST_PKGS="./agent"
-    - env: GOTEST_PKGS="./agent/consul"
-    - env: GOTEST_PKGS_EXCLUDE="./api|./agent|./agent/consul"
-
-script:
-  - make test-ci
-
-sudo: false
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,8 @@ FEATURES:

 IMPROVEMENTS:

+* raft: allow trailing logs to be configured as an escape hatch for extreme load that prevents followers catching up with leader [[GH-6186](https://github.com/hashicorp/consul/pull/6186)]
+* agent: added configurable limit for log files to be rotated [[GH-5831](https://github.com/hashicorp/consul/pull/5831)]
 * agent: health checks: change long timeout behavior to use to user-configured `timeout` value [[GH-6094](https://github.com/hashicorp/consul/pull/6094)]
 * api: Update filtering language to include substring and regular expression matching on string values [[GH-6190](https://github.com/hashicorp/consul/pull/6190)]
 * api: Display allowed HTTP CIDR information nicely [[GH-6029](https://github.com/hashicorp/consul/pull/6029)]
--- a/agent/agent.go
+++ b/agent/agent.go
@ -1162,6 +1162,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
 	if a.config.RaftSnapshotInterval != 0 {
 		base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval
 	}
+	if a.config.RaftTrailingLogs != 0 {
+		base.RaftConfig.TrailingLogs = uint64(a.config.RaftTrailingLogs)
+	}
 	if a.config.ACLMasterToken != "" {
 		base.ACLMasterToken = a.config.ACLMasterToken
 	}
--- a/agent/agent_endpoint.go
+++ b/agent/agent_endpoint.go
@ -141,7 +141,7 @@ func (s *HTTPServer) AgentReload(resp http.ResponseWriter, req *http.Request) (i
 	}

 	// Trigger the reload
-	errCh := make(chan error, 0)
+	errCh := make(chan error)
 	select {
 	case <-s.agent.shutdownCh:
 		return nil, fmt.Errorf("Agent was shutdown before reload could be completed")
--- a/agent/agent_endpoint_test.go
+++ b/agent/agent_endpoint_test.go
@ -564,7 +564,7 @@ func TestAgent_Service(t *testing.T) {
 			}
 			start := time.Now()
 			obj, err := a.srv.AgentService(resp, req)
-			elapsed := time.Now().Sub(start)
+			elapsed := time.Since(start)

 			if tt.wantErr != "" {
 				require.Error(err)
@ -5350,7 +5350,7 @@ func TestAgentConnectProxyConfig_Blocking(t *testing.T) {
 			}
 			start := time.Now()
 			obj, err := a.srv.AgentConnectProxyConfig(resp, req)
-			elapsed := time.Now().Sub(start)
+			elapsed := time.Since(start)

 			if tt.wantErr {
 				require.Error(err)
--- a/agent/agent_test.go
+++ b/agent/agent_test.go
@ -3925,7 +3925,7 @@ func TestAgent_ReloadConfigTLSConfigFailure(t *testing.T) {
 	require.Len(t, tlsConf.RootCAs.Subjects(), 1)
 }

-func TestAgent_consulConfig(t *testing.T) {
+func TestAgent_consulConfig_AutoEncryptAllowTLS(t *testing.T) {
 	t.Parallel()
 	dataDir := testutil.TempDir(t, "agent") // we manage the data dir
 	defer os.RemoveAll(dataDir)
@ -3941,3 +3941,13 @@ func TestAgent_consulConfig(t *testing.T) {
 	defer a.Shutdown()
 	require.True(t, a.consulConfig().AutoEncryptAllowTLS)
 }
+
+func TestAgent_consulConfig_RaftTrailingLogs(t *testing.T) {
+	t.Parallel()
+	hcl := `
+		raft_trailing_logs = 812345
+	`
+	a := NewTestAgent(t, t.Name(), hcl)
+	defer a.Shutdown()
+	require.Equal(t, uint64(812345), a.consulConfig().RaftConfig.TrailingLogs)
+}
--- a/agent/cache/cache.go
+++ b/agent/cache/cache.go
@ -703,7 +703,7 @@ func (c *Cache) runExpiryLoop() {
 		c.entriesLock.RLock()
 		if len(c.entriesExpiryHeap.Entries) > 0 {
 			entry = c.entriesExpiryHeap.Entries[0]
-			expiryTimer = time.NewTimer(entry.Expires.Sub(time.Now()))
+			expiryTimer = time.NewTimer(time.Until(entry.Expires))
 			expiryCh = expiryTimer.C
 		}
 		c.entriesLock.RUnlock()
--- a/agent/catalog_endpoint.go
+++ b/agent/catalog_endpoint.go
@ -189,7 +189,7 @@ func (s *HTTPServer) CatalogServices(resp http.ResponseWriter, req *http.Request

 	// Use empty map instead of nil
 	if out.Services == nil {
-		out.Services = make(structs.Services, 0)
+		out.Services = make(structs.Services)
 	}
 	metrics.IncrCounterWithLabels([]string{"client", "api", "success", "catalog_services"}, 1,
 		[]metrics.Label{{Name: "node", Value: s.nodeName()}})
--- a/agent/config/builder.go
+++ b/agent/config/builder.go
@ -862,6 +862,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
 		RaftProtocol:                            b.intVal(c.RaftProtocol),
 		RaftSnapshotThreshold:                   b.intVal(c.RaftSnapshotThreshold),
 		RaftSnapshotInterval:                    b.durationVal("raft_snapshot_interval", c.RaftSnapshotInterval),
+		RaftTrailingLogs:                        b.intVal(c.RaftTrailingLogs),
 		ReconnectTimeoutLAN:                     b.durationVal("reconnect_timeout", c.ReconnectTimeoutLAN),
 		ReconnectTimeoutWAN:                     b.durationVal("reconnect_timeout_wan", c.ReconnectTimeoutWAN),
 		RejoinAfterLeave:                        b.boolVal(c.RejoinAfterLeave),
--- a/agent/config/config.go
+++ b/agent/config/config.go
@ -241,6 +241,7 @@ type Config struct {
 	RaftProtocol                     *int                     `json:"raft_protocol,omitempty" hcl:"raft_protocol" mapstructure:"raft_protocol"`
 	RaftSnapshotThreshold            *int                     `json:"raft_snapshot_threshold,omitempty" hcl:"raft_snapshot_threshold" mapstructure:"raft_snapshot_threshold"`
 	RaftSnapshotInterval             *string                  `json:"raft_snapshot_interval,omitempty" hcl:"raft_snapshot_interval" mapstructure:"raft_snapshot_interval"`
+	RaftTrailingLogs                 *int                     `json:"raft_trailing_logs,omitempty" hcl:"raft_trailing_logs" mapstructure:"raft_trailing_logs"`
 	ReconnectTimeoutLAN              *string                  `json:"reconnect_timeout,omitempty" hcl:"reconnect_timeout" mapstructure:"reconnect_timeout"`
 	ReconnectTimeoutWAN              *string                  `json:"reconnect_timeout_wan,omitempty" hcl:"reconnect_timeout_wan" mapstructure:"reconnect_timeout_wan"`
 	RejoinAfterLeave                 *bool                    `json:"rejoin_after_leave,omitempty" hcl:"rejoin_after_leave" mapstructure:"rejoin_after_leave"`
--- a/agent/config/runtime.go
+++ b/agent/config/runtime.go
@ -965,6 +965,22 @@ type RuntimeConfig struct {
 	// hcl: raft_snapshot_threshold = int
 	RaftSnapshotInterval time.Duration

+	// RaftTrailingLogs sets the number of log entries that will be left in the
+	// log store after a snapshot. This must be large enough that a follower can
+	// transfer and restore an entire snapshot of the state before this many new
+	// entries have been appended. In the vast majority of cases the default is plenty
+	// but if there is a sustained high write throughput coupled with a huge
+	// multi-gigabyte snapshot setting this higher may be necessary to allow
+	// followers time to reload from snapshot without becoming unhealthy. If it's
+	// too low then followers are unable to ever recover from a restart and will
+	// enter a loop of constantly downloading full snapshots and never catching
+	// up. If you need to change this you should reconsider your usage of Consul
+	// as it is not designed to store multiple-gigabyte data sets with high write
+	// throughput. Defaults to 10000.
+	//
+	// hcl: raft_trailing_logs = int
+	RaftTrailingLogs int
+
 	// ReconnectTimeoutLAN specifies the amount of time to wait to reconnect with
 	// another agent before deciding it's permanently gone. This can be used to
 	// control the time it takes to reap failed nodes from the cluster.
--- a/agent/config/runtime_test.go
+++ b/agent/config/runtime_test.go
@ -3767,6 +3767,7 @@ func TestFullConfig(t *testing.T) {
 			"raft_protocol": 19016,
 			"raft_snapshot_threshold": 16384,
 			"raft_snapshot_interval": "30s",
+			"raft_trailing_logs": 83749,
 			"reconnect_timeout": "23739s",
 			"reconnect_timeout_wan": "26694s",
 			"recursors": [ "63.38.39.58", "92.49.18.18" ],
@ -4371,6 +4372,7 @@ func TestFullConfig(t *testing.T) {
 			raft_protocol = 19016
 			raft_snapshot_threshold = 16384
 			raft_snapshot_interval = "30s"
+			raft_trailing_logs = 83749
 			reconnect_timeout = "23739s"
 			reconnect_timeout_wan = "26694s"
 			recursors = [ "63.38.39.58", "92.49.18.18" ]
@ -5043,6 +5045,7 @@ func TestFullConfig(t *testing.T) {
 		RaftProtocol:                     19016,
 		RaftSnapshotThreshold:            16384,
 		RaftSnapshotInterval:             30 * time.Second,
+		RaftTrailingLogs:                 83749,
 		ReconnectTimeoutLAN:              23739 * time.Second,
 		ReconnectTimeoutWAN:              26694 * time.Second,
 		RejoinAfterLeave:                 true,
@ -5901,6 +5904,7 @@ func TestSanitize(t *testing.T) {
 		"RaftProtocol": 0,
 		"RaftSnapshotInterval": "0s",
 		"RaftSnapshotThreshold": 0,
+		"RaftTrailingLogs": 0,
 		"ReconnectTimeoutLAN": "0s",
 		"ReconnectTimeoutWAN": "0s",
 		"RejoinAfterLeave": false,
--- a/agent/connect/ca/provider_consul_test.go
+++ b/agent/connect/ca/provider_consul_test.go
@ -178,7 +178,7 @@ func TestConsulCAProvider_SignLeaf(t *testing.T) {
 		require.Equal(parsed.SerialNumber.Uint64(), uint64(2))

 		// Ensure the cert is valid now and expires within the correct limit.
-		require.True(parsed.NotAfter.Sub(time.Now()) < 3*24*time.Hour)
+		require.True(time.Until(parsed.NotAfter) < 3*24*time.Hour)
 		require.True(parsed.NotBefore.Before(time.Now()))
 	}

--- a/agent/connect/ca/provider_vault_test.go
+++ b/agent/connect/ca/provider_vault_test.go
@ -186,7 +186,7 @@ func TestVaultCAProvider_SignLeaf(t *testing.T) {
 		require.NotEqual(firstSerial, parsed.SerialNumber.Uint64())

 		// Ensure the cert is valid now and expires within the correct limit.
-		require.True(parsed.NotAfter.Sub(time.Now()) < time.Hour)
+		require.True(time.Until(parsed.NotAfter) < time.Hour)
 		require.True(parsed.NotBefore.Before(time.Now()))
 	}
 }
--- a/agent/consul/acl_replication_types.go
+++ b/agent/consul/acl_replication_types.go
@ -316,7 +316,7 @@ func (r *aclRoleReplicator) FetchUpdated(srv *Server, updates []string) (int, er
 				delete(keep, role.ID)
 			}
 			missing := make([]string, 0, len(keep))
-			for id, _ := range keep {
+			for id := range keep {
 				missing = append(missing, id)
 			}
 			return 0, fmt.Errorf("role replication trying to replicated uncached roles with IDs: %v", missing)
--- a/agent/consul/kvs_endpoint_test.go
+++ b/agent/consul/kvs_endpoint_test.go
@ -596,11 +596,7 @@ key "zip" {
 		t.Fatalf("err: %v", err)
 	}

-	actualKeys = []string{}
-
-	for _, key := range keyList.Keys {
-		actualKeys = append(actualKeys, key)
-	}
+	actualKeys = keyList.Keys

 	verify.Values(t, "", actualKeys, expectedKeys)

--- a/agent/consul/leader.go
+++ b/agent/consul/leader.go
@ -1192,7 +1192,7 @@ func (s *Server) pruneCARoots() error {

 	var newRoots structs.CARoots
 	for _, r := range roots {
-		if !r.Active && !r.RotatedOutAt.IsZero() && time.Now().Sub(r.RotatedOutAt) > common.LeafCertTTL*2 {
+		if !r.Active && !r.RotatedOutAt.IsZero() && time.Since(r.RotatedOutAt) > common.LeafCertTTL*2 {
 			s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID)
 			continue
 		}
--- a/agent/consul/server_lookup.go
+++ b/agent/consul/server_lookup.go
@ -51,8 +51,7 @@ func (sl *ServerLookup) ServerAddr(id raft.ServerID) (raft.ServerAddress, error)
 func (sl *ServerLookup) Server(addr raft.ServerAddress) *metadata.Server {
 	sl.lock.RLock()
 	defer sl.lock.RUnlock()
-	svr, _ := sl.addressToServer[addr]
-	return svr
+	return sl.addressToServer[addr]
 }

 func (sl *ServerLookup) Servers() []*metadata.Server {
--- a/agent/consul/server_test.go
+++ b/agent/consul/server_test.go
@ -183,6 +183,10 @@ func newServer(c *Config) (*Server, error) {
 			oldNotify()
 		}
 	}
+	// Restore old notify to guard against re-closing `up` on a retry
+	defer func() {
+		c.NotifyListen = oldNotify
+	}()

 	// start server
 	w := c.LogOutput
@ -820,7 +824,6 @@ func TestServer_BadExpect(t *testing.T) {
 type fakeGlobalResp struct{}

 func (r *fakeGlobalResp) Add(interface{}) {
-	return
 }

 func (r *fakeGlobalResp) New() interface{} {
--- a/agent/consul/state/acl_test.go
+++ b/agent/consul/state/acl_test.go
@ -3824,11 +3824,11 @@ func stripIrrelevantTokenFields(token *structs.ACLToken) *structs.ACLToken {
 	// When comparing the tokens disregard the policy link names.  This
 	// data is not cleanly updated in a variety of scenarios and should not
 	// be relied upon.
-	for i, _ := range tokenCopy.Policies {
+	for i := range tokenCopy.Policies {
 		tokenCopy.Policies[i].Name = ""
 	}
 	// Also do the same for Role links.
-	for i, _ := range tokenCopy.Roles {
+	for i := range tokenCopy.Roles {
 		tokenCopy.Roles[i].Name = ""
 	}
 	// The raft indexes won't match either because the requester will not
--- a/agent/http.go
+++ b/agent/http.go
@ -293,7 +293,7 @@ func (s *HTTPServer) handler(enableDebug bool) http.Handler {
 	mux.HandleFunc("/", s.Index)
 	for pattern, fn := range endpoints {
 		thisFn := fn
-		methods, _ := allowedMethods[pattern]
+		methods := allowedMethods[pattern]
 		bound := func(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
 			return thisFn(s, resp, req)
 		}
--- a/agent/router/manager.go
+++ b/agent/router/manager.go
@ -342,8 +342,6 @@ func (m *Manager) RebalanceServers() {
 		// continue to use the existing connection until the next
 		// rebalance occurs.
 	}
-
-	return
 }

 // reconcileServerList returns true when the first server in serverList
--- a/agent/ui_endpoint_test.go
+++ b/agent/ui_endpoint_test.go
@ -60,7 +60,7 @@ func TestUiIndex(t *testing.T) {
 	// Verify the body
 	out := bytes.NewBuffer(nil)
 	io.Copy(out, resp.Body)
-	if string(out.Bytes()) != "test" {
+	if out.String() != "test" {
 		t.Fatalf("bad: %s", out.Bytes())
 	}
 }
--- a/command/debug/debug.go
+++ b/command/debug/debug.go
@ -302,7 +302,7 @@ func (c *cmd) captureStatic() error {
 	var errors error

 	// Collect the named outputs here
-	outputs := make(map[string]interface{}, 0)
+	outputs := make(map[string]interface{})

 	// Capture host information
 	if c.configuredTarget("host") {
--- a/lib/telemetry.go
+++ b/lib/telemetry.go
@ -225,7 +225,7 @@ func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) {
 				continue
 			}
 		case reflect.Bool:
-			if f.Bool() != false {
+			if f.Bool() {
 				continue
 			}
 		default:
--- a/sdk/testutil/retry/retry.go
+++ b/sdk/testutil/retry/retry.go
@ -110,7 +110,7 @@ func dedup(a []string) string {
 			delete(m, s)
 		}
 	}
-	return string(b.Bytes())
+	return b.String()
 }

 func run(r Retryer, t Failer, f func(r *R)) {
--- a/website/source/docs/agent/options.html.md
+++ b/website/source/docs/agent/options.html.md
@ -407,21 +407,6 @@ will exit with an error at startup.
  [Raft Protocol Version Compatibility](/docs/upgrade-specific.html#raft-protocol-version-compatibility)
  for more details.

-* <a name="_raft_snapshot_threshold"></a><a href="#_raft_snapshot_threshold">`-raft-snapshot-threshold`</a> - This controls the
-  minimum number of raft commit entries between snapshots that are saved to disk. This is a low-level parameter that should
-  rarely need to be changed. Very busy clusters experiencing excessive disk IO may increase this value to reduce disk IO, and minimize
-  the chances of all servers taking snapshots at the same time. Increasing this trades off disk IO for disk space since the log will
-  grow much larger and the space in the raft.db file can't be reclaimed till the next snapshot. Servers may take longer to recover from
-  crashes or failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this
-  defaults to 16384, and in prior versions it was set to 8192.
-
-* <a name="_raft_snapshot_interval"></a><a href="#_raft_snapshot_interval">`-raft-snapshot-interval`</a> - This controls how often servers
-  check if they need to save a snapshot to disk. his is a low-level parameter that should rarely need to be changed. Very busy clusters
-  experiencing excessive disk IO may increase this value to reduce disk IO, and minimize the chances of all servers taking snapshots at the same time.
-  Increasing this trades off disk IO for disk space since the log will grow much larger and the space in the raft.db file can't be reclaimed
-  till the next snapshot. Servers may take longer to recover from crashes or failover if this is increased significantly as more logs
-  will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`.
-
 * <a name="_recursor"></a><a href="#_recursor">`-recursor`</a> - Specifies the address of an upstream DNS
  server. This option may be provided multiple times, and is functionally
  equivalent to the [`recursors` configuration option](#recursors).
@ -1431,11 +1416,46 @@ default will automatically work with some tooling.
 * <a name="raft_protocol"></a><a href="#raft_protocol">`raft_protocol`</a> Equivalent to the
  [`-raft-protocol` command-line flag](#_raft_protocol).

-* <a name="raft_snapshot_threshold"></a><a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> Equivalent to the
-  [`-raft-snapshot-threshold` command-line flag](#_raft_snapshot_threshold).
+<!-- Note the extra _ anchors are here because we used to erroneously list these as
+command line flags even though they are not actually defined as valid flags and can 
+only be set in config file. Duplicating the anchor preserves any existing external links 
+to the old fragment -->
+* <a name="raft_snapshot_threshold"></a><a name="_raft_snapshot_threshold"></a>
+  <a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> This controls
+  the minimum number of raft commit entries between snapshots that are saved to
+  disk. This is a low-level parameter that should rarely need to be changed.
+  Very busy clusters experiencing excessive disk IO may increase this value to
+  reduce disk IO, and minimize the chances of all servers taking snapshots at
+  the same time. Increasing this trades off disk IO for disk space since the log
+  will grow much larger and the space in the raft.db file can't be reclaimed
+  till the next snapshot. Servers may take longer to recover from crashes or
+  failover if this is increased significantly as more logs will need to be
+  replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior
+  versions it was set to 8192.

-* <a name="raft_snapshot_interval"></a><a href="#raft_snapshot_interval">`raft_snapshot_interval`</a> Equivalent to the
-  [`-raft-snapshot-interval` command-line flag](#_raft_snapshot_interval).
+* <a name="raft_snapshot_interval"></a><a name="_raft_snapshot_interval"></a> <a
+  href="#raft_snapshot_interval">`raft_snapshot_interval`</a> This controls how
+  often servers check if they need to save a snapshot to disk. This is a
+  low-level parameter that should rarely need to be changed. Very busy clusters
+  experiencing excessive disk IO may increase this value to reduce disk IO, and
+  minimize the chances of all servers taking snapshots at the same time.
+  Increasing this trades off disk IO for disk space since the log will grow much
+  larger and the space in the raft.db file can't be reclaimed till the next
+  snapshot. Servers may take longer to recover from crashes or failover if this
+  is increased significantly as more logs will need to be replayed. In Consul
+  1.1.0 and later this defaults to `30s`, and in prior versions it was set to
+  `5s`.
+
+* <a name="raft_trailing_logs"></a><a
+  href="#raft_trailing_logs">`raft_trailing_logs`</a> - This controls how many
+  log entries are left in the log store on disk after a snapshot is made. This
+  should only be adjusted when followers cannot catch up to the leader due to a
+  very large snapshot size and high write throughput causing log truncation
+  before a snapshot can be fully installed. If you need to use this to recover
+  a cluster, consider reducing write throughput or the amount of data stored on
+  Consul as it is likely under a load it is not designed to handle. The default
+  value is 10000 which is suitable for all normal workloads. Added in Consul
+  1.5.3.

 * <a name="reap"></a><a href="#reap">`reap`</a> This controls Consul's automatic reaping of child processes,
  which is useful if Consul is running as PID 1 in a Docker container. If this isn't specified, then Consul will
--- a/website/source/docs/guides/kubernetes-production-deploy.md
+++ b/website/source/docs/guides/kubernetes-production-deploy.md
@ -0,0 +1,286 @@
+---
+name: "Consul-Kubernetes Deployment Guide"
+content_length: 14
+id: kubernetes-production-deploy
+layout: content_layout
+products_used:
+  - Consul
+description: This guide covers the necessary steps to install and configure a new Consul cluster on Kubernetes.
+level: Advanced
+---
+
+
+This guide covers the necessary steps to install and configure a new Consul
+cluster on Kubernetes, as defined in the [Consul Reference Architecture
+guide](/consul/day-1-operations/kubernetes-reference#consul-datacenter-deployed-in-kubernetes).
+By the end of this guide, you will be able to identify the installation
+prerequisites, customize the Helm chart to fit your environment requirements,
+and interact with your new Consul cluster.   
+
+~> You should have the following configured before starting this guide: Helm
+installed and configured locally, tiller running in the Kubernetes cluster, and
+the Kubernetes CLI configured. 
+
+## Configure Kubernetes Permissions to Deploy Consul
+
+Before deploying Consul, you will need to create a new Kubernetes service
+account with the correct permissions and to authenticate it on the command
+line. You will need Kubernetes operator permissions to create and modify
+policies, deploy services, access the Kubernetes dashboard, create secrets, and
+create RBAC objects. You can find documentation for RBAC and service accounts
+for the following cloud providers. 
+ 
+- [AKS](https://docs.microsoft.com/en-us/azure/aks/kubernetes-service-principal) 
+- [EKS](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) 
+- [GCP](https://console.cloud.google.com/iam-admin/serviceaccounts)
+
+Note, Consul can be deployed on any properly configured Kubernetes cluster in
+the cloud or on premises. 
+
+Once you have a service account, you will also need to add a permission to
+deploy the helm chart. This is done with the `clusterrolebinding` method. 
+
+```sh
+$ kubectl create clusterrolebinding kubernetes-dashboard -n kube-system --clusterrole=cluster-admin --serviceaccount=kube-system:kubernetes-dashboard
+```
+
+Finally, you may need to create Kubernetes secrets to store Consul data. You
+can reference these secrets in the customized Helm chart values file. 
+
+- If you have purchased Enterprise Consul, the enterprise license file should be
+used with the official image,  `hashicorp/consul-enterprise:1.5.0-ent`.  
+
+- Enable
+[encryption](https://www.consul.io/docs/agent/encryption.html#gossip-encryption) to secure gossip traffic within the Consul cluster. 
+
+
+~> Note, depending on your environment, the previous secrets may not be
+necessary.  
+
+## Configure Helm Chart 
+
+Now that you have prepared your Kubernetes cluster, you can customize the Helm
+chart. First, you will need to download the latest official Helm chart.
+
+```sh 
+$ git clone https://github.com/hashicorp/consul-helm.git 
+```
+
+The `consul-helm` directory will contain a `values.yaml` file with example
+parameters. You can update this file to customize your Consul deployment. Below
+we detail some of the parameters you should customize and provide an example
+file, however you should consider your particular production needs when
+configuring your chart. 
+
+### Global Values
+
+The global values will affect all the other parameters in the chart. 
+
+To enable all of the Consul components in the Helm chart, set `enabled` to
+`true`. This means servers, clients, Consul DNS, and the Consul UI will be
+installed with their defaults. You should also set the following global
+parameters based on your specific environment requirements. 
+
+- `image` is the name and tag of the Consul Docker image.  
+- `imagek8s` is the name and tag of the Docker image for the consul-k8s binary.  
+- `datacenter` the name of your Consul datacenter.  
+- `domain` the domain Consul uses for DNS queries. 
+
+For security, set the `bootstrapACLs` parameter to `true`. This will enable
+Kubernetes to initially set up Consul's [ACL
+system](https://www.consul.io/docs/acl/acl-system.html).
+
+Read the Consul Helm chart documentation to review all the [global
+parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-global).
+
+### Consul UI
+
+To enable the Consul web UI, update the `ui` section of your values file and set
+`enabled` to `true`. 
+
+Note, you can also set up a [loadbalancer
+resource](https://github.com/hashicorp/demo-consul-101/tree/master/k8s#implement-load-balancer)
+or other service type in Kubernetes to make it easier to access the UI.  
+
+### Consul Servers
+
+For production deployments, you will need to deploy [3 or 5 Consul
+servers](https://www.consul.io/docs/internals/consensus.html#deployment-table)
+for quorum and failure tolerance. For most deployments, 3 servers are adequate.
+
+In the server section set both `replicas` and `bootstrapExpect` to 3. This will
+deploy three servers and cause Consul to wait to perform leader election until
+all three are healthy. The `resources` will depend on your environment; in the
+example at the end of the guide, the resources are set for a large environment. 
+
+#### Affinity
+
+To ensure the Consul servers are placed on different Kubernetes nodes, you will
+need to configure affinity. Otherwise, the failure of one Kubernetes node could
+cause the loss of multiple Consul servers, and result in quorum loss. By
+default, the example `values.yaml` has affinity configured correctly.  
+
+#### Enterprise License
+
+If you have an [Enterprise
+license](https://www.hashicorp.com/products/consul/enterprise) you should
+reference the Kubernetes secret in the `enterpriseLicense` parameter.
+
+Read the Consul Helm chart documentation to review all the [server
+parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-server)
+
+### Consul Clients
+
+A Consul client is deployed on every Kubernetes node, so you do not need to
+specify the number of clients for your deployments. You will need to specify
+resources and enable gRPC. The resources in the example at the end of this guide
+should be
+sufficient for most production scenarios since Consul clients are designed for
+horizontal scalability. Enabling `grpc` enables the GRPC listener on port 8502
+and exposes it to the host. It is required to use Consul Connect.
+
+Read the Consul Helm chart documentation to review all the [client
+parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-client)
+
+### Consul Connect Injection Security
+
+Even though you enabled Consul server communication over Connect in the server section, you will also
+need to enable `connectInject` by setting `enabled` to `true`. In the
+`connectInject` section you will also configure security features. Enabling the
+`default` parameter will allow the injector to automatically inject the Connect
+sidecar into all pods. If you would prefer to manually annotate which pods to inject, you
+can set this to false. Setting the `aclBindingRuleSelector` parameter to
+`serviceaccount.name!=default` ensures that new services do not all receive the
+same token if you are only using a default service account. This setting is
+only necessary if you have enabled ACLs in the global section.
+
+Read more about the [Connect Inject
+parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-connectinject).
+
+## Complete Example
+
+Your finished values file should resemble the following example. For more
+complete descriptions of all the available parameters see the `values.yaml`
+file  provided with the Helm chart and the [reference
+documentation](https://www.consul.io/docs/platform/k8s/helm.html). 
+
+```yaml
+# Configure global settings in this section.
+global:
+  # Enable all the components within this chart by default.
+  enabled: true
+  # Specify the Consul and consul-k8s  images to use
+  image: "consul:1.5.0"
+  imagek8s: "hashicorp/consul-k8s:0.8.1"
+  domain: consul
+  datacenter: primarydc
+  # Bootstrap ACLs within Consul. This is highly recommended.
+  bootstrapACLs: true
+  # Gossip encryption
+  gossipEncryption:
+    secretName: "encrypt-key"
+    secretKey: "key"
+# Configure your Consul servers in this section.
+server:
+  enabled: true
+  connect: true
+  # Specify three servers that wait till all are healthy to bootstrap the Consul cluster.
+  replicas: 3
+  bootstrapExpect: 3
+  # Specify the resources that servers request for placement. These values will serve a large environment.
+  resources: |
+    requests:
+      memory: "32Gi"
+      cpu: "4"
+      disk: "50Gi"
+    limits:
+      memory: "32Gi"
+      cpu: "4"
+      disk: "50Gi"
+  # If using Enterprise, reference the Kubernetes secret that holds your license here
+  enterpriseLicense:
+    secretName: "consul-license"
+    secretKey: "key"
+  # Prevent Consul servers from co-location on Kubernetes nodes.
+  affinity: |
+    podAntiAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        - labelSelector:
+            matchLabels:
+              app: {{ template "consul.name" . }}
+              release: "{{ .Release.Name }}"
+              component: server
+          topologyKey: kubernetes.io/hostname
+# Configure Consul clients in this section
+client:
+  enabled: true
+  # Specify the resources that clients request for deployment. 
+  resources: |
+    requests:
+      memory: "8Gi"
+      cpu: "2"
+      disk: "15Gi"
+    limits:
+      memory: "8Gi"
+      cpu: "2"
+      disk: "15Gi"
+  grpc: true
+# Enable and configure the Consul UI.
+ui:
+  enabled: true
+# Configure security for Consul Connect pod injection
+connectInject:
+  enabled: true
+  default: true
+  namespaceSelector: "my-namespace"
+  aclBindingRuleSelector: "serviceaccount.name!=default"
+```
+## Deploy Consul 
+
+Now that you have customized the `values.yaml` file, you can deploy Consul with
+Helm. This should only take a few minutes. The Consul pods should appear in the
+Kubernetes dashboard immediately and you can monitor the deployment process
+there.
+
+```sh 
+$ helm install ./consul-helm -f values.yaml 
+```
+
+To check the deployment process on the command line you can use `kubectl`.
+
+```sh 
+$ kubectl get pods 
+```
+
+## Summary
+
+In this guide, you configured Consul, using the Helm chart, for a production
+environment. This involved ensuring that your cluster had a properly
+distributed server cluster, specifying enough resources for your agents,
+securing the cluster with ACLs and gossip encryption, and enabling other Consul
+functionality including Connect and the Consul UI. 
+
+Now you can interact with your Consul cluster through the UI or CLI. 
+
+If you exposed the UI using a load balancer it will be available at the
+`LoadBalancer Ingress` IP address and `Port` that is output from the following
+command. Note, you will need to replace `consul-server` with the server name
+from your cluster.
+
+```sh 
+$ kubectl describe services consul-server 
+``` 
+
+To access the Consul CLI, open a terminal session using the Kubernetes CLI.
+
+```sh 
+$ kubectl exec <pod name> -it /bin/ash 
+```
+
+To learn more about how to interact with your Consul cluster or use it for
+service discovery, configuration or segmentation, try one of Learn’s
+[Operations or Development tracks](/consul/#advanced). Follow the [Security and
+Networking track](/consul/?track=security-networking#security-networking) to
+learn more about securing your Consul cluster.
+
+