From 7753b97cc7dd597a7ed3a4fbb14a6906e36ab108 Mon Sep 17 00:00:00 2001 From: Christian Muehlhaeuser Date: Sat, 20 Jul 2019 15:37:19 +0200 Subject: [PATCH 1/9] Simplified code in various places (#6176) All these changes should have no side-effects or change behavior: - Use bytes.Buffer's String() instead of a conversion - Use time.Since and time.Until where fitting - Drop unnecessary returns and assignment --- agent/agent_endpoint.go | 2 +- agent/agent_endpoint_test.go | 4 ++-- agent/cache/cache.go | 2 +- agent/catalog_endpoint.go | 2 +- agent/connect/ca/provider_consul_test.go | 2 +- agent/connect/ca/provider_vault_test.go | 2 +- agent/consul/acl_replication_types.go | 2 +- agent/consul/kvs_endpoint_test.go | 6 +----- agent/consul/leader.go | 2 +- agent/consul/server_lookup.go | 3 +-- agent/consul/server_test.go | 1 - agent/consul/state/acl_test.go | 4 ++-- agent/http.go | 2 +- agent/router/manager.go | 2 -- agent/ui_endpoint_test.go | 2 +- command/debug/debug.go | 2 +- lib/telemetry.go | 2 +- sdk/testutil/retry/retry.go | 2 +- 18 files changed, 18 insertions(+), 26 deletions(-) diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index e25f1ea6b1..4cecd1d41f 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -141,7 +141,7 @@ func (s *HTTPServer) AgentReload(resp http.ResponseWriter, req *http.Request) (i } // Trigger the reload - errCh := make(chan error, 0) + errCh := make(chan error) select { case <-s.agent.shutdownCh: return nil, fmt.Errorf("Agent was shutdown before reload could be completed") diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index b0fc8eb8ba..397c70d529 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -532,7 +532,7 @@ func TestAgent_Service(t *testing.T) { } start := time.Now() obj, err := a.srv.AgentService(resp, req) - elapsed := time.Now().Sub(start) + elapsed := time.Since(start) if tt.wantErr != "" { require.Error(err) @@ -5298,7 +5298,7 @@ func TestAgentConnectProxyConfig_Blocking(t *testing.T) { } start := time.Now() obj, err := a.srv.AgentConnectProxyConfig(resp, req) - elapsed := time.Now().Sub(start) + elapsed := time.Since(start) if tt.wantErr { require.Error(err) diff --git a/agent/cache/cache.go b/agent/cache/cache.go index ee032bbb99..7892272451 100644 --- a/agent/cache/cache.go +++ b/agent/cache/cache.go @@ -703,7 +703,7 @@ func (c *Cache) runExpiryLoop() { c.entriesLock.RLock() if len(c.entriesExpiryHeap.Entries) > 0 { entry = c.entriesExpiryHeap.Entries[0] - expiryTimer = time.NewTimer(entry.Expires.Sub(time.Now())) + expiryTimer = time.NewTimer(time.Until(entry.Expires)) expiryCh = expiryTimer.C } c.entriesLock.RUnlock() diff --git a/agent/catalog_endpoint.go b/agent/catalog_endpoint.go index 7381cdf642..6f4b428abf 100644 --- a/agent/catalog_endpoint.go +++ b/agent/catalog_endpoint.go @@ -151,7 +151,7 @@ RETRY_ONCE: // Use empty map instead of nil if out.Services == nil { - out.Services = make(structs.Services, 0) + out.Services = make(structs.Services) } metrics.IncrCounterWithLabels([]string{"client", "api", "success", "catalog_services"}, 1, []metrics.Label{{Name: "node", Value: s.nodeName()}}) diff --git a/agent/connect/ca/provider_consul_test.go b/agent/connect/ca/provider_consul_test.go index 479d460e6a..6ec9b56752 100644 --- a/agent/connect/ca/provider_consul_test.go +++ b/agent/connect/ca/provider_consul_test.go @@ -178,7 +178,7 @@ func TestConsulCAProvider_SignLeaf(t *testing.T) { require.Equal(parsed.SerialNumber.Uint64(), uint64(2)) // Ensure the cert is 
valid now and expires within the correct limit. - require.True(parsed.NotAfter.Sub(time.Now()) < 3*24*time.Hour) + require.True(time.Until(parsed.NotAfter) < 3*24*time.Hour) require.True(parsed.NotBefore.Before(time.Now())) } diff --git a/agent/connect/ca/provider_vault_test.go b/agent/connect/ca/provider_vault_test.go index 10be1befb4..b0ddaa4112 100644 --- a/agent/connect/ca/provider_vault_test.go +++ b/agent/connect/ca/provider_vault_test.go @@ -186,7 +186,7 @@ func TestVaultCAProvider_SignLeaf(t *testing.T) { require.NotEqual(firstSerial, parsed.SerialNumber.Uint64()) // Ensure the cert is valid now and expires within the correct limit. - require.True(parsed.NotAfter.Sub(time.Now()) < time.Hour) + require.True(time.Until(parsed.NotAfter) < time.Hour) require.True(parsed.NotBefore.Before(time.Now())) } } diff --git a/agent/consul/acl_replication_types.go b/agent/consul/acl_replication_types.go index 3009bec983..e0222e1443 100644 --- a/agent/consul/acl_replication_types.go +++ b/agent/consul/acl_replication_types.go @@ -316,7 +316,7 @@ func (r *aclRoleReplicator) FetchUpdated(srv *Server, updates []string) (int, er delete(keep, role.ID) } missing := make([]string, 0, len(keep)) - for id, _ := range keep { + for id := range keep { missing = append(missing, id) } return 0, fmt.Errorf("role replication trying to replicated uncached roles with IDs: %v", missing) diff --git a/agent/consul/kvs_endpoint_test.go b/agent/consul/kvs_endpoint_test.go index a91985c725..e3ba380df5 100644 --- a/agent/consul/kvs_endpoint_test.go +++ b/agent/consul/kvs_endpoint_test.go @@ -596,11 +596,7 @@ key "zip" { t.Fatalf("err: %v", err) } - actualKeys = []string{} - - for _, key := range keyList.Keys { - actualKeys = append(actualKeys, key) - } + actualKeys = keyList.Keys verify.Values(t, "", actualKeys, expectedKeys) diff --git a/agent/consul/leader.go b/agent/consul/leader.go index 21d96c3607..28acd7c312 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -1192,7 +1192,7 @@ func (s *Server) pruneCARoots() error { var newRoots structs.CARoots for _, r := range roots { - if !r.Active && !r.RotatedOutAt.IsZero() && time.Now().Sub(r.RotatedOutAt) > common.LeafCertTTL*2 { + if !r.Active && !r.RotatedOutAt.IsZero() && time.Since(r.RotatedOutAt) > common.LeafCertTTL*2 { s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID) continue } diff --git a/agent/consul/server_lookup.go b/agent/consul/server_lookup.go index e163856d71..f40b573770 100644 --- a/agent/consul/server_lookup.go +++ b/agent/consul/server_lookup.go @@ -51,8 +51,7 @@ func (sl *ServerLookup) ServerAddr(id raft.ServerID) (raft.ServerAddress, error) func (sl *ServerLookup) Server(addr raft.ServerAddress) *metadata.Server { sl.lock.RLock() defer sl.lock.RUnlock() - svr, _ := sl.addressToServer[addr] - return svr + return sl.addressToServer[addr] } func (sl *ServerLookup) Servers() []*metadata.Server { diff --git a/agent/consul/server_test.go b/agent/consul/server_test.go index 32eadb232c..47278b78c3 100644 --- a/agent/consul/server_test.go +++ b/agent/consul/server_test.go @@ -820,7 +820,6 @@ func TestServer_BadExpect(t *testing.T) { type fakeGlobalResp struct{} func (r *fakeGlobalResp) Add(interface{}) { - return } func (r *fakeGlobalResp) New() interface{} { diff --git a/agent/consul/state/acl_test.go b/agent/consul/state/acl_test.go index 58c69f7433..563f6bde08 100644 --- a/agent/consul/state/acl_test.go +++ b/agent/consul/state/acl_test.go @@ -3824,11 +3824,11 @@ func stripIrrelevantTokenFields(token 
*structs.ACLToken) *structs.ACLToken { // When comparing the tokens disregard the policy link names. This // data is not cleanly updated in a variety of scenarios and should not // be relied upon. - for i, _ := range tokenCopy.Policies { + for i := range tokenCopy.Policies { tokenCopy.Policies[i].Name = "" } // Also do the same for Role links. - for i, _ := range tokenCopy.Roles { + for i := range tokenCopy.Roles { tokenCopy.Roles[i].Name = "" } // The raft indexes won't match either because the requester will not diff --git a/agent/http.go b/agent/http.go index 9025cc088e..bd876f46b4 100644 --- a/agent/http.go +++ b/agent/http.go @@ -293,7 +293,7 @@ func (s *HTTPServer) handler(enableDebug bool) http.Handler { mux.HandleFunc("/", s.Index) for pattern, fn := range endpoints { thisFn := fn - methods, _ := allowedMethods[pattern] + methods := allowedMethods[pattern] bound := func(resp http.ResponseWriter, req *http.Request) (interface{}, error) { return thisFn(s, resp, req) } diff --git a/agent/router/manager.go b/agent/router/manager.go index 081893c5e9..ae764087c4 100644 --- a/agent/router/manager.go +++ b/agent/router/manager.go @@ -342,8 +342,6 @@ func (m *Manager) RebalanceServers() { // continue to use the existing connection until the next // rebalance occurs. } - - return } // reconcileServerList returns true when the first server in serverList diff --git a/agent/ui_endpoint_test.go b/agent/ui_endpoint_test.go index 940e3ec36b..51d19a0bcf 100644 --- a/agent/ui_endpoint_test.go +++ b/agent/ui_endpoint_test.go @@ -60,7 +60,7 @@ func TestUiIndex(t *testing.T) { // Verify the body out := bytes.NewBuffer(nil) io.Copy(out, resp.Body) - if string(out.Bytes()) != "test" { + if out.String() != "test" { t.Fatalf("bad: %s", out.Bytes()) } } diff --git a/command/debug/debug.go b/command/debug/debug.go index 36a8ae3662..a9697b624a 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -302,7 +302,7 @@ func (c *cmd) captureStatic() error { var errors error // Collect the named outputs here - outputs := make(map[string]interface{}, 0) + outputs := make(map[string]interface{}) // Capture host information if c.configuredTarget("host") { diff --git a/lib/telemetry.go b/lib/telemetry.go index a335d6cc85..d815f4b587 100644 --- a/lib/telemetry.go +++ b/lib/telemetry.go @@ -225,7 +225,7 @@ func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) { continue } case reflect.Bool: - if f.Bool() != false { + if f.Bool() { continue } default: diff --git a/sdk/testutil/retry/retry.go b/sdk/testutil/retry/retry.go index 2ef3c4c0eb..53c05a2b05 100644 --- a/sdk/testutil/retry/retry.go +++ b/sdk/testutil/retry/retry.go @@ -110,7 +110,7 @@ func dedup(a []string) string { delete(m, s) } } - return string(b.Bytes()) + return b.String() } func run(r Retryer, t Failer, f func(r *R)) { From 6fba0106a36d22eb876e02f54960e128dc83e37b Mon Sep 17 00:00:00 2001 From: Alvin Huang Date: Mon, 22 Jul 2019 14:01:22 -0400 Subject: [PATCH 2/9] Run go test packages in parallel (#6165) * modify gotestsum hacking that is unnecessary with the latest version * try running 2 packages at a time * try running 3 packages at a time * remove old comments --- .circleci/config.yml | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3c026ea28f..65e82305af 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -77,16 +77,9 @@ jobs: - run: mkdir -p $TEST_RESULTS_DIR - run: sudo apt-get update && sudo apt-get install -y rsyslog - 
run: sudo service rsyslog start - # Use CircleCI test splitting by classname. Since there are no classes in go, - # we fake it by taking everything after github.com/hashicorp/consul/ and setting - # it as the classname. - - # This loop writes go test results to .xml per go package - run: | - for pkg in $(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess |circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do - reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g") - gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg - done + PACKAGE_NAMES=$(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess | circleci tests split --split-by=timings --timings-type=classname) + gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS -p 3 $PACKAGE_NAMES - store_test_results: path: /tmp/test-results @@ -108,18 +101,11 @@ jobs: - attach_workspace: at: /go/bin - run: mkdir -p $TEST_RESULTS_DIR - # Use CircleCI test splitting by classname. Since there are no classes in go, - # we fake it by taking everything after github.com/hashicorp/consul/ and setting - # it as the classname. - - # This loop writes go test results to .xml per go package - run: working_directory: api command: | - for pkg in $(go list ./... | circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do - reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g") - gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg - done + PACKAGE_NAMES=$(go list ./... | circleci tests split --split-by=timings --timings-type=classname) + gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS $PACKAGE_NAMES - store_test_results: path: /tmp/test-results From 7c1f98e9a4286886e1a6a62711337bb18b4f064b Mon Sep 17 00:00:00 2001 From: Alvin Huang Date: Mon, 22 Jul 2019 18:08:54 -0400 Subject: [PATCH 3/9] enable circleci go tests for forks and reorganize jobs (#6191) --- .circleci/config.yml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 65e82305af..5346e4612d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -437,28 +437,26 @@ jobs: workflows: version: 2 - build-distros: + go-tests: jobs: - lint-consul-retry - - go-fmt-and-vet: + - go-fmt-and-vet + - dev-build: requires: - lint-consul-retry - - build-386: &require-go-fmt-vet - requires: - go-fmt-and-vet - - build-amd64: *require-go-fmt-vet - - build-arm-arm64: *require-go-fmt-vet - test-integrations: - jobs: - - dev-build - go-test: &go-test requires: - dev-build - filters: - branches: - ignore: - - /^pull\/.*$/ # only run go tests on non forks - go-test-api: *go-test + build-distros: + jobs: + - build-386 + - build-amd64 + - build-arm-arm64 + test-integrations: + jobs: + - dev-build - dev-upload-s3: requires: - dev-build From 3a4e38a13ee9245bad315f9f239cc49995c6536e Mon Sep 17 00:00:00 2001 From: kaitlincarter-hc <43049322+kaitlincarter-hc@users.noreply.github.com> Date: Mon, 22 Jul 2019 19:16:06 -0500 Subject: [PATCH 4/9] [docs] New K8s-Consul deployment guide (#5859) * New K8s-Consul deployment guide * Update website/source/docs/guides/kubernetes-production-deploy.md * Update website/source/docs/guides/kubernetes-production-deploy.md Co-Authored-By: Rebecca Zanzig * Update 
website/source/docs/guides/kubernetes-production-deploy.md Co-Authored-By: Rebecca Zanzig * Update website/source/docs/guides/kubernetes-production-deploy.md Co-Authored-By: Rebecca Zanzig * Update website/source/docs/guides/kubernetes-production-deploy.md Co-Authored-By: Rebecca Zanzig * Update website/source/docs/guides/kubernetes-production-deploy.md Co-Authored-By: Rebecca Zanzig * updating based on comments * Update website/source/docs/guides/kubernetes-production-deploy.md Co-Authored-By: Rebecca Zanzig * Update website/source/docs/guides/kubernetes-production-deploy.md * Update website/source/docs/guides/kubernetes-production-deploy.md --- .../guides/kubernetes-production-deploy.md | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 website/source/docs/guides/kubernetes-production-deploy.md diff --git a/website/source/docs/guides/kubernetes-production-deploy.md b/website/source/docs/guides/kubernetes-production-deploy.md new file mode 100644 index 0000000000..f90454ca37 --- /dev/null +++ b/website/source/docs/guides/kubernetes-production-deploy.md @@ -0,0 +1,286 @@ +--- +name: "Consul-Kubernetes Deployment Guide" +content_length: 14 +id: kubernetes-production-deploy +layout: content_layout +products_used: + - Consul +description: This guide covers the necessary steps to install and configure a new Consul cluster on Kubernetes. +level: Advanced +___ + + +This guide covers the necessary steps to install and configure a new Consul +cluster on Kubernetes, as defined in the [Consul Reference Architecture +guide](/consul/day-1-operations/kubernetes-reference#consul-datacenter-deployed-in-kubernetes). +By the end of this guide, you will be able to identify the installation +prerequisites, customize the Helm chart to fit your environment requirements, +and interact with your new Consul cluster. + +~> You should have the following configured before starting this guide: Helm +installed and configured locally, tiller running in the Kubernetes cluster, and +the Kubernetes CLI configured. + +## Configure Kubernetes Permissions to Deploy Consul + +Before deploying Consul, you will need to create a new Kubernetes service +account with the correct permissions and to authenticate it on the command +line. You will need Kubernetes operators permissions to create and modify +policies, deploy services, access the Kubernetes dashboard, create secrets, and +create RBAC objects. You can find documentation for RBAC and service accounts +for the following cloud providers. + +- [AKS](https://docs.microsoft.com/en-us/azure/aks/kubernetes-service-principal) +- [EKS](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) +- [GCP](https://console.cloud.google.com/iam-admin/serviceaccounts) + +Note, Consul can be deployed on any properly configured Kubernetes cluster in +the cloud or on premises. + +Once you have a service account, you will also need to add a permission to +deploy the helm chart. This is done with the `clusterrolebinding` method. + +```sh +$ kubectl create clusterrolebinding kubernetes-dashboard -n kube-system --clusterrole=cluster-admin --serviceaccount=kube-system:kubernetes-dashboard +``` + +Finally, you may need to create Kubernetes secrets to store Consul data. You +can reference these secrets in the customized Helm chart values file. + +- If you have purchased Enterprise Consul, the enterprise license file should be +used with the official image, `hashicorp/consul-enterprise:1.5.0-ent`. 
+ +- Enable +[encryption](https://www.consul.io/docs/agent/encryption.html#gossip-encryption) to secure gossip traffic within the Consul cluster. + + +~> Note, depending on your environment, the previous secrets may not be +necessary. + +## Configure Helm Chart + +Now that you have prepared your Kubernetes cluster, you can customize the Helm +chart. First, you will need to download the latest official Helm chart. + +```sh +$ git clone https://github.com/hashicorp/consul-helm.git +``` + +The `consul-helm` directory will contain a `values.yaml` file with example +parameters. You can update this file to customize your Consul deployment. Below +we detail some of the parameters you should customize and provide an example +file, however you should consider your particular production needs when +configuring your chart. + +### Global Values + +The global values will affect all the other parameters in the chart. + +To enable all of the Consul components in the Helm chart, set `enabled` to +`true`. This means servers, clients, Consul DNS, and the Consul UI will be +installed with their defaults. You should also set the following global +parameters based on your specific environment requirements. + +- `image` is the name and tag of the Consul Docker image. +- `imagek8s` is the name and tag of the Docker image for the consul-k8s binary. +- `datacenter` the name of your Consul datacenter. +- `domain` the domain Consul uses for DNS queries. + +For security, set the `bootstrapACLs` parameter to true. This will enable +Kubernetes to initially setup Consul's [ACL +system](https://www.consul.io/docs/acl/acl-system.html). + +Read the Consul Helm chart documentation to review all the [global +parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-global). + +### Consul UI + +To enable the Consul web UI update the `ui` section to your values file and set +`enabled` to `true`. + +Note, you can also set up a [loadbalancer +resource](https://github.com/hashicorp/demo-consul-101/tree/master/k8s#implement-load-balancer) +or other service type in Kubernetes to make it easier to access the UI. + +### Consul Servers + +For production deployments, you will need to deploy [3 or 5 Consul +servers](https://www.consul.io/docs/internals/consensus.html#deployment-table) +for quorum and failure tolerance. For most deployments, 3 servers are adequate. + +In the server section set both `replicas` and `bootstrapExpect` to 3. This will +deploy three servers and cause Consul to wait to perform leader election until +all three are healthy. The `resources` will depend on your environment; in the +example at the end of the guide, the resources are set for a large environment. + +#### Affinity + +To ensure the Consul servers are placed on different Kubernetes nodes, you will +need to configure affinity. Otherwise, the failure of one Kubernetes node could +cause the loss of multiple Consul servers, and result in quorum loss. By +default, the example `values.yaml` has affinity configured correctly. + +#### Enterprise License + +If you have an [Enterprise +license](https://www.hashicorp.com/products/consul/enterprise) you should +reference the Kubernetes secret in the `enterpriseLicense` parameter. + +Read the Consul Helm chart documentation to review all the [server +parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-server) + +### Consul Clients + +A Consul client is deployed on every Kubernetes node, so you do not need to +specify the number of clients for your deployments. 
You will need to specify
+resources and enable gRPC. The resources in the example at the end of this guide
+should be
+sufficient for most production scenarios since Consul clients are designed for
+horizontal scalability. Enabling `grpc` enables the gRPC listener on port 8502
+and exposes it to the host. It is required to use Consul Connect.
+
+Read the Consul Helm chart documentation to review all the [client
+parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-client).
+
+### Consul Connect Injection Security
+
+Even though you enabled Consul server communication over Connect in the server section, you will also
+need to enable `connectInject` by setting `enabled` to `true`. In the
+`connectInject` section you will also configure security features. Enabling the
+`default` parameter will allow the injector to automatically inject the Connect
+sidecar into all pods. If you would prefer to manually annotate which pods to inject, you
+can set this to false. Setting the `aclBindingRuleSelector` parameter to
+`serviceaccount.name!=default` ensures that new services do not all receive the
+same token if you are only using a default service account. This setting is
+only necessary if you have enabled ACLs in the global section.
+
+Read more about the [Connect Inject
+parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-connectinject).
+
+## Complete Example
+
+Your finished values file should resemble the following example. For more
+complete descriptions of all the available parameters see the `values.yaml`
+file provided with the Helm chart and the [reference
+documentation](https://www.consul.io/docs/platform/k8s/helm.html).
+
+```yaml
+# Configure global settings in this section.
+global:
+  # Enable all the components within this chart by default.
+  enabled: true
+  # Specify the Consul and consul-k8s images to use
+  image: "consul:1.5.0"
+  imagek8s: "hashicorp/consul-k8s:0.8.1"
+  domain: consul
+  datacenter: primarydc
+  # Bootstrap ACLs within Consul. This is highly recommended.
+  bootstrapACLs: true
+  # Gossip encryption
+  gossipEncryption: |
+    secretName: "encrypt-key"
+    secretKey: "key"
+# Configure your Consul servers in this section.
+server:
+  enabled: true
+  connect: true
+  # Specify three servers that wait till all are healthy to bootstrap the Consul cluster.
+  replicas: 3
+  bootstrapExpect: 3
+  # Specify the resources that servers request for placement. These values will serve a large environment.
+  resources: |
+    requests:
+      memory: "32Gi"
+      cpu: "4"
+      disk: "50Gi"
+    limits:
+      memory: "32Gi"
+      cpu: "4"
+      disk: "50Gi"
+  # If using Enterprise, reference the Kubernetes secret that holds your license here
+  enterpriseLicense:
+    secretName: "consul-license"
+    secretKey: "key"
+  # Prevent Consul servers from co-location on Kubernetes nodes.
+  affinity: |
+    podAntiAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        - labelSelector:
+            matchLabels:
+              app: {{ template "consul.name" . }}
+              release: "{{ .Release.Name }}"
+              component: server
+          topologyKey: kubernetes.io/hostname
+# Configure Consul clients in this section.
+client:
+  enabled: true
+  # Specify the resources that clients request for deployment.
+  resources: |
+    requests:
+      memory: "8Gi"
+      cpu: "2"
+      disk: "15Gi"
+    limits:
+      memory: "8Gi"
+      cpu: "2"
+      disk: "15Gi"
+  grpc: true
+# Enable and configure the Consul UI.
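+# Optional: the guide above suggests exposing the UI through a LoadBalancer or
+# other Kubernetes Service type. Depending on your consul-helm version this may
+# be configurable here (for example via a `ui.service.type` value, which is an
+# assumption); verify the exact key against the chart's values.yaml before
+# relying on it.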
+ui:
+  enabled: true
+# Configure security for Consul Connect pod injection
+connectInject:
+  enabled: true
+  default: true
+  namespaceSelector: "my-namespace"
+  aclBindingRuleSelector: "serviceaccount.name!=default"
+```
+## Deploy Consul
+
+Now that you have customized the `values.yaml` file, you can deploy Consul with
+Helm. This should only take a few minutes. The Consul pods should appear in the
+Kubernetes dashboard immediately and you can monitor the deployment process
+there.
+
+```sh
+$ helm install ./consul-helm -f values.yaml
+```
+
+To check the deployment process on the command line you can use `kubectl`.
+
+```sh
+$ kubectl get pods
+```
+
+## Summary
+
+In this guide, you configured Consul, using the Helm chart, for a production
+environment. This involved ensuring that your cluster had a properly
+distributed server cluster, specifying enough resources for your agents,
+securing the cluster with ACLs and gossip encryption, and enabling other Consul
+functionality including Connect and the Consul UI.
+
+Now you can interact with your Consul cluster through the UI or CLI.
+
+If you exposed the UI using a load balancer it will be available at the
+`LoadBalancer Ingress` IP address and `Port` that is output from the following
+command. Note, you will need to replace _consul server_ with the server name
+from your cluster.
+
+```sh
+$ kubectl describe services consul-server
+```
+
+To access the Consul CLI, open a terminal session using the Kubernetes CLI.
+
+```sh
+$ kubectl exec -it /bin/ash
+```
+
+To learn more about how to interact with your Consul cluster or use it for
+service discovery, configuration or segmentation, try one of Learn’s
+[Operations or Development tracks](/consul/#advanced). Follow the [Security and
+Networking track](/consul/?track=security-networking#security-networking) to
+learn more about securing your Consul cluster.
+
+

From f38da47c550757b1e688621cda33f2ecd3fff00b Mon Sep 17 00:00:00 2001
From: Paul Banks
Date: Tue, 23 Jul 2019 15:19:57 +0100
Subject: [PATCH 5/9] Allow raft TrailingLogs to be configured. (#6186)

This fixes pathological cases where the write throughput and snapshot size
are both so large that more than 10k log entries are written in the time it
takes to restore the snapshot from disk. In this case followers that restart
can never catch up with leader replication again and enter a loop of
constantly downloading a full snapshot and restoring it only to find that
snapshot is already out of date and the leader has truncated its logs so a
new snapshot is sent etc.

In general if you need to adjust this, you are probably abusing Consul for
purposes outside its design envelope and should reconsider your usage to
reduce data size and/or write volume.
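For illustration, a minimal sketch of how the option introduced by this patch would be set in agent configuration. The key name comes from the diff below; the value shown is arbitrary, not a recommendation, and the documented default of 10000 suits normal workloads.

```hcl
# Illustrative sketch only: retain more Raft log entries after each snapshot so
# a follower restoring a large snapshot can still catch up from the leader's
# log instead of looping on full snapshot installs. Example value, not a tuning
# recommendation.
raft_trailing_logs = 50000
```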
--- agent/agent.go | 3 ++ agent/agent_test.go | 12 ++++- agent/config/builder.go | 1 + agent/config/config.go | 1 + agent/config/runtime.go | 16 +++++++ agent/config/runtime_test.go | 4 ++ website/source/docs/agent/options.html.md | 58 +++++++++++++++-------- 7 files changed, 75 insertions(+), 20 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 2a12ba6fcd..7a6246d23a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1162,6 +1162,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) { if a.config.RaftSnapshotInterval != 0 { base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval } + if a.config.RaftTrailingLogs != 0 { + base.RaftConfig.TrailingLogs = uint64(a.config.RaftTrailingLogs) + } if a.config.ACLMasterToken != "" { base.ACLMasterToken = a.config.ACLMasterToken } diff --git a/agent/agent_test.go b/agent/agent_test.go index 9265c52ae1..d3e84f6e09 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -3925,7 +3925,7 @@ func TestAgent_ReloadConfigTLSConfigFailure(t *testing.T) { require.Len(t, tlsConf.RootCAs.Subjects(), 1) } -func TestAgent_consulConfig(t *testing.T) { +func TestAgent_consulConfig_AutoEncryptAllowTLS(t *testing.T) { t.Parallel() dataDir := testutil.TempDir(t, "agent") // we manage the data dir defer os.RemoveAll(dataDir) @@ -3941,3 +3941,13 @@ func TestAgent_consulConfig(t *testing.T) { defer a.Shutdown() require.True(t, a.consulConfig().AutoEncryptAllowTLS) } + +func TestAgent_consulConfig_RaftTrailingLogs(t *testing.T) { + t.Parallel() + hcl := ` + raft_trailing_logs = 812345 + ` + a := NewTestAgent(t, t.Name(), hcl) + defer a.Shutdown() + require.Equal(t, uint64(812345), a.consulConfig().RaftConfig.TrailingLogs) +} diff --git a/agent/config/builder.go b/agent/config/builder.go index 83b385f460..a286d80798 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -862,6 +862,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { RaftProtocol: b.intVal(c.RaftProtocol), RaftSnapshotThreshold: b.intVal(c.RaftSnapshotThreshold), RaftSnapshotInterval: b.durationVal("raft_snapshot_interval", c.RaftSnapshotInterval), + RaftTrailingLogs: b.intVal(c.RaftTrailingLogs), ReconnectTimeoutLAN: b.durationVal("reconnect_timeout", c.ReconnectTimeoutLAN), ReconnectTimeoutWAN: b.durationVal("reconnect_timeout_wan", c.ReconnectTimeoutWAN), RejoinAfterLeave: b.boolVal(c.RejoinAfterLeave), diff --git a/agent/config/config.go b/agent/config/config.go index 42a9423575..7d90ec983c 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -239,6 +239,7 @@ type Config struct { RaftProtocol *int `json:"raft_protocol,omitempty" hcl:"raft_protocol" mapstructure:"raft_protocol"` RaftSnapshotThreshold *int `json:"raft_snapshot_threshold,omitempty" hcl:"raft_snapshot_threshold" mapstructure:"raft_snapshot_threshold"` RaftSnapshotInterval *string `json:"raft_snapshot_interval,omitempty" hcl:"raft_snapshot_interval" mapstructure:"raft_snapshot_interval"` + RaftTrailingLogs *int `json:"raft_trailing_logs,omitempty" hcl:"raft_trailing_logs" mapstructure:"raft_trailing_logs"` ReconnectTimeoutLAN *string `json:"reconnect_timeout,omitempty" hcl:"reconnect_timeout" mapstructure:"reconnect_timeout"` ReconnectTimeoutWAN *string `json:"reconnect_timeout_wan,omitempty" hcl:"reconnect_timeout_wan" mapstructure:"reconnect_timeout_wan"` RejoinAfterLeave *bool `json:"rejoin_after_leave,omitempty" hcl:"rejoin_after_leave" mapstructure:"rejoin_after_leave"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index a6021b4a1a..4684e3d78f 
100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -965,6 +965,22 @@ type RuntimeConfig struct { // hcl: raft_snapshot_threshold = int RaftSnapshotInterval time.Duration + // RaftTrailingLogs sets the number of log entries that will be left in the + // log store after a snapshot. This must be large enough that a follower can + // transfer and restore an entire snapshot of the state before this many new + // entries have been appended. In vast majority of cases the default is plenty + // but if there is a sustained high write throughput coupled with a huge + // multi-gigabyte snapshot setting this higher may be necessary to allow + // followers time to reload from snapshot without becoming unhealthy. If it's + // too low then followers are unable to ever recover from a restart and will + // enter a loop of constantly downloading full snapshots and never catching + // up. If you need to change this you should reconsider your usage of Consul + // as it is not designed to store multiple-gigabyte data sets with high write + // throughput. Defaults to 10000. + // + // hcl: raft_trailing_logs = int + RaftTrailingLogs int + // ReconnectTimeoutLAN specifies the amount of time to wait to reconnect with // another agent before deciding it's permanently gone. This can be used to // control the time it takes to reap failed nodes from the cluster. diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index afb0443cf5..608aa7f428 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -3298,6 +3298,7 @@ func TestFullConfig(t *testing.T) { "raft_protocol": 19016, "raft_snapshot_threshold": 16384, "raft_snapshot_interval": "30s", + "raft_trailing_logs": 83749, "reconnect_timeout": "23739s", "reconnect_timeout_wan": "26694s", "recursors": [ "63.38.39.58", "92.49.18.18" ], @@ -3881,6 +3882,7 @@ func TestFullConfig(t *testing.T) { raft_protocol = 19016 raft_snapshot_threshold = 16384 raft_snapshot_interval = "30s" + raft_trailing_logs = 83749 reconnect_timeout = "23739s" reconnect_timeout_wan = "26694s" recursors = [ "63.38.39.58", "92.49.18.18" ] @@ -4532,6 +4534,7 @@ func TestFullConfig(t *testing.T) { RaftProtocol: 19016, RaftSnapshotThreshold: 16384, RaftSnapshotInterval: 30 * time.Second, + RaftTrailingLogs: 83749, ReconnectTimeoutLAN: 23739 * time.Second, ReconnectTimeoutWAN: 26694 * time.Second, RejoinAfterLeave: true, @@ -5353,6 +5356,7 @@ func TestSanitize(t *testing.T) { "RaftProtocol": 0, "RaftSnapshotInterval": "0s", "RaftSnapshotThreshold": 0, + "RaftTrailingLogs": 0, "ReconnectTimeoutLAN": "0s", "ReconnectTimeoutWAN": "0s", "RejoinAfterLeave": false, diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md index 687a1b1b4d..1a7470cb74 100644 --- a/website/source/docs/agent/options.html.md +++ b/website/source/docs/agent/options.html.md @@ -407,21 +407,6 @@ will exit with an error at startup. [Raft Protocol Version Compatibility](/docs/upgrade-specific.html#raft-protocol-version-compatibility) for more details. -* `-raft-snapshot-threshold` - This controls the - minimum number of raft commit entries between snapshots that are saved to disk. This is a low-level parameter that should - rarely need to be changed. Very busy clusters experiencing excessive disk IO may increase this value to reduce disk IO, and minimize - the chances of all servers taking snapshots at the same time. 
Increasing this trades off disk IO for disk space since the log will - grow much larger and the space in the raft.db file can't be reclaimed till the next snapshot. Servers may take longer to recover from - crashes or failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this - defaults to 16384, and in prior versions it was set to 8192. - -* `-raft-snapshot-interval` - This controls how often servers - check if they need to save a snapshot to disk. his is a low-level parameter that should rarely need to be changed. Very busy clusters - experiencing excessive disk IO may increase this value to reduce disk IO, and minimize the chances of all servers taking snapshots at the same time. - Increasing this trades off disk IO for disk space since the log will grow much larger and the space in the raft.db file can't be reclaimed - till the next snapshot. Servers may take longer to recover from crashes or failover if this is increased significantly as more logs - will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`. - * `-recursor` - Specifies the address of an upstream DNS server. This option may be provided multiple times, and is functionally equivalent to the [`recursors` configuration option](#recursors). @@ -1431,11 +1416,46 @@ default will automatically work with some tooling. * `raft_protocol` Equivalent to the [`-raft-protocol` command-line flag](#_raft_protocol). -* `raft_snapshot_threshold` Equivalent to the - [`-raft-snapshot-threshold` command-line flag](#_raft_snapshot_threshold). + +* + `raft_snapshot_threshold` This controls + the minimum number of raft commit entries between snapshots that are saved to + disk. This is a low-level parameter that should rarely need to be changed. + Very busy clusters experiencing excessive disk IO may increase this value to + reduce disk IO, and minimize the chances of all servers taking snapshots at + the same time. Increasing this trades off disk IO for disk space since the log + will grow much larger and the space in the raft.db file can't be reclaimed + till the next snapshot. Servers may take longer to recover from crashes or + failover if this is increased significantly as more logs will need to be + replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior + versions it was set to 8192. -* `raft_snapshot_interval` Equivalent to the - [`-raft-snapshot-interval` command-line flag](#_raft_snapshot_interval). +* `raft_snapshot_interval` This controls how + often servers check if they need to save a snapshot to disk. his is a + low-level parameter that should rarely need to be changed. Very busy clusters + experiencing excessive disk IO may increase this value to reduce disk IO, and + minimize the chances of all servers taking snapshots at the same time. + Increasing this trades off disk IO for disk space since the log will grow much + larger and the space in th e raft.db file can't be reclaimed till the next + snapshot. Servers may take longer to recover from crashes or failover if this + is increased significantly as more logs will need to be replayed. In Consul + 1.1.0 and later this defaults to `30s`, and in prior versions it was set to + `5s`. + +* `raft_trailing_logs` - This controls how many + log entries are left in the log store on disk after a snapshot is made. 
This + should only be adjusted when followers cannot catch up to the leader due to a + very large snapshot size that and high write throughput causing log truncation + before an snapshot can be fully installed. If you need to use this to recover + a cluster, consider reducing write throughput or the amount of data stored on + Consul as it is likely under a load it is not designed to handle. The default + value is 10000 which is suitable for all normal workloads. Added in Consul + 1.5.3. * `reap` This controls Consul's automatic reaping of child processes, which is useful if Consul is running as PID 1 in a Docker container. If this isn't specified, then Consul will From af7a392a17de5d41f0162ac9c2bdbf382dc424a5 Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Tue, 23 Jul 2019 15:45:39 +0100 Subject: [PATCH 6/9] Update CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67e0d92e5f..72b4856f9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ ## UNRELEASED +IMPROVEMENTS: +* raft: Allow trailing logs to be configured as an escape hatch for extreme load that prevents followers catching up with leader [[GH-6186](https://github.com/hashicorp/consul/pull/6186)] + BUG FIXES: * autopilot: update to also remove failed nodes from WAN gossip pool [[GH-6028](https://github.com/hashicorp/consul/pull/6028)] From 230ca21a0ef295bda497fa32e7b44fe80dda0583 Mon Sep 17 00:00:00 2001 From: Alvin Huang Date: Tue, 23 Jul 2019 15:35:46 -0400 Subject: [PATCH 7/9] remove travis config (#6199) --- .travis.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 60cd57f2dd..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -language: go - -go: - # Please keep this in-sync with the go version we build against in - # build-support/docker/Build-Go.dockerfile. 
- - "1.12.1" - -branches: - only: - - master - - release/1-6 - -matrix: - include: - - env: GOTEST_PKGS="./api" - - env: GOTEST_PKGS="./agent" - - env: GOTEST_PKGS="./agent/consul" - - env: GOTEST_PKGS_EXCLUDE="./api|./agent|./agent/consul" - -script: - - make test-ci - -sudo: false From a13de7dee916f07aa137c7ae3dcea4bc218d577b Mon Sep 17 00:00:00 2001 From: Freddy Date: Tue, 23 Jul 2019 14:32:10 -0600 Subject: [PATCH 8/9] Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72b4856f9c..f3f8b03e91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ ## UNRELEASED IMPROVEMENTS: -* raft: Allow trailing logs to be configured as an escape hatch for extreme load that prevents followers catching up with leader [[GH-6186](https://github.com/hashicorp/consul/pull/6186)] +* raft: allow trailing logs to be configured as an escape hatch for extreme load that prevents followers catching up with leader [[GH-6186](https://github.com/hashicorp/consul/pull/6186)] +* agent: added configurable limit for log files to be rotated [[GH-5831](https://github.com/hashicorp/consul/pull/5831)] BUG FIXES: From d86efb83e53d29d0993912b8e85bd086190932ae Mon Sep 17 00:00:00 2001 From: Freddy Date: Tue, 23 Jul 2019 14:33:00 -0600 Subject: [PATCH 9/9] Restore NotifyListen to avoid panic in newServer retry (#6200) --- agent/consul/server_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agent/consul/server_test.go b/agent/consul/server_test.go index 47278b78c3..1aeb1a5900 100644 --- a/agent/consul/server_test.go +++ b/agent/consul/server_test.go @@ -183,6 +183,10 @@ func newServer(c *Config) (*Server, error) { oldNotify() } } + // Restore old notify to guard against re-closing `up` on a retry + defer func() { + c.NotifyListen = oldNotify + }() // start server w := c.LogOutput