resolve circleci config conflicts

This commit is contained in:
Alvin Huang 2019-07-23 20:18:36 -04:00
commit ef6b80bab2
29 changed files with 423 additions and 105 deletions

View File

@ -77,16 +77,9 @@ jobs:
- run: mkdir -p $TEST_RESULTS_DIR
- run: sudo apt-get update && sudo apt-get install -y rsyslog
- run: sudo service rsyslog start
# Use CircleCI test splitting by classname. Since there are no classes in go,
# we fake it by taking everything after github.com/hashicorp/consul/ and setting
# it as the classname.
# This loop writes go test results to <reportname>.xml per go package
- run: |
for pkg in $(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess |circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do
reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g")
gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg
done
PACKAGE_NAMES=$(go list ./... | grep -v github.com/hashicorp/consul/agent/proxyprocess | circleci tests split --split-by=timings --timings-type=classname)
gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS -p 3 $PACKAGE_NAMES
- store_test_results:
path: /tmp/test-results
@ -108,18 +101,11 @@ jobs:
- attach_workspace:
at: /go/bin
- run: mkdir -p $TEST_RESULTS_DIR
# Use CircleCI test splitting by classname. Since there are no classes in go,
# we fake it by taking everything after github.com/hashicorp/consul/ and setting
# it as the classname.
# This loop writes go test results to <reportname>.xml per go package
- run:
working_directory: api
command: |
for pkg in $(go list ./... | circleci tests split --split-by=timings --timings-type=classname | tr '\n' ' '); do
reportname=$(echo $pkg | cut -d '/' -f3- | sed "s#/#_#g")
gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/$reportname.xml -- -tags=$GOTAGS $pkg
done
PACKAGE_NAMES=$(go list ./... | circleci tests split --split-by=timings --timings-type=classname)
gotestsum --format=short-verbose --junitfile $TEST_RESULTS_DIR/gotestsum-report.xml -- -tags=$GOTAGS $PACKAGE_NAMES
- store_test_results:
path: /tmp/test-results
@ -476,7 +462,25 @@ jobs:
git_merge_branch="ci/master-merge-$(date +%Y%m%d%H%M%S)"
git checkout -b "${git_merge_branch}"
latest_oss_commit="$(git rev-parse origin/master)"
git merge -m "Merge Consul OSS branch 'master' at commit ${latest_oss_commit}" "${latest_oss_commit}"
if ! errors=$(git merge -m "Merge Consul OSS branch 'master' at commit ${latest_oss_commit}" "${latest_oss_commit}"); then
printf "oss/master merge into ${CIRCLE_BRANCH} failed because git was unable to auto-merge!\n${errors}"
curl -X POST -H 'Content-type: application/json' \
--data \
"{ \
\"attachments\": [ \
{ \
\"fallback\": \"master merge into ${CIRCLE_BRANCH} failed because git was unable to auto-merge!\", \
\"text\": \"Nightly *master* merge into *${CIRCLE_BRANCH}* failed!\n\nBuild Log: ${CIRCLE_BUILD_URL}\n\nGit was unable to auto-merge due to possible merge conflict.\n\n*Errors:*\n${errors}\", \
\"footer\": \"${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}\", \
\"ts\": \"$(date +%s)\", \
\"color\": \"danger\" \
} \
] \
}" ${CONSUL_SLACK_WEBHOOK_URL}
exit 1
fi
git push origin "${git_merge_branch}"
sleep 15 # Wait for merge branch to start CircleCI pipeline
@ -568,28 +572,26 @@ workflows:
branches:
only:
- release/1-6
build-distros:
go-tests:
jobs:
- lint-consul-retry
- go-fmt-and-vet:
- go-fmt-and-vet
- dev-build:
requires:
- lint-consul-retry
- build-386: &require-go-fmt-vet
requires:
- go-fmt-and-vet
- build-amd64: *require-go-fmt-vet
- build-arm-arm64: *require-go-fmt-vet
test-integrations:
jobs:
- dev-build
- go-test: &go-test
requires:
- dev-build
filters:
branches:
ignore:
- /^pull\/.*$/ # only run go tests on non forks
- go-test-api: *go-test
build-distros:
jobs:
- build-386
- build-amd64
- build-arm-arm64
test-integrations:
jobs:
- dev-build
- dev-upload-s3:
requires:
- dev-build

View File

@ -1,23 +0,0 @@
language: go
go:
# Please keep this in-sync with the go version we build against in
# build-support/docker/Build-Go.dockerfile.
- "1.12.1"
branches:
only:
- master
- release/1-6
matrix:
include:
- env: GOTEST_PKGS="./api"
- env: GOTEST_PKGS="./agent"
- env: GOTEST_PKGS="./agent/consul"
- env: GOTEST_PKGS_EXCLUDE="./api|./agent|./agent/consul"
script:
- make test-ci
sudo: false

View File

@ -6,6 +6,8 @@ FEATURES:
IMPROVEMENTS:
* raft: allow trailing logs to be configured as an escape hatch for extreme load that prevents followers catching up with leader [[GH-6186](https://github.com/hashicorp/consul/pull/6186)]
* agent: added configurable limit for log files to be rotated [[GH-5831](https://github.com/hashicorp/consul/pull/5831)]
* agent: health checks: change long timeout behavior to use the user-configured `timeout` value [[GH-6094](https://github.com/hashicorp/consul/pull/6094)]
* api: Update filtering language to include substring and regular expression matching on string values [[GH-6190](https://github.com/hashicorp/consul/pull/6190)]
* api: Display allowed HTTP CIDR information nicely [[GH-6029](https://github.com/hashicorp/consul/pull/6029)]

View File

@ -1162,6 +1162,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
if a.config.RaftSnapshotInterval != 0 {
base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval
}
if a.config.RaftTrailingLogs != 0 {
base.RaftConfig.TrailingLogs = uint64(a.config.RaftTrailingLogs)
}
if a.config.ACLMasterToken != "" {
base.ACLMasterToken = a.config.ACLMasterToken
}

View File

@ -141,7 +141,7 @@ func (s *HTTPServer) AgentReload(resp http.ResponseWriter, req *http.Request) (i
}
// Trigger the reload
errCh := make(chan error, 0)
errCh := make(chan error)
select {
case <-s.agent.shutdownCh:
return nil, fmt.Errorf("Agent was shutdown before reload could be completed")

View File

@ -564,7 +564,7 @@ func TestAgent_Service(t *testing.T) {
}
start := time.Now()
obj, err := a.srv.AgentService(resp, req)
elapsed := time.Now().Sub(start)
elapsed := time.Since(start)
if tt.wantErr != "" {
require.Error(err)
@ -5350,7 +5350,7 @@ func TestAgentConnectProxyConfig_Blocking(t *testing.T) {
}
start := time.Now()
obj, err := a.srv.AgentConnectProxyConfig(resp, req)
elapsed := time.Now().Sub(start)
elapsed := time.Since(start)
if tt.wantErr {
require.Error(err)

View File

@ -3925,7 +3925,7 @@ func TestAgent_ReloadConfigTLSConfigFailure(t *testing.T) {
require.Len(t, tlsConf.RootCAs.Subjects(), 1)
}
func TestAgent_consulConfig(t *testing.T) {
func TestAgent_consulConfig_AutoEncryptAllowTLS(t *testing.T) {
t.Parallel()
dataDir := testutil.TempDir(t, "agent") // we manage the data dir
defer os.RemoveAll(dataDir)
@ -3941,3 +3941,13 @@ func TestAgent_consulConfig(t *testing.T) {
defer a.Shutdown()
require.True(t, a.consulConfig().AutoEncryptAllowTLS)
}
// TestAgent_consulConfig_RaftTrailingLogs checks that a raft_trailing_logs
// value set in the agent's HCL configuration is carried through to
// RaftConfig.TrailingLogs on the config returned by consulConfig.
func TestAgent_consulConfig_RaftTrailingLogs(t *testing.T) {
t.Parallel()
hcl := `
raft_trailing_logs = 812345
`
a := NewTestAgent(t, t.Name(), hcl)
defer a.Shutdown()
// The expected value is written as a uint64 so require.Equal compares
// like-typed values.
require.Equal(t, uint64(812345), a.consulConfig().RaftConfig.TrailingLogs)
}

View File

@ -703,7 +703,7 @@ func (c *Cache) runExpiryLoop() {
c.entriesLock.RLock()
if len(c.entriesExpiryHeap.Entries) > 0 {
entry = c.entriesExpiryHeap.Entries[0]
expiryTimer = time.NewTimer(entry.Expires.Sub(time.Now()))
expiryTimer = time.NewTimer(time.Until(entry.Expires))
expiryCh = expiryTimer.C
}
c.entriesLock.RUnlock()

View File

@ -189,7 +189,7 @@ func (s *HTTPServer) CatalogServices(resp http.ResponseWriter, req *http.Request
// Use empty map instead of nil
if out.Services == nil {
out.Services = make(structs.Services, 0)
out.Services = make(structs.Services)
}
metrics.IncrCounterWithLabels([]string{"client", "api", "success", "catalog_services"}, 1,
[]metrics.Label{{Name: "node", Value: s.nodeName()}})

View File

@ -862,6 +862,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
RaftProtocol: b.intVal(c.RaftProtocol),
RaftSnapshotThreshold: b.intVal(c.RaftSnapshotThreshold),
RaftSnapshotInterval: b.durationVal("raft_snapshot_interval", c.RaftSnapshotInterval),
RaftTrailingLogs: b.intVal(c.RaftTrailingLogs),
ReconnectTimeoutLAN: b.durationVal("reconnect_timeout", c.ReconnectTimeoutLAN),
ReconnectTimeoutWAN: b.durationVal("reconnect_timeout_wan", c.ReconnectTimeoutWAN),
RejoinAfterLeave: b.boolVal(c.RejoinAfterLeave),

View File

@ -241,6 +241,7 @@ type Config struct {
RaftProtocol *int `json:"raft_protocol,omitempty" hcl:"raft_protocol" mapstructure:"raft_protocol"`
RaftSnapshotThreshold *int `json:"raft_snapshot_threshold,omitempty" hcl:"raft_snapshot_threshold" mapstructure:"raft_snapshot_threshold"`
RaftSnapshotInterval *string `json:"raft_snapshot_interval,omitempty" hcl:"raft_snapshot_interval" mapstructure:"raft_snapshot_interval"`
RaftTrailingLogs *int `json:"raft_trailing_logs,omitempty" hcl:"raft_trailing_logs" mapstructure:"raft_trailing_logs"`
ReconnectTimeoutLAN *string `json:"reconnect_timeout,omitempty" hcl:"reconnect_timeout" mapstructure:"reconnect_timeout"`
ReconnectTimeoutWAN *string `json:"reconnect_timeout_wan,omitempty" hcl:"reconnect_timeout_wan" mapstructure:"reconnect_timeout_wan"`
RejoinAfterLeave *bool `json:"rejoin_after_leave,omitempty" hcl:"rejoin_after_leave" mapstructure:"rejoin_after_leave"`

View File

@ -965,6 +965,22 @@ type RuntimeConfig struct {
// hcl: raft_snapshot_interval = "duration"
RaftSnapshotInterval time.Duration
// RaftTrailingLogs sets the number of log entries that will be left in the
// log store after a snapshot. This must be large enough that a follower can
// transfer and restore an entire snapshot of the state before this many new
// entries have been appended. In the vast majority of cases the default is plenty
// but if there is a sustained high write throughput coupled with a huge
// multi-gigabyte snapshot setting this higher may be necessary to allow
// followers time to reload from snapshot without becoming unhealthy. If it's
// too low then followers are unable to ever recover from a restart and will
// enter a loop of constantly downloading full snapshots and never catching
// up. If you need to change this you should reconsider your usage of Consul
// as it is not designed to store multiple-gigabyte data sets with high write
// throughput. Defaults to 10000.
//
// hcl: raft_trailing_logs = int
RaftTrailingLogs int
// ReconnectTimeoutLAN specifies the amount of time to wait to reconnect with
// another agent before deciding it's permanently gone. This can be used to
// control the time it takes to reap failed nodes from the cluster.

View File

@ -3767,6 +3767,7 @@ func TestFullConfig(t *testing.T) {
"raft_protocol": 19016,
"raft_snapshot_threshold": 16384,
"raft_snapshot_interval": "30s",
"raft_trailing_logs": 83749,
"reconnect_timeout": "23739s",
"reconnect_timeout_wan": "26694s",
"recursors": [ "63.38.39.58", "92.49.18.18" ],
@ -4371,6 +4372,7 @@ func TestFullConfig(t *testing.T) {
raft_protocol = 19016
raft_snapshot_threshold = 16384
raft_snapshot_interval = "30s"
raft_trailing_logs = 83749
reconnect_timeout = "23739s"
reconnect_timeout_wan = "26694s"
recursors = [ "63.38.39.58", "92.49.18.18" ]
@ -5043,6 +5045,7 @@ func TestFullConfig(t *testing.T) {
RaftProtocol: 19016,
RaftSnapshotThreshold: 16384,
RaftSnapshotInterval: 30 * time.Second,
RaftTrailingLogs: 83749,
ReconnectTimeoutLAN: 23739 * time.Second,
ReconnectTimeoutWAN: 26694 * time.Second,
RejoinAfterLeave: true,
@ -5901,6 +5904,7 @@ func TestSanitize(t *testing.T) {
"RaftProtocol": 0,
"RaftSnapshotInterval": "0s",
"RaftSnapshotThreshold": 0,
"RaftTrailingLogs": 0,
"ReconnectTimeoutLAN": "0s",
"ReconnectTimeoutWAN": "0s",
"RejoinAfterLeave": false,

View File

@ -178,7 +178,7 @@ func TestConsulCAProvider_SignLeaf(t *testing.T) {
require.Equal(parsed.SerialNumber.Uint64(), uint64(2))
// Ensure the cert is valid now and expires within the correct limit.
require.True(parsed.NotAfter.Sub(time.Now()) < 3*24*time.Hour)
require.True(time.Until(parsed.NotAfter) < 3*24*time.Hour)
require.True(parsed.NotBefore.Before(time.Now()))
}

View File

@ -186,7 +186,7 @@ func TestVaultCAProvider_SignLeaf(t *testing.T) {
require.NotEqual(firstSerial, parsed.SerialNumber.Uint64())
// Ensure the cert is valid now and expires within the correct limit.
require.True(parsed.NotAfter.Sub(time.Now()) < time.Hour)
require.True(time.Until(parsed.NotAfter) < time.Hour)
require.True(parsed.NotBefore.Before(time.Now()))
}
}

View File

@ -316,7 +316,7 @@ func (r *aclRoleReplicator) FetchUpdated(srv *Server, updates []string) (int, er
delete(keep, role.ID)
}
missing := make([]string, 0, len(keep))
for id, _ := range keep {
for id := range keep {
missing = append(missing, id)
}
return 0, fmt.Errorf("role replication trying to replicated uncached roles with IDs: %v", missing)

View File

@ -596,11 +596,7 @@ key "zip" {
t.Fatalf("err: %v", err)
}
actualKeys = []string{}
for _, key := range keyList.Keys {
actualKeys = append(actualKeys, key)
}
actualKeys = keyList.Keys
verify.Values(t, "", actualKeys, expectedKeys)

View File

@ -1192,7 +1192,7 @@ func (s *Server) pruneCARoots() error {
var newRoots structs.CARoots
for _, r := range roots {
if !r.Active && !r.RotatedOutAt.IsZero() && time.Now().Sub(r.RotatedOutAt) > common.LeafCertTTL*2 {
if !r.Active && !r.RotatedOutAt.IsZero() && time.Since(r.RotatedOutAt) > common.LeafCertTTL*2 {
s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID)
continue
}

View File

@ -51,8 +51,7 @@ func (sl *ServerLookup) ServerAddr(id raft.ServerID) (raft.ServerAddress, error)
func (sl *ServerLookup) Server(addr raft.ServerAddress) *metadata.Server {
sl.lock.RLock()
defer sl.lock.RUnlock()
svr, _ := sl.addressToServer[addr]
return svr
return sl.addressToServer[addr]
}
func (sl *ServerLookup) Servers() []*metadata.Server {

View File

@ -183,6 +183,10 @@ func newServer(c *Config) (*Server, error) {
oldNotify()
}
}
// Restore old notify to guard against re-closing `up` on a retry
defer func() {
c.NotifyListen = oldNotify
}()
// start server
w := c.LogOutput
@ -820,7 +824,6 @@ func TestServer_BadExpect(t *testing.T) {
type fakeGlobalResp struct{}
func (r *fakeGlobalResp) Add(interface{}) {
return
}
func (r *fakeGlobalResp) New() interface{} {

View File

@ -3824,11 +3824,11 @@ func stripIrrelevantTokenFields(token *structs.ACLToken) *structs.ACLToken {
// When comparing the tokens disregard the policy link names. This
// data is not cleanly updated in a variety of scenarios and should not
// be relied upon.
for i, _ := range tokenCopy.Policies {
for i := range tokenCopy.Policies {
tokenCopy.Policies[i].Name = ""
}
// Also do the same for Role links.
for i, _ := range tokenCopy.Roles {
for i := range tokenCopy.Roles {
tokenCopy.Roles[i].Name = ""
}
// The raft indexes won't match either because the requester will not

View File

@ -293,7 +293,7 @@ func (s *HTTPServer) handler(enableDebug bool) http.Handler {
mux.HandleFunc("/", s.Index)
for pattern, fn := range endpoints {
thisFn := fn
methods, _ := allowedMethods[pattern]
methods := allowedMethods[pattern]
bound := func(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
return thisFn(s, resp, req)
}

View File

@ -342,8 +342,6 @@ func (m *Manager) RebalanceServers() {
// continue to use the existing connection until the next
// rebalance occurs.
}
return
}
// reconcileServerList returns true when the first server in serverList

View File

@ -60,7 +60,7 @@ func TestUiIndex(t *testing.T) {
// Verify the body
out := bytes.NewBuffer(nil)
io.Copy(out, resp.Body)
if string(out.Bytes()) != "test" {
if out.String() != "test" {
t.Fatalf("bad: %s", out.Bytes())
}
}

View File

@ -302,7 +302,7 @@ func (c *cmd) captureStatic() error {
var errors error
// Collect the named outputs here
outputs := make(map[string]interface{}, 0)
outputs := make(map[string]interface{})
// Capture host information
if c.configuredTarget("host") {

View File

@ -225,7 +225,7 @@ func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) {
continue
}
case reflect.Bool:
if f.Bool() != false {
if f.Bool() {
continue
}
default:

View File

@ -110,7 +110,7 @@ func dedup(a []string) string {
delete(m, s)
}
}
return string(b.Bytes())
return b.String()
}
func run(r Retryer, t Failer, f func(r *R)) {

View File

@ -407,21 +407,6 @@ will exit with an error at startup.
[Raft Protocol Version Compatibility](/docs/upgrade-specific.html#raft-protocol-version-compatibility)
for more details.
* <a name="_raft_snapshot_threshold"></a><a href="#_raft_snapshot_threshold">`-raft-snapshot-threshold`</a> - This controls the
minimum number of raft commit entries between snapshots that are saved to disk. This is a low-level parameter that should
rarely need to be changed. Very busy clusters experiencing excessive disk IO may increase this value to reduce disk IO, and minimize
the chances of all servers taking snapshots at the same time. Increasing this trades off disk IO for disk space since the log will
grow much larger and the space in the raft.db file can't be reclaimed till the next snapshot. Servers may take longer to recover from
crashes or failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this
defaults to 16384, and in prior versions it was set to 8192.
* <a name="_raft_snapshot_interval"></a><a href="#_raft_snapshot_interval">`-raft-snapshot-interval`</a> - This controls how often servers
check if they need to save a snapshot to disk. This is a low-level parameter that should rarely need to be changed. Very busy clusters
experiencing excessive disk IO may increase this value to reduce disk IO, and minimize the chances of all servers taking snapshots at the same time.
Increasing this trades off disk IO for disk space since the log will grow much larger and the space in the raft.db file can't be reclaimed
till the next snapshot. Servers may take longer to recover from crashes or failover if this is increased significantly as more logs
will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`.
* <a name="_recursor"></a><a href="#_recursor">`-recursor`</a> - Specifies the address of an upstream DNS
server. This option may be provided multiple times, and is functionally
equivalent to the [`recursors` configuration option](#recursors).
@ -1431,11 +1416,46 @@ default will automatically work with some tooling.
* <a name="raft_protocol"></a><a href="#raft_protocol">`raft_protocol`</a> Equivalent to the
[`-raft-protocol` command-line flag](#_raft_protocol).
* <a name="raft_snapshot_threshold"></a><a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> Equivalent to the
[`-raft-snapshot-threshold` command-line flag](#_raft_snapshot_threshold).
<!-- Note the extra _ anchors are here because we used to erroneously list these as
command line flags even though they are not actually defined as valid flags and can
only be set in config file. Duplicating the anchor preserves any existing external links
to the old fragment -->
* <a name="raft_snapshot_threshold"></a><a name="_raft_snapshot_threshold"></a>
<a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> This controls
the minimum number of raft commit entries between snapshots that are saved to
disk. This is a low-level parameter that should rarely need to be changed.
Very busy clusters experiencing excessive disk IO may increase this value to
reduce disk IO, and minimize the chances of all servers taking snapshots at
the same time. Increasing this trades off disk IO for disk space since the log
will grow much larger and the space in the raft.db file can't be reclaimed
till the next snapshot. Servers may take longer to recover from crashes or
failover if this is increased significantly as more logs will need to be
replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior
versions it was set to 8192.
* <a name="raft_snapshot_interval"></a><a href="#raft_snapshot_interval">`raft_snapshot_interval`</a> Equivalent to the
[`-raft-snapshot-interval` command-line flag](#_raft_snapshot_interval).
* <a name="raft_snapshot_interval"></a><a name="_raft_snapshot_interval"></a> <a
href="#raft_snapshot_interval">`raft_snapshot_interval`</a> This controls how
often servers check if they need to save a snapshot to disk. This is a
low-level parameter that should rarely need to be changed. Very busy clusters
experiencing excessive disk IO may increase this value to reduce disk IO, and
minimize the chances of all servers taking snapshots at the same time.
Increasing this trades off disk IO for disk space since the log will grow much
larger and the space in the raft.db file can't be reclaimed till the next
snapshot. Servers may take longer to recover from crashes or failover if this
is increased significantly as more logs will need to be replayed. In Consul
1.1.0 and later this defaults to `30s`, and in prior versions it was set to
`5s`.
* <a name="raft_trailing_logs"></a><a
href="#raft_trailing_logs">`raft_trailing_logs`</a> - This controls how many
log entries are left in the log store on disk after a snapshot is made. This
should only be adjusted when followers cannot catch up to the leader due to a
very large snapshot size and high write throughput causing log truncation
before a snapshot can be fully installed. If you need to use this to recover
a cluster, consider reducing write throughput or the amount of data stored on
Consul as it is likely under a load it is not designed to handle. The default
value is 10000 which is suitable for all normal workloads. Added in Consul
1.5.3.
* <a name="reap"></a><a href="#reap">`reap`</a> This controls Consul's automatic reaping of child processes,
which is useful if Consul is running as PID 1 in a Docker container. If this isn't specified, then Consul will

View File

@ -0,0 +1,286 @@
---
name: "Consul-Kubernetes Deployment Guide"
content_length: 14
id: kubernetes-production-deploy
layout: content_layout
products_used:
- Consul
description: This guide covers the necessary steps to install and configure a new Consul cluster on Kubernetes.
level: Advanced
---
This guide covers the necessary steps to install and configure a new Consul
cluster on Kubernetes, as defined in the [Consul Reference Architecture
guide](/consul/day-1-operations/kubernetes-reference#consul-datacenter-deployed-in-kubernetes).
By the end of this guide, you will be able to identify the installation
prerequisites, customize the Helm chart to fit your environment requirements,
and interact with your new Consul cluster.
~> You should have the following configured before starting this guide: Helm
installed and configured locally, tiller running in the Kubernetes cluster, and
the Kubernetes CLI configured.
## Configure Kubernetes Permissions to Deploy Consul
Before deploying Consul, you will need to create a new Kubernetes service
account with the correct permissions and to authenticate it on the command
line. You will need Kubernetes operator permissions to create and modify
policies, deploy services, access the Kubernetes dashboard, create secrets, and
create RBAC objects. You can find documentation for RBAC and service accounts
for the following cloud providers.
- [AKS](https://docs.microsoft.com/en-us/azure/aks/kubernetes-service-principal)
- [EKS](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html)
- [GCP](https://console.cloud.google.com/iam-admin/serviceaccounts)
Note, Consul can be deployed on any properly configured Kubernetes cluster in
the cloud or on premises.
Once you have a service account, you will also need to add a permission to
deploy the helm chart. This is done with the `clusterrolebinding` method.
```sh
$ kubectl create clusterrolebinding kubernetes-dashboard -n kube-system --clusterrole=cluster-admin --serviceaccount=kube-system:kubernetes-dashboard
```
Finally, you may need to create Kubernetes secrets to store Consul data. You
can reference these secrets in the customized Helm chart values file.
- If you have purchased Enterprise Consul, the enterprise license file should be
used with the official image, `hashicorp/consul-enterprise:1.5.0-ent`.
- Enable
[encryption](https://www.consul.io/docs/agent/encryption.html#gossip-encryption) to secure gossip traffic within the Consul cluster.
~> Note, depending on your environment, the previous secrets may not be
necessary.
## Configure Helm Chart
Now that you have prepared your Kubernetes cluster, you can customize the Helm
chart. First, you will need to download the latest official Helm chart.
```sh
$ git clone https://github.com/hashicorp/consul-helm.git
```
The `consul-helm` directory will contain a `values.yaml` file with example
parameters. You can update this file to customize your Consul deployment. Below
we detail some of the parameters you should customize and provide an example
file, however you should consider your particular production needs when
configuring your chart.
### Global Values
The global values will affect all the other parameters in the chart.
To enable all of the Consul components in the Helm chart, set `enabled` to
`true`. This means servers, clients, Consul DNS, and the Consul UI will be
installed with their defaults. You should also set the following global
parameters based on your specific environment requirements.
- `image` is the name and tag of the Consul Docker image.
- `imagek8s` is the name and tag of the Docker image for the consul-k8s binary.
- `datacenter` the name of your Consul datacenter.
- `domain` the domain Consul uses for DNS queries.
For security, set the `bootstrapACLs` parameter to true. This will enable
Kubernetes to initially set up Consul's [ACL
system](https://www.consul.io/docs/acl/acl-system.html).
Read the Consul Helm chart documentation to review all the [global
parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-global).
### Consul UI
To enable the Consul web UI, update the `ui` section of your values file and set
`enabled` to `true`.
Note, you can also set up a [loadbalancer
resource](https://github.com/hashicorp/demo-consul-101/tree/master/k8s#implement-load-balancer)
or other service type in Kubernetes to make it easier to access the UI.
### Consul Servers
For production deployments, you will need to deploy [3 or 5 Consul
servers](https://www.consul.io/docs/internals/consensus.html#deployment-table)
for quorum and failure tolerance. For most deployments, 3 servers are adequate.
In the server section set both `replicas` and `bootstrapExpect` to 3. This will
deploy three servers and cause Consul to wait to perform leader election until
all three are healthy. The `resources` will depend on your environment; in the
example at the end of the guide, the resources are set for a large environment.
#### Affinity
To ensure the Consul servers are placed on different Kubernetes nodes, you will
need to configure affinity. Otherwise, the failure of one Kubernetes node could
cause the loss of multiple Consul servers, and result in quorum loss. By
default, the example `values.yaml` has affinity configured correctly.
#### Enterprise License
If you have an [Enterprise
license](https://www.hashicorp.com/products/consul/enterprise) you should
reference the Kubernetes secret in the `enterpriseLicense` parameter.
Read the Consul Helm chart documentation to review all the [server
parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-server)
### Consul Clients
A Consul client is deployed on every Kubernetes node, so you do not need to
specify the number of clients for your deployments. You will need to specify
resources and enable gRPC. The resources in the example at the end of this guide
should be
sufficient for most production scenarios since Consul clients are designed for
horizontal scalability. Enabling `grpc` enables the gRPC listener on port 8502
and exposes it to the host. It is required to use Consul Connect.
Read the Consul Helm chart documentation to review all the [client
parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-client)
### Consul Connect Injection Security
Even though you enabled Consul server communication over Connect in the server section, you will also
need to enable `connectInject` by setting `enabled` to `true`. In the
`connectInject` section you will also configure security features. Enabling the
`default` parameter will allow the injector to automatically inject the Connect
sidecar into all pods. If you would prefer to manually annotate which pods to inject, you
can set this to false. Setting the `aclBindingRuleSelector` parameter to
`serviceaccount.name!=default` ensures that new services do not all receive the
same token if you are only using a default service account. This setting is
only necessary if you have enabled ACLs in the global section.
Read more about the [Connect Inject
parameters](https://www.consul.io/docs/platform/k8s/helm.html#v-connectinject).
## Complete Example
Your finished values file should resemble the following example. For more
complete descriptions of all the available parameters see the `values.yaml`
file provided with the Helm chart and the [reference
documentation](https://www.consul.io/docs/platform/k8s/helm.html).
```yaml
# Configure global settings in this section.
global:
# Enable all the components within this chart by default.
enabled: true
# Specify the Consul and consul-k8s images to use
image: "consul:1.5.0"
imagek8s: "hashicorp/consul-k8s:0.8.1"
domain: consul
datacenter: primarydc
# Bootstrap ACLs within Consul. This is highly recommended.
bootstrapACLs: true
# Gossip encryption
gossipEncryption: |
secretName: "encrypt-key"
secretKey: "key"
# Configure your Consul servers in this section.
server:
enabled: true
connect: true
# Specify three servers that wait till all are healthy to bootstrap the Consul cluster.
replicas: 3
bootstrapExpect: 3
# Specify the resources that servers request for placement. These values will serve a large environment.
resources: |
requests:
memory: "32Gi"
cpu: "4"
disk: "50Gi"
limits:
memory: "32Gi"
cpu: "4"
disk: "50Gi"
# If using Enterprise, reference the Kubernetes secret that holds your license here
enterpriseLicense:
secretName: "consul-license"
secretKey: "key"
# Prevent Consul servers from co-location on Kubernetes nodes.
affinity: |
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app: {{ template "consul.name" . }}
release: "{{ .Release.Name }}"
component: server
topologyKey: kubernetes.io/hostname
# Configure Consul clients in this section
client:
enabled: true
# Specify the resources that clients request for deployment.
resources: |
requests:
memory: "8Gi"
cpu: "2"
disk: "15Gi"
limits:
memory: "8Gi"
cpu: "2"
disk: "15Gi"
grpc: true
# Enable and configure the Consul UI.
ui:
enabled: true
# Configure security for Consul Connect pod injection
connectInject:
enabled: true
default: true
namespaceSelector: "my-namespace"
aclBindingRuleSelector: "serviceaccount.name!=default"
```
## Deploy Consul
Now that you have customized the `values.yaml` file, you can deploy Consul with
Helm. This should only take a few minutes. The Consul pods should appear in the
Kubernetes dashboard immediately and you can monitor the deployment process
there.
```sh
$ helm install ./consul-helm -f values.yaml
```
To check the deployment process on the command line you can use `kubectl`.
```sh
$ kubectl get pods
```
## Summary
In this guide, you configured Consul, using the Helm chart, for a production
environment. This involved ensuring that your cluster had a properly
distributed server cluster, specifying enough resources for your agents,
securing the cluster with ACLs and gossip encryption, and enabling other Consul
functionality including Connect and the Consul UI.
Now you can interact with your Consul cluster through the UI or CLI.
If you exposed the UI using a load balancer it will be available at the
`LoadBalancer Ingress` IP address and `Port` that is output from the following
command. Note, you will need to replace `consul-server` with the server name
from your cluster.
```sh
$ kubectl describe services consul-server
```
To access the Consul CLI, open a terminal session using the Kubernetes CLI.
```sh
$ kubectl exec <pod name> -it /bin/ash
```
To learn more about how to interact with your Consul cluster or use it for
service discovery, configuration or segmentation, try one of Learn's
[Operations or Development tracks](/consul/#advanced). Follow the [Security and
Networking track](/consul/?track=security-networking#security-networking) to
learn more about securing your Consul cluster.