Allow raft TrailingLogs to be configured. (#6186)

This fixes pathological cases where the write throughput and snapshot size are both so large that more than 10k log entries are written in the time it takes to restore the snapshot from disk. In this case followers that restart can never catch up with leader replication again and enter a loop of constantly downloading a full snapshot and restoring it only to find that snapshot is already out of date and the leader has truncated its logs so a new snapshot is sent etc.

In general if you need to adjust this, you are probably abusing Consul for purposes outside its design envelope and should reconsider your usage to reduce data size and/or write volume.
This commit is contained in:
Paul Banks 2019-07-23 15:19:57 +01:00 committed by GitHub
parent 3a4e38a13e
commit f38da47c55
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 75 additions and 20 deletions

View File

@ -1162,6 +1162,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
if a.config.RaftSnapshotInterval != 0 {
base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval
}
if a.config.RaftTrailingLogs != 0 {
base.RaftConfig.TrailingLogs = uint64(a.config.RaftTrailingLogs)
}
if a.config.ACLMasterToken != "" {
base.ACLMasterToken = a.config.ACLMasterToken
}

View File

@ -3925,7 +3925,7 @@ func TestAgent_ReloadConfigTLSConfigFailure(t *testing.T) {
require.Len(t, tlsConf.RootCAs.Subjects(), 1)
}
func TestAgent_consulConfig(t *testing.T) {
func TestAgent_consulConfig_AutoEncryptAllowTLS(t *testing.T) {
t.Parallel()
dataDir := testutil.TempDir(t, "agent") // we manage the data dir
defer os.RemoveAll(dataDir)
@ -3941,3 +3941,13 @@ func TestAgent_consulConfig(t *testing.T) {
defer a.Shutdown()
require.True(t, a.consulConfig().AutoEncryptAllowTLS)
}
// TestAgent_consulConfig_RaftTrailingLogs checks that a raft_trailing_logs
// value supplied in the agent's HCL configuration is surfaced as
// RaftConfig.TrailingLogs on the config returned by consulConfig.
func TestAgent_consulConfig_RaftTrailingLogs(t *testing.T) {
	t.Parallel()

	// The expected value is typed as uint64 so the equality check compares
	// like with like against the TrailingLogs field.
	const want uint64 = 812345

	agent := NewTestAgent(t, t.Name(), `
raft_trailing_logs = 812345
`)
	defer agent.Shutdown()

	got := agent.consulConfig().RaftConfig.TrailingLogs
	require.Equal(t, want, got)
}

View File

@ -862,6 +862,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
RaftProtocol: b.intVal(c.RaftProtocol),
RaftSnapshotThreshold: b.intVal(c.RaftSnapshotThreshold),
RaftSnapshotInterval: b.durationVal("raft_snapshot_interval", c.RaftSnapshotInterval),
RaftTrailingLogs: b.intVal(c.RaftTrailingLogs),
ReconnectTimeoutLAN: b.durationVal("reconnect_timeout", c.ReconnectTimeoutLAN),
ReconnectTimeoutWAN: b.durationVal("reconnect_timeout_wan", c.ReconnectTimeoutWAN),
RejoinAfterLeave: b.boolVal(c.RejoinAfterLeave),

View File

@ -239,6 +239,7 @@ type Config struct {
RaftProtocol *int `json:"raft_protocol,omitempty" hcl:"raft_protocol" mapstructure:"raft_protocol"`
RaftSnapshotThreshold *int `json:"raft_snapshot_threshold,omitempty" hcl:"raft_snapshot_threshold" mapstructure:"raft_snapshot_threshold"`
RaftSnapshotInterval *string `json:"raft_snapshot_interval,omitempty" hcl:"raft_snapshot_interval" mapstructure:"raft_snapshot_interval"`
RaftTrailingLogs *int `json:"raft_trailing_logs,omitempty" hcl:"raft_trailing_logs" mapstructure:"raft_trailing_logs"`
ReconnectTimeoutLAN *string `json:"reconnect_timeout,omitempty" hcl:"reconnect_timeout" mapstructure:"reconnect_timeout"`
ReconnectTimeoutWAN *string `json:"reconnect_timeout_wan,omitempty" hcl:"reconnect_timeout_wan" mapstructure:"reconnect_timeout_wan"`
RejoinAfterLeave *bool `json:"rejoin_after_leave,omitempty" hcl:"rejoin_after_leave" mapstructure:"rejoin_after_leave"`

View File

@ -965,6 +965,22 @@ type RuntimeConfig struct {
// hcl: raft_snapshot_interval = "duration"
RaftSnapshotInterval time.Duration
// RaftTrailingLogs sets the number of log entries that will be left in the
// log store after a snapshot. This must be large enough that a follower can
// transfer and restore an entire snapshot of the state before this many new
// entries have been appended. In the vast majority of cases the default is plenty
// but if there is a sustained high write throughput coupled with a huge
// multi-gigabyte snapshot setting this higher may be necessary to allow
// followers time to reload from snapshot without becoming unhealthy. If it's
// too low then followers are unable to ever recover from a restart and will
// enter a loop of constantly downloading full snapshots and never catching
// up. If you need to change this you should reconsider your usage of Consul
// as it is not designed to store multiple-gigabyte data sets with high write
// throughput. Defaults to 10000.
//
// hcl: raft_trailing_logs = int
RaftTrailingLogs int
// ReconnectTimeoutLAN specifies the amount of time to wait to reconnect with
// another agent before deciding it's permanently gone. This can be used to
// control the time it takes to reap failed nodes from the cluster.

View File

@ -3298,6 +3298,7 @@ func TestFullConfig(t *testing.T) {
"raft_protocol": 19016,
"raft_snapshot_threshold": 16384,
"raft_snapshot_interval": "30s",
"raft_trailing_logs": 83749,
"reconnect_timeout": "23739s",
"reconnect_timeout_wan": "26694s",
"recursors": [ "63.38.39.58", "92.49.18.18" ],
@ -3881,6 +3882,7 @@ func TestFullConfig(t *testing.T) {
raft_protocol = 19016
raft_snapshot_threshold = 16384
raft_snapshot_interval = "30s"
raft_trailing_logs = 83749
reconnect_timeout = "23739s"
reconnect_timeout_wan = "26694s"
recursors = [ "63.38.39.58", "92.49.18.18" ]
@ -4532,6 +4534,7 @@ func TestFullConfig(t *testing.T) {
RaftProtocol: 19016,
RaftSnapshotThreshold: 16384,
RaftSnapshotInterval: 30 * time.Second,
RaftTrailingLogs: 83749,
ReconnectTimeoutLAN: 23739 * time.Second,
ReconnectTimeoutWAN: 26694 * time.Second,
RejoinAfterLeave: true,
@ -5353,6 +5356,7 @@ func TestSanitize(t *testing.T) {
"RaftProtocol": 0,
"RaftSnapshotInterval": "0s",
"RaftSnapshotThreshold": 0,
"RaftTrailingLogs": 0,
"ReconnectTimeoutLAN": "0s",
"ReconnectTimeoutWAN": "0s",
"RejoinAfterLeave": false,

View File

@ -407,21 +407,6 @@ will exit with an error at startup.
[Raft Protocol Version Compatibility](/docs/upgrade-specific.html#raft-protocol-version-compatibility)
for more details.
* <a name="_raft_snapshot_threshold"></a><a href="#_raft_snapshot_threshold">`-raft-snapshot-threshold`</a> - This controls the
minimum number of raft commit entries between snapshots that are saved to disk. This is a low-level parameter that should
rarely need to be changed. Very busy clusters experiencing excessive disk IO may increase this value to reduce disk IO, and minimize
the chances of all servers taking snapshots at the same time. Increasing this trades off disk IO for disk space since the log will
grow much larger and the space in the raft.db file can't be reclaimed till the next snapshot. Servers may take longer to recover from
crashes or failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this
defaults to 16384, and in prior versions it was set to 8192.
* <a name="_raft_snapshot_interval"></a><a href="#_raft_snapshot_interval">`-raft-snapshot-interval`</a> - This controls how often servers
check if they need to save a snapshot to disk. This is a low-level parameter that should rarely need to be changed. Very busy clusters
experiencing excessive disk IO may increase this value to reduce disk IO, and minimize the chances of all servers taking snapshots at the same time.
Increasing this trades off disk IO for disk space since the log will grow much larger and the space in the raft.db file can't be reclaimed
till the next snapshot. Servers may take longer to recover from crashes or failover if this is increased significantly as more logs
will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`.
* <a name="_recursor"></a><a href="#_recursor">`-recursor`</a> - Specifies the address of an upstream DNS
server. This option may be provided multiple times, and is functionally
equivalent to the [`recursors` configuration option](#recursors).
@ -1431,11 +1416,46 @@ default will automatically work with some tooling.
* <a name="raft_protocol"></a><a href="#raft_protocol">`raft_protocol`</a> Equivalent to the
[`-raft-protocol` command-line flag](#_raft_protocol).
* <a name="raft_snapshot_threshold"></a><a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> Equivalent to the
[`-raft-snapshot-threshold` command-line flag](#_raft_snapshot_threshold).
<!-- Note the extra _ anchors are here because we used to erroneously list these as
command line flags even though they are not actually defined as valid flags and can
only be set in config file. Duplicating the anchor preserves any existing external links
to the old fragment -->
* <a name="raft_snapshot_threshold"></a><a name="_raft_snapshot_threshold"></a>
<a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> This controls
the minimum number of raft commit entries between snapshots that are saved to
disk. This is a low-level parameter that should rarely need to be changed.
Very busy clusters experiencing excessive disk IO may increase this value to
reduce disk IO, and minimize the chances of all servers taking snapshots at
the same time. Increasing this trades off disk IO for disk space since the log
will grow much larger and the space in the raft.db file can't be reclaimed
till the next snapshot. Servers may take longer to recover from crashes or
failover if this is increased significantly as more logs will need to be
replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior
versions it was set to 8192.
* <a name="raft_snapshot_interval"></a><a href="#raft_snapshot_interval">`raft_snapshot_interval`</a> Equivalent to the
[`-raft-snapshot-interval` command-line flag](#_raft_snapshot_interval).
* <a name="raft_snapshot_interval"></a><a name="_raft_snapshot_interval"></a> <a
href="#raft_snapshot_interval">`raft_snapshot_interval`</a> This controls how
often servers check if they need to save a snapshot to disk. This is a
low-level parameter that should rarely need to be changed. Very busy clusters
experiencing excessive disk IO may increase this value to reduce disk IO, and
minimize the chances of all servers taking snapshots at the same time.
Increasing this trades off disk IO for disk space since the log will grow much
larger and the space in the raft.db file can't be reclaimed till the next
snapshot. Servers may take longer to recover from crashes or failover if this
is increased significantly as more logs will need to be replayed. In Consul
1.1.0 and later this defaults to `30s`, and in prior versions it was set to
`5s`.
* <a name="raft_trailing_logs"></a><a
href="#raft_trailing_logs">`raft_trailing_logs`</a> - This controls how many
log entries are left in the log store on disk after a snapshot is made. This
should only be adjusted when followers cannot catch up to the leader due to a
very large snapshot size and high write throughput causing log truncation
before a snapshot can be fully installed. If you need to use this to recover
a cluster, consider reducing write throughput or the amount of data stored on
Consul as it is likely under a load it is not designed to handle. The default
value is 10000 which is suitable for all normal workloads. Added in Consul
1.5.3.
* <a name="reap"></a><a href="#reap">`reap`</a> This controls Consul's automatic reaping of child processes,
which is useful if Consul is running as PID 1 in a Docker container. If this isn't specified, then Consul will