Add per-agent reconnect timeouts (#8781)

This allows for client agent to be run in a more stateless manner where they may be abruptly terminated and not expected to come back. If advertising a per-agent reconnect timeout using the advertise_reconnect_timeout configuration when that agent leaves, other agents will wait only that amount of time for the agent to come back before reaping it.

This has the advantageous side effect of causing servers to deregister the node/services/checks for that agent sooner than if the global reconnect_timeout was used.
This commit is contained in:
Matt Keeler 2020-10-08 15:02:19 -04:00 committed by GitHub
parent 57a7057067
commit 38f5ddce2a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 275 additions and 75 deletions

3
.changelog/8781.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:feature
agent: Allow client agents to be configured with an advertised reconnect timeout to control how long until the nodes are reaped by others in the cluster.
```

View File

@ -1025,6 +1025,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co
cfg.SerfWANConfig = nil
}
cfg.AdvertiseReconnectTimeout = runtimeCfg.AdvertiseReconnectTimeout
cfg.RPCAddr = runtimeCfg.RPCBindAddr
cfg.RPCAdvertise = runtimeCfg.RPCAdvertiseAddr

View File

@ -949,11 +949,12 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
},
// Agent
AdvertiseAddrLAN: advertiseAddrLAN,
AdvertiseAddrWAN: advertiseAddrWAN,
BindAddr: bindAddr,
Bootstrap: b.boolVal(c.Bootstrap),
BootstrapExpect: b.intVal(c.BootstrapExpect),
AdvertiseAddrLAN: advertiseAddrLAN,
AdvertiseAddrWAN: advertiseAddrWAN,
AdvertiseReconnectTimeout: b.durationVal("advertise_reconnect_timeout", c.AdvertiseReconnectTimeout),
BindAddr: bindAddr,
Bootstrap: b.boolVal(c.Bootstrap),
BootstrapExpect: b.intVal(c.BootstrapExpect),
Cache: cache.Options{
EntryFetchRate: rate.Limit(
b.float64ValWithDefault(c.Cache.EntryFetchRate, float64(cache.DefaultEntryFetchRate)),
@ -1389,6 +1390,10 @@ func (b *Builder) Validate(rt RuntimeConfig) error {
return fmt.Errorf("auto_encrypt.allow_tls can only be used on a server.")
}
if rt.ServerMode && rt.AdvertiseReconnectTimeout != 0 {
return fmt.Errorf("advertise_reconnect_timeout can only be used on a client")
}
// ----------------------------------------------------------------
// warnings
//

View File

@ -142,6 +142,7 @@ type Config struct {
AdvertiseAddrWAN *string `json:"advertise_addr_wan,omitempty" hcl:"advertise_addr_wan" mapstructure:"advertise_addr_wan"`
AdvertiseAddrWANIPv4 *string `json:"advertise_addr_wan_ipv4,omitempty" hcl:"advertise_addr_wan_ipv4" mapstructure:"advertise_addr_wan_ipv4"`
AdvertiseAddrWANIPv6 *string `json:"advertise_addr_wan_ipv6,omitempty" hcl:"advertise_addr_wan_ipv6" mapstructure:"advertise_addr_ipv6"`
AdvertiseReconnectTimeout *string `json:"advertise_reconnect_timeout,omitempty" hcl:"advertise_reconnect_timeout" mapstructure:"advertise_reconnect_timeout"`
AutoConfig AutoConfigRaw `json:"auto_config,omitempty" hcl:"auto_config" mapstructure:"auto_config"`
Autopilot Autopilot `json:"autopilot,omitempty" hcl:"autopilot" mapstructure:"autopilot"`
BindAddr *string `json:"bind_addr,omitempty" hcl:"bind_addr" mapstructure:"bind_addr"`

View File

@ -980,6 +980,13 @@ type RuntimeConfig struct {
// hcl: reconnect_timeout = "duration"
ReconnectTimeoutWAN time.Duration
// AdvertiseReconnectTimeout specifies the amount of time other agents should
// wait for us to reconnect before deciding we are permanently gone. This
// should only be set for client agents that are run in a stateless or
// ephemeral manner in order to realize their deletion sooner than we
// would otherwise.
AdvertiseReconnectTimeout time.Duration
// RejoinAfterLeave controls our interaction with the cluster after leave.
// When set to false (default), a leave causes Consul to not rejoin
// the cluster until an explicit join is received. If this is set to

View File

@ -51,7 +51,7 @@ type configTest struct {
// should check one option at a time if possible and should use generic
// values, e.g. 'a' or 1 instead of 'servicex' or 3306.
func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) {
func TestBuilder_BuildAndValidate_ConfigFlagsAndEdgecases(t *testing.T) {
dataDir := testutil.TempDir(t, "consul")
defaultEntMeta := structs.DefaultEnterpriseMeta()
@ -4438,7 +4438,6 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) {
rt.CertFile = "foo"
},
},
// UI Config tests
{
desc: "ui config deprecated",
@ -4601,6 +4600,23 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) {
`},
err: `ui_config.dashboard_url_templates values must be a valid http or https URL.`,
},
// Per node reconnect timeout test
{
desc: "server and advertised reconnect timeout error",
args: []string{
`-data-dir=` + dataDir,
`-server`,
},
hcl: []string{`
advertise_reconnect_timeout = "5s"
`},
json: []string{`
{
"advertise_reconnect_timeout": "5s"
}`},
err: "advertise_reconnect_timeout can only be used on a client",
},
}
testConfig(t, tests, dataDir)
@ -4834,6 +4850,7 @@ func TestFullConfig(t *testing.T) {
},
"advertise_addr": "17.99.29.16",
"advertise_addr_wan": "78.63.37.19",
"advertise_reconnect_timeout": "0s",
"audit": {
"enabled": false
},
@ -5515,6 +5532,7 @@ func TestFullConfig(t *testing.T) {
}
advertise_addr = "17.99.29.16"
advertise_addr_wan = "78.63.37.19"
advertise_reconnect_timeout = "0s"
audit = {
enabled = false
}
@ -6295,6 +6313,7 @@ func TestFullConfig(t *testing.T) {
ACLTokenReplication: true,
AdvertiseAddrLAN: ipAddr("17.99.29.16"),
AdvertiseAddrWAN: ipAddr("78.63.37.19"),
AdvertiseReconnectTimeout: 0 * time.Second,
AutopilotCleanupDeadServers: true,
AutopilotDisableUpgradeMigration: true,
AutopilotLastContactThreshold: 12705 * time.Second,
@ -7278,6 +7297,7 @@ func TestSanitize(t *testing.T) {
"AEInterval": "0s",
"AdvertiseAddrLAN": "",
"AdvertiseAddrWAN": "",
"AdvertiseReconnectTimeout": "0s",
"AutopilotCleanupDeadServers": false,
"AutopilotDisableUpgradeMigration": false,
"AutopilotLastContactThreshold": "0s",

View File

@ -6,7 +6,7 @@ import (
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/serf"
)
var clientACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{
@ -123,5 +123,5 @@ func (c *Client) ResolveTokenAndDefaultMeta(token string, entMeta *structs.Enter
func (c *Client) updateSerfTags(key, value string) {
// Update the LAN serf
lib.UpdateSerfTag(c.serf, key, value)
serf.UpdateTag(c.serf, key, value)
}

View File

@ -6,7 +6,7 @@ import (
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/serf"
)
var serverACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{
@ -86,10 +86,10 @@ func (s *Server) checkBindingRuleUUID(id string) (bool, error) {
func (s *Server) updateSerfTags(key, value string) {
// Update the LAN serf
lib.UpdateSerfTag(s.serfLAN, key, value)
serf.UpdateTag(s.serfLAN, key, value)
if s.serfWAN != nil {
lib.UpdateSerfTag(s.serfWAN, key, value)
serf.UpdateTag(s.serfWAN, key, value)
}
s.updateEnterpriseSerfTags(key, value)

View File

@ -8,6 +8,7 @@ import (
"github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib"
libserf "github.com/hashicorp/consul/lib/serf"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/go-hclog"
@ -27,6 +28,9 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin)
conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax)
conf.Tags["build"] = c.config.Build
if c.config.AdvertiseReconnectTimeout != 0 {
conf.Tags[libserf.ReconnectTimeoutTag] = c.config.AdvertiseReconnectTimeout.String()
}
if c.acls.ACLsEnabled() {
// we start in legacy mode and then transition to normal
// mode once we know the cluster can handle it.
@ -65,6 +69,8 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
c.addEnterpriseSerfTags(conf.Tags)
conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(c.logger)
return serf.Create(conf)
}

View File

@ -747,3 +747,34 @@ func TestClient_Reload(t *testing.T) {
require.Equal(t, rate.Limit(1000), limiter.Limit())
require.Equal(t, 10000, limiter.Burst())
}
func TestClient_ShortReconnectTimeout(t *testing.T) {
cluster := newTestCluster(t, &testClusterConfig{
Datacenter: "dc1",
Servers: 1,
Clients: 2,
ServerConf: func(c *Config) {
c.SerfLANConfig.ReapInterval = 50 * time.Millisecond
},
ClientConf: func(c *Config) {
c.SerfLANConfig.ReapInterval = 50 * time.Millisecond
c.AdvertiseReconnectTimeout = 100 * time.Millisecond
},
})
// shutdown the client
cluster.Clients[1].Shutdown()
// Now wait for it to be reaped. We set the advertised reconnect
// timeout to 100ms so we are going to check every 50 ms and allow
// up to 10x the time in the case of slow CI.
require.Eventually(t,
func() bool {
return len(cluster.Servers[0].LANMembers()) == 2 &&
len(cluster.Clients[0].LANMembers()) == 2
},
time.Second,
50*time.Millisecond,
"The client node was not reaped within the alotted time")
}

View File

@ -9,7 +9,7 @@ import (
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib"
libserf "github.com/hashicorp/consul/lib/serf"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/consul/version"
@ -219,6 +219,11 @@ type Config struct {
// true, we ignore the leave, and rejoin the cluster on start.
RejoinAfterLeave bool
// AdvertiseReconnectTimeout is the duration after which this node should be
// assumed to not be returning and thus should be reaped within Serf. This
// can only be set for Client agents
AdvertiseReconnectTimeout time.Duration
// Build is a string that is gossiped around, and can be used to help
// operators track which versions are actively deployed
Build string
@ -544,8 +549,8 @@ func DefaultConfig() *Config {
NodeName: hostname,
RPCAddr: DefaultRPCAddr,
RaftConfig: raft.DefaultConfig(),
SerfLANConfig: lib.SerfDefaultConfig(),
SerfWANConfig: lib.SerfDefaultConfig(),
SerfLANConfig: libserf.DefaultConfig(),
SerfWANConfig: libserf.DefaultConfig(),
SerfFloodInterval: 60 * time.Second,
ReconcileInterval: 60 * time.Second,
ProtocolVersion: ProtocolVersion2Compatible,

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib"
libserf "github.com/hashicorp/consul/lib/serf"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/memberlist"
@ -168,6 +169,8 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w
return nil, err
}
conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(s.logger)
s.addEnterpriseSerfTags(conf.Tags)
if s.config.OverrideInitialSerfTags != nil {

View File

@ -10,7 +10,7 @@ require (
github.com/hashicorp/go-hclog v0.12.0
github.com/hashicorp/go-rootcerts v1.0.2
github.com/hashicorp/go-uuid v1.0.1
github.com/hashicorp/serf v0.9.3
github.com/hashicorp/serf v0.9.5
github.com/mitchellh/mapstructure v1.1.2
github.com/stretchr/testify v1.4.0
)

View File

@ -41,8 +41,8 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO
github.com/hashicorp/mdns v1.0.1/go.mod h1:4gW7WsVCke5TE7EPeYliwHlRUyBtfCwuFwuMg2DmyNY=
github.com/hashicorp/memberlist v0.2.2 h1:5+RffWKwqJ71YPu9mWsF7ZOscZmwfasdA8kbdC7AO2g=
github.com/hashicorp/memberlist v0.2.2/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
github.com/hashicorp/serf v0.9.3 h1:AVF6JDQQens6nMHT9OGERBvK0f8rPrAGILnsKLr6lzM=
github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=

2
go.mod
View File

@ -56,7 +56,7 @@ require (
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
github.com/hashicorp/raft v1.1.2
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
github.com/hashicorp/serf v0.9.4
github.com/hashicorp/serf v0.9.5
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086
github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d
github.com/imdario/mergo v0.3.6

5
go.sum
View File

@ -288,9 +288,8 @@ github.com/hashicorp/raft v1.1.2 h1:oxEL5DDeurYxLd3UbcY/hccgSPhLLpiBZ1YxtWEq59c=
github.com/hashicorp/raft v1.1.2/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
github.com/hashicorp/serf v0.9.4 h1:xrZ4ZR0wT5Dz8oQHHdfOzr0ei1jMToWlFFz3hh/DI7I=
github.com/hashicorp/serf v0.9.4/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 h1:OKsyxKi2sNmqm1Gv93adf2AID2FOBFdCbbZn9fGtIdg=
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086/go.mod h1:R3Umvhlxi2TN7Ex2hzOowyeNb+SfbVWI973N+ctaFMk=
github.com/hashicorp/vault/sdk v0.1.14-0.20200519221838-e0cfd64bc267 h1:e1ok06zGrWJW91rzRroyl5nRNqraaBe4d5hiKcVZuHM=

View File

@ -1,44 +0,0 @@
package lib
import (
"time"
"github.com/hashicorp/serf/serf"
)
// SerfDefaultConfig returns a Consul-flavored Serf default configuration,
// suitable as a basis for a LAN, WAN, segment, or area.
func SerfDefaultConfig() *serf.Config {
base := serf.DefaultConfig()
// This effectively disables the annoying queue depth warnings.
base.QueueDepthWarning = 1000000
// This enables dynamic sizing of the message queue depth based on the
// cluster size.
base.MinQueueDepth = 4096
// This gives leaves some time to propagate through the cluster before
// we shut down. The value was chosen to be reasonably short, but to
// allow a leave to get to over 99.99% of the cluster with 100k nodes
// (using https://www.serf.io/docs/internals/simulator.html).
base.LeavePropagateDelay = 3 * time.Second
return base
}
func GetSerfTags(serf *serf.Serf) map[string]string {
tags := make(map[string]string)
for tag, value := range serf.LocalMember().Tags {
tags[tag] = value
}
return tags
}
func UpdateSerfTag(serf *serf.Serf, tag, value string) {
tags := GetSerfTags(serf)
tags[tag] = value
serf.SetTags(tags)
}

82
lib/serf/serf.go Normal file
View File

@ -0,0 +1,82 @@
package serf
import (
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/serf/serf"
)
const (
ReconnectTimeoutTag = "rc_tm"
)
// DefaultConfig returns a Consul-flavored Serf default configuration,
// suitable as a basis for a LAN, WAN, segment, or area.
func DefaultConfig() *serf.Config {
base := serf.DefaultConfig()
// This effectively disables the annoying queue depth warnings.
base.QueueDepthWarning = 1000000
// This enables dynamic sizing of the message queue depth based on the
// cluster size.
base.MinQueueDepth = 4096
// This gives leaves some time to propagate through the cluster before
// we shut down. The value was chosen to be reasonably short, but to
// allow a leave to get to over 99.99% of the cluster with 100k nodes
// (using https://www.serf.io/docs/internals/simulator.html).
base.LeavePropagateDelay = 3 * time.Second
return base
}
func GetTags(serf *serf.Serf) map[string]string {
tags := make(map[string]string)
for tag, value := range serf.LocalMember().Tags {
tags[tag] = value
}
return tags
}
func UpdateTag(serf *serf.Serf, tag, value string) {
tags := GetTags(serf)
tags[tag] = value
serf.SetTags(tags)
}
type ReconnectOverride struct {
logger hclog.Logger
}
func NewReconnectOverride(logger hclog.Logger) *ReconnectOverride {
if logger == nil {
logger = hclog.Default()
}
return &ReconnectOverride{
logger: logger,
}
}
func (r *ReconnectOverride) ReconnectTimeout(m *serf.Member, timeout time.Duration) time.Duration {
val, ok := m.Tags[ReconnectTimeoutTag]
if !ok {
return timeout
}
newTimeout, err := time.ParseDuration(val)
if err != nil {
r.logger.Warn("Member is advertising a malformed reconnect timeout", "member", m.Name, "rc_tm", val)
return timeout
}
// ignore a timeout of 0 as that indicates the default should be used
if newTimeout == 0 {
return timeout
}
return newTimeout
}

View File

@ -11,7 +11,7 @@ import (
// given config.
func GenerateClients(nodes int, config *Config) ([]*Client, error) {
clients := make([]*Client, nodes)
for i, _ := range clients {
for i := range clients {
client, err := NewClient(config)
if err != nil {
return nil, err
@ -146,7 +146,7 @@ func Simulate(clients []*Client, truth [][]time.Duration, cycles int) {
nodes := len(clients)
for cycle := 0; cycle < cycles; cycle++ {
for i, _ := range clients {
for i := range clients {
if j := rand.Intn(nodes); j != i {
c := clients[j].GetCoordinate()
rtt := truth[i][j]

View File

@ -253,6 +253,15 @@ type Config struct {
//
// WARNING: this should ONLY be used in tests
messageDropper func(typ messageType) bool
// ReconnectTimeoutOverride is an optional interface which when present allows
// the application to cause reaping of a node to happen when it otherwise wouldn't
ReconnectTimeoutOverride ReconnectTimeoutOverrider
// ValidateNodeNames controls whether nodenames only
// contain alphanumeric, dashes and '.'characters
// and sets maximum length to 128 characters
ValidateNodeNames bool
}
// Init allocates the subdata structures
@ -298,6 +307,7 @@ func DefaultConfig() *Config {
QuerySizeLimit: 1024,
EnableNameConflictResolution: true,
DisableCoordinates: false,
ValidateNodeNames: false,
UserEventSizeLimit: 512,
}
}

View File

@ -1,6 +1,7 @@
package serf
import (
"fmt"
"net"
"github.com/hashicorp/memberlist"
@ -17,22 +18,31 @@ type mergeDelegate struct {
func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) error {
members := make([]*Member, len(nodes))
for idx, n := range nodes {
members[idx] = m.nodeToMember(n)
var err error
members[idx], err = m.nodeToMember(n)
if err != nil {
return err
}
}
return m.serf.config.Merge.NotifyMerge(members)
}
func (m *mergeDelegate) NotifyAlive(peer *memberlist.Node) error {
member := m.nodeToMember(peer)
member, err := m.nodeToMember(peer)
if err != nil {
return err
}
return m.serf.config.Merge.NotifyMerge([]*Member{member})
}
func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member {
func (m *mergeDelegate) nodeToMember(n *memberlist.Node) (*Member, error) {
status := StatusNone
if n.State == memberlist.StateLeft {
status = StatusLeft
}
if err := m.validateMemberInfo(n); err != nil {
return nil, err
}
return &Member{
Name: n.Name,
Addr: net.IP(n.Addr),
@ -45,5 +55,22 @@ func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member {
DelegateMin: n.DMin,
DelegateMax: n.DMax,
DelegateCur: n.DCur,
}
}, nil
}
// validateMemberInfo checks that the data we are sending is valid
func (m *mergeDelegate) validateMemberInfo(n *memberlist.Node) error {
if err := m.serf.validateNodeName(n.Name); err != nil {
return err
}
if len(n.Addr) != 4 && len(n.Addr) != 16 {
return fmt.Errorf("IP byte length is invalid: %d bytes is not either 4 or 16", len(n.Addr))
}
if len(n.Meta) > memberlist.MetaMaxSize {
return fmt.Errorf("Encoded length of tags exceeds limit of %d bytes",
memberlist.MetaMaxSize)
}
return nil
}

View File

@ -11,6 +11,7 @@ import (
"math/rand"
"net"
"os"
"regexp"
"strconv"
"sync"
"sync/atomic"
@ -36,6 +37,8 @@ const (
tagMagicByte uint8 = 255
)
const MaxNodeNameLength int = 128
var (
// FeatureNotSupported is returned if a feature cannot be used
// due to an older protocol version being used.
@ -47,6 +50,12 @@ func init() {
rand.Seed(time.Now().UnixNano())
}
// ReconnectTimeoutOverrider is an interface that can be implemented to allow overriding
// the reconnect timeout for individual members.
type ReconnectTimeoutOverrider interface {
ReconnectTimeout(member *Member, timeout time.Duration) time.Duration
}
// Serf is a single node that is part of a single cluster that gets
// events about joins/leaves/failures/etc. It is created with the Create
// method.
@ -269,6 +278,9 @@ func Create(conf *Config) (*Serf, error) {
if len(serf.encodeTags(conf.Tags)) > memberlist.MetaMaxSize {
return nil, fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", memberlist.MetaMaxSize)
}
if err := serf.ValidateNodeNames(); err != nil {
return nil, err
}
// Check if serf member event coalescing is enabled
if conf.CoalescePeriod > 0 && conf.QuiescentPeriod > 0 && conf.EventCh != nil {
@ -1571,8 +1583,13 @@ func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) []
for i := 0; i < n; i++ {
m := old[i]
memberTimeout := timeout
if s.config.ReconnectTimeoutOverride != nil {
memberTimeout = s.config.ReconnectTimeoutOverride.ReconnectTimeout(&m.Member, memberTimeout)
}
// Skip if the timeout is not yet reached
if now.Sub(m.leaveTime) <= timeout {
if now.Sub(m.leaveTime) <= memberTimeout {
continue
}
@ -1884,3 +1901,24 @@ func (s *Serf) NumNodes() (numNodes int) {
return numNodes
}
// ValidateNodeNames verifies the NodeName contains
// only alphanumeric, -, or . and is under 128 chracters
func (s *Serf) ValidateNodeNames() error {
return s.validateNodeName(s.config.NodeName)
}
func (s *Serf) validateNodeName(name string) error {
if s.config.ValidateNodeNames {
var InvalidNameRe = regexp.MustCompile(`[^A-Za-z0-9\-\.]+`)
if InvalidNameRe.MatchString(name) {
return fmt.Errorf("Node name contains invalid characters %v , Valid characters include "+
"all alpha-numerics and dashes and '.' ", name)
}
if len(name) > MaxNodeNameLength {
return fmt.Errorf("Node name is %v characters. "+
"Valid length is between 1 and 128 characters", len(name))
}
}
return nil
}

2
vendor/modules.txt vendored
View File

@ -280,7 +280,7 @@ github.com/hashicorp/net-rpc-msgpackrpc
github.com/hashicorp/raft
# github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
github.com/hashicorp/raft-boltdb
# github.com/hashicorp/serf v0.9.4
# github.com/hashicorp/serf v0.9.5
github.com/hashicorp/serf/coordinate
github.com/hashicorp/serf/serf
# github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086

View File

@ -834,6 +834,11 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
- `advertise_addr_wan_ipv6` This was added together with [`advertise_addr_wan_ipv4`](#advertise_addr_wan_ipv4) to support dual stack IPv4/IPv6 environments. Using this, both IPv4 and IPv6 addresses can be specified and requested during eg service discovery.
- `advertise_reconnect_timeout` This is a per-agent setting of the [`reconnect_timeout`](#reconnect_timeout) parameter.
This agent will advertise to all other nodes in the cluster that after this timeout, the node may be completely
removed from the cluster. This may only be set on client agents and if unset then other nodes will use the main
`reconnect_timeout` setting when determing when this node may be removed from the cluster.
- `serf_lan` ((#serf_lan_bind)) Equivalent to the [`-serf-lan-bind` command-line flag](#_serf_lan_bind).
- `serf_lan_allowed_cidrs` ((#serf_lan_allowed_cidrs)) Equivalent to the [`-serf-lan-allowed-cidrs` command-line flag](#_serf_lan_allowed_cidrs).