mirror of https://github.com/status-im/consul.git
Scales coordinate sends to hit a fixed aggregate rate across the cluster.
This commit is contained in:
parent
66a3d29743
commit
ad65d953f6
|
@ -566,7 +566,9 @@ func (a *Agent) ResumeSync() {
|
||||||
// to the server. Closing the agent's shutdownChannel will cause this to exit.
|
// to the server. Closing the agent's shutdownChannel will cause this to exit.
|
||||||
func (a *Agent) sendCoordinate() {
|
func (a *Agent) sendCoordinate() {
|
||||||
for {
|
for {
|
||||||
intv := aeScale(a.config.SyncCoordinateInterval, len(a.LANMembers()))
|
rate := a.config.SyncCoordinateRateTarget
|
||||||
|
min := a.config.SyncCoordinateIntervalMin
|
||||||
|
intv := rateScaledInterval(rate, min, len(a.LANMembers()))
|
||||||
intv = intv + randomStagger(intv)
|
intv = intv + randomStagger(intv)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
|
|
|
@ -375,12 +375,17 @@ type Config struct {
|
||||||
// DisableCoordinates controls features related to network coordinates.
|
// DisableCoordinates controls features related to network coordinates.
|
||||||
DisableCoordinates bool `mapstructure:"disable_coordinates" json:"-"`
|
DisableCoordinates bool `mapstructure:"disable_coordinates" json:"-"`
|
||||||
|
|
||||||
// SyncCoordinateInterval controls the interval for sending network
|
// SyncCoordinateRateTarget controls the rate for sending network
|
||||||
// coordinates to the server. Defaults to every 20s, but scales up as
|
// coordinates to the server, in updates per second. This is the max rate
|
||||||
// the number of nodes increases in the network, to prevent servers from
|
// that the server supports, so we scale our interval based on the size
|
||||||
// being overwhelmed. If you update this, you may need to adjust the
|
// of the cluster to try to achieve this in aggregate at the server.
|
||||||
// tuning of CoordinateUpdatePeriod and CoordinateUpdateMaxBatchSize.
|
SyncCoordinateRateTarget float64 `mapstructure:"-" json:"-"`
|
||||||
SyncCoordinateInterval time.Duration `mapstructure:"-" json:"-"`
|
|
||||||
|
// SyncCoordinateIntervalMin sets the minimum interval that coordinates
|
||||||
|
// will be sent to the server. We scale the interval based on the cluster
|
||||||
|
// size, but below a certain interval it doesn't make sense send them any
|
||||||
|
// faster.
|
||||||
|
SyncCoordinateIntervalMin time.Duration `mapstructure:"-" json:"-"`
|
||||||
|
|
||||||
// Checks holds the provided check definitions
|
// Checks holds the provided check definitions
|
||||||
Checks []*CheckDefinition `mapstructure:"-" json:"-"`
|
Checks []*CheckDefinition `mapstructure:"-" json:"-"`
|
||||||
|
@ -471,18 +476,25 @@ func DefaultConfig() *Config {
|
||||||
DNSConfig: DNSConfig{
|
DNSConfig: DNSConfig{
|
||||||
MaxStale: 5 * time.Second,
|
MaxStale: 5 * time.Second,
|
||||||
},
|
},
|
||||||
StatsitePrefix: "consul",
|
StatsitePrefix: "consul",
|
||||||
SyslogFacility: "LOCAL0",
|
SyslogFacility: "LOCAL0",
|
||||||
Protocol: consul.ProtocolVersionMax,
|
Protocol: consul.ProtocolVersionMax,
|
||||||
CheckUpdateInterval: 5 * time.Minute,
|
CheckUpdateInterval: 5 * time.Minute,
|
||||||
AEInterval: time.Minute,
|
AEInterval: time.Minute,
|
||||||
DisableCoordinates: false,
|
DisableCoordinates: false,
|
||||||
SyncCoordinateInterval: 20 * time.Second,
|
|
||||||
ACLTTL: 30 * time.Second,
|
// SyncCoordinateRateTarget is set based on the rate that we want
|
||||||
ACLDownPolicy: "extend-cache",
|
// the server to handle as an aggregate across the entire cluster.
|
||||||
ACLDefaultPolicy: "allow",
|
// If you update this, you'll need to adjust CoordinateUpdate* in
|
||||||
RetryInterval: 30 * time.Second,
|
// the server-side config accordingly.
|
||||||
RetryIntervalWan: 30 * time.Second,
|
SyncCoordinateRateTarget: 100.0, // updates / second
|
||||||
|
SyncCoordinateIntervalMin: 5 * time.Second,
|
||||||
|
|
||||||
|
ACLTTL: 30 * time.Second,
|
||||||
|
ACLDownPolicy: "extend-cache",
|
||||||
|
ACLDefaultPolicy: "allow",
|
||||||
|
RetryInterval: 30 * time.Second,
|
||||||
|
RetryIntervalWan: 30 * time.Second,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -806,9 +806,10 @@ func TestAgent_nestedPauseResume(t *testing.T) {
|
||||||
|
|
||||||
func TestAgent_sendCoordinate(t *testing.T) {
|
func TestAgent_sendCoordinate(t *testing.T) {
|
||||||
conf := nextConfig()
|
conf := nextConfig()
|
||||||
conf.SyncCoordinateInterval = 10 * time.Millisecond
|
conf.SyncCoordinateRateTarget = 10.0 // updates/sec
|
||||||
|
conf.SyncCoordinateIntervalMin = 1 * time.Millisecond
|
||||||
conf.ConsulConfig.CoordinateUpdatePeriod = 100 * time.Millisecond
|
conf.ConsulConfig.CoordinateUpdatePeriod = 100 * time.Millisecond
|
||||||
conf.ConsulConfig.CoordinateUpdateBatchSize = 15
|
conf.ConsulConfig.CoordinateUpdateBatchSize = 10
|
||||||
conf.ConsulConfig.CoordinateUpdateMaxBatches = 1
|
conf.ConsulConfig.CoordinateUpdateMaxBatches = 1
|
||||||
dir, agent := makeAgent(t, conf)
|
dir, agent := makeAgent(t, conf)
|
||||||
defer os.RemoveAll(dir)
|
defer os.RemoveAll(dir)
|
||||||
|
|
|
@ -39,6 +39,17 @@ func aeScale(interval time.Duration, n int) time.Duration {
|
||||||
return time.Duration(multiplier) * interval
|
return time.Duration(multiplier) * interval
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rateScaledInterval is used to choose an interval to perform an action in order
|
||||||
|
// to target an aggregate number of actions per second across the whole cluster.
|
||||||
|
func rateScaledInterval(rate float64, min time.Duration, n int) time.Duration {
|
||||||
|
interval := time.Duration(float64(time.Second) * float64(n) / rate)
|
||||||
|
if interval < min {
|
||||||
|
return min
|
||||||
|
}
|
||||||
|
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
|
||||||
// Returns a random stagger interval between 0 and the duration
|
// Returns a random stagger interval between 0 and the duration
|
||||||
func randomStagger(intv time.Duration) time.Duration {
|
func randomStagger(intv time.Duration) time.Duration {
|
||||||
return time.Duration(uint64(rand.Int63()) % uint64(intv))
|
return time.Duration(uint64(rand.Int63()) % uint64(intv))
|
||||||
|
|
|
@ -24,6 +24,29 @@ func TestAEScale(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRateScaledInterval(t *testing.T) {
|
||||||
|
min := 1*time.Second
|
||||||
|
rate := 200.0
|
||||||
|
if v := rateScaledInterval(rate, min, 0); v != min {
|
||||||
|
t.Fatalf("Bad: %v", v)
|
||||||
|
}
|
||||||
|
if v := rateScaledInterval(rate, min, 100); v != min {
|
||||||
|
t.Fatalf("Bad: %v", v)
|
||||||
|
}
|
||||||
|
if v := rateScaledInterval(rate, min, 200); v != 1*time.Second {
|
||||||
|
t.Fatalf("Bad: %v", v)
|
||||||
|
}
|
||||||
|
if v := rateScaledInterval(rate, min, 1000); v != 5*time.Second {
|
||||||
|
t.Fatalf("Bad: %v", v)
|
||||||
|
}
|
||||||
|
if v := rateScaledInterval(rate, min, 5000); v != 25*time.Second {
|
||||||
|
t.Fatalf("Bad: %v", v)
|
||||||
|
}
|
||||||
|
if v := rateScaledInterval(rate, min, 10000); v != 50*time.Second {
|
||||||
|
t.Fatalf("Bad: %v", v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRandomStagger(t *testing.T) {
|
func TestRandomStagger(t *testing.T) {
|
||||||
intv := time.Minute
|
intv := time.Minute
|
||||||
for i := 0; i < 10; i++ {
|
for i := 0; i < 10; i++ {
|
||||||
|
|
|
@ -276,13 +276,11 @@ func DefaultConfig() *Config {
|
||||||
SessionTTLMin: 10 * time.Second,
|
SessionTTLMin: 10 * time.Second,
|
||||||
DisableCoordinates: false,
|
DisableCoordinates: false,
|
||||||
|
|
||||||
// SyncCoordinateInterval defaults to 20 seconds, and scales up
|
// These are tuned to provide a total throughput of 128 updates
|
||||||
// as the number of nodes in the cluster goes up. For 100k nodes,
|
// per second. If you update these, you should update the client-
|
||||||
// it will move up to 201 seconds, which gives an update rate of
|
// side SyncCoordinateRateTarget parameter accordingly.
|
||||||
// just under 500 per second coming in from the clients. With this
|
|
||||||
// tuning we will be doing 1 transaction per second at this load.
|
|
||||||
CoordinateUpdatePeriod: 5 * time.Second,
|
CoordinateUpdatePeriod: 5 * time.Second,
|
||||||
CoordinateUpdateBatchSize: 512,
|
CoordinateUpdateBatchSize: 128,
|
||||||
CoordinateUpdateMaxBatches: 5,
|
CoordinateUpdateMaxBatches: 5,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,7 @@ func (c *Coordinate) batchApplyUpdates() error {
|
||||||
limit := c.srv.config.CoordinateUpdateBatchSize * c.srv.config.CoordinateUpdateMaxBatches
|
limit := c.srv.config.CoordinateUpdateBatchSize * c.srv.config.CoordinateUpdateMaxBatches
|
||||||
size := len(pending)
|
size := len(pending)
|
||||||
if size > limit {
|
if size > limit {
|
||||||
c.srv.logger.Printf("[ERR] consul.coordinate: Discarded %d coordinate updates; increase SyncCoordinateInterval", size - limit)
|
c.srv.logger.Printf("[WARN] consul.coordinate: Discarded %d coordinate updates", size - limit)
|
||||||
size = limit
|
size = limit
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue