From 7c91c084571b63d8bbdff6c3717aaa6cc0347a2c Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 10:50:15 -0700 Subject: [PATCH 1/8] agent: first pass at join retry --- command/agent/command.go | 55 ++++++++++++++++++++++++++++++++++++++-- command/agent/config.go | 31 ++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/command/agent/command.go b/command/agent/command.go index 8058bfb0ab..cc1ccf57a8 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -52,6 +52,7 @@ type Command struct { func (c *Command) readConfig() *Config { var cmdConfig Config var configFiles []string + var retryInterval string cmdFlags := flag.NewFlagSet("agent", flag.ContinueOnError) cmdFlags.Usage = func() { c.Ui.Output(c.Help()) } @@ -82,11 +83,26 @@ func (c *Command) readConfig() *Config { "enable re-joining after a previous leave") cmdFlags.Var((*AppendSliceValue)(&cmdConfig.StartJoin), "join", "address of agent to join on startup") + cmdFlags.Var((*AppendSliceValue)(&cmdConfig.RetryJoin), "retry-join", + "address of agent to join on startup with retry") + cmdFlags.IntVar(&cmdConfig.RetryMaxAttempts, "retry-max", 0, + "number of retries for joining") + cmdFlags.StringVar(&retryInterval, "retry-interval", "", + "interval between join attempts") if err := cmdFlags.Parse(c.args); err != nil { return nil } + if retryInterval != "" { + dur, err := time.ParseDuration(retryInterval) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error: %s", err)) + return nil + } + cmdConfig.RetryInterval = dur + } + config := DefaultConfig() if len(configFiles) > 0 { fileConfig, err := ReadConfigPaths(configFiles) @@ -353,6 +369,35 @@ func (c *Command) startupJoin(config *Config) error { return nil } +func (c *Command) retryJoin(config *Config, errCh chan<- struct{}) { + if len(config.RetryJoin) == 0 { + return + } + + logger := c.agent.logger + logger.Printf("[INFO] agent: Joining cluster...") + + attempt := 0 + for { + n, err := c.agent.JoinLAN(config.RetryJoin) + if err == nil { + logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n) + return + } + + attempt++ + if config.RetryMaxAttempts > 0 && attempt > config.RetryMaxAttempts { + logger.Printf("[ERROR] agent: max join retry exhausted, exiting") + close(errCh) + return + } + + logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, + config.RetryInterval) + time.Sleep(config.RetryInterval) + } +} + func (c *Command) Run(args []string) int { c.Ui = &cli.PrefixedUi{ OutputPrefix: "==> ", @@ -492,12 +537,16 @@ func (c *Command) Run(args []string) int { c.Ui.Output("Log data will now stream in as it occurs:\n") logGate.Flush() + // Start retry join process + errCh := make(chan struct{}) + go c.retryJoin(config, errCh) + // Wait for exit - return c.handleSignals(config) + return c.handleSignals(config, errCh) } // handleSignals blocks until we get an exit-causing signal -func (c *Command) handleSignals(config *Config) int { +func (c *Command) handleSignals(config *Config, retryJoin <-chan struct{}) int { signalCh := make(chan os.Signal, 4) signal.Notify(signalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP) @@ -511,6 +560,8 @@ WAIT: sig = syscall.SIGHUP case <-c.ShutdownCh: sig = os.Interrupt + case <-retryJoin: + return 1 case <-c.agent.ShutdownCh(): // Agent is already shutdown! return 0 diff --git a/command/agent/config.go b/command/agent/config.go index c9a47fa2d6..419fa90914 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -182,6 +182,20 @@ type Config struct { // addresses, then the agent will error and exit. StartJoin []string `mapstructure:"start_join"` + // RetryJoin is a list of addresses to join with retry enabled. + RetryJoin []string `mapstructure:"retry_join"` + + // RetryMaxAttempts specifies the maximum number of times to retry joining a + // host on startup. This is useful for cases where we know the node will be + // online eventually. + RetryMaxAttempts int `mapstructure:"retry_max"` + + // RetryInterval specifies the amount of time to wait in between join + // attempts on agent start. The minimum allowed value is 1 second and + // the default is 30s. + RetryInterval time.Duration `mapstructure:"-" json:"-"` + RetryIntervalRaw string `mapstructure:"retry_interval"` + // UiDir is the directory containing the Web UI resources. // If provided, the UI endpoints will be enabled. UiDir string `mapstructure:"ui_dir"` @@ -321,6 +335,7 @@ func DefaultConfig() *Config { ACLTTL: 30 * time.Second, ACLDownPolicy: "extend-cache", ACLDefaultPolicy: "allow", + RetryInterval: 30 * time.Second, } } @@ -443,6 +458,14 @@ func DecodeConfig(r io.Reader) (*Config, error) { result.ACLTTL = dur } + if raw := result.RetryIntervalRaw; raw != "" { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("RetryInterval invalid: %v", err) + } + result.RetryInterval = dur + } + return &result, nil } @@ -674,6 +697,9 @@ func MergeConfig(a, b *Config) *Config { if b.RejoinAfterLeave { result.RejoinAfterLeave = true } + if b.RetryMaxAttempts != 0 { + result.RetryMaxAttempts = b.RetryMaxAttempts + } if b.DNSConfig.NodeTTL != 0 { result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL } @@ -737,6 +763,11 @@ func MergeConfig(a, b *Config) *Config { result.StartJoin = append(result.StartJoin, a.StartJoin...) result.StartJoin = append(result.StartJoin, b.StartJoin...) + // Copy the retry join addresses + result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin)) + result.RetryJoin = append(result.RetryJoin, a.RetryJoin...) + result.RetryJoin = append(result.RetryJoin, b.RetryJoin...) + return &result } From 6d75fc8fb2789e2768ee8e100f0a343135f36f3a Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 10:54:53 -0700 Subject: [PATCH 2/8] agent: merge RetryInterval in config merger --- command/agent/config.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/command/agent/config.go b/command/agent/config.go index 419fa90914..d253dabcd8 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -700,6 +700,9 @@ func MergeConfig(a, b *Config) *Config { if b.RetryMaxAttempts != 0 { result.RetryMaxAttempts = b.RetryMaxAttempts } + if b.RetryInterval != 0 { + result.RetryInterval = b.RetryInterval + } if b.DNSConfig.NodeTTL != 0 { result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL } From 4bc4ba2d0944d9fce5be97c26b4046a523797a9d Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 11:20:33 -0700 Subject: [PATCH 3/8] agent: test retry join config --- command/agent/config_test.go | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/command/agent/config_test.go b/command/agent/config_test.go index e12178af2d..291f4022a4 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -265,6 +265,37 @@ func TestDecodeConfig(t *testing.T) { t.Fatalf("bad: %#v", config) } + // Retry join + input = `{"retry_join": ["1.1.1.1", "2.2.2.2"]}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err != nil { + t.Fatalf("err: %s", err) + } + + if len(config.RetryJoin) != 2 { + t.Fatalf("bad: %#v", config) + } + if config.RetryJoin[0] != "1.1.1.1" { + t.Fatalf("bad: %#v", config) + } + if config.RetryJoin[1] != "2.2.2.2" { + t.Fatalf("bad: %#v", config) + } + + // Retry interval + input = `{"retry_interval": "10s"}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err != nil { + t.Fatalf("err: %s", err) + } + + if config.RetryIntervalRaw != "10s" { + t.Fatalf("bad: %#v", config) + } + if config.RetryInterval.String() != "10s" { + t.Fatalf("bad: %#v", config) + } + // UI Dir input = `{"ui_dir": "/opt/consul-ui"}` config, err = DecodeConfig(bytes.NewReader([]byte(input))) @@ -561,6 +592,7 @@ func TestMergeConfig(t *testing.T) { SkipLeaveOnInt: false, EnableDebug: false, CheckUpdateIntervalRaw: "8m", + RetryIntervalRaw: "10s", } b := &Config{ @@ -611,6 +643,9 @@ func TestMergeConfig(t *testing.T) { UiDir: "/opt/consul-ui", EnableSyslog: true, RejoinAfterLeave: true, + RetryJoin: []string{"1.1.1.1"}, + RetryIntervalRaw: "10s", + RetryInterval: 10 * time.Second, CheckUpdateInterval: 8 * time.Minute, CheckUpdateIntervalRaw: "8m", ACLToken: "1234", From 67e1f363e1584ab3950ef7fa9ac6f33dd0814f2e Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 12:27:03 -0700 Subject: [PATCH 4/8] agent: add retry join tests --- command/agent/command_test.go | 92 ++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/command/agent/command_test.go b/command/agent/command_test.go index 23036a5386..f4d0344087 100644 --- a/command/agent/command_test.go +++ b/command/agent/command_test.go @@ -1,8 +1,14 @@ package agent import ( - "github.com/mitchellh/cli" + "fmt" + "io/ioutil" + "log" + "os" "testing" + + "github.com/hashicorp/consul/testutil" + "github.com/mitchellh/cli" ) func TestCommand_implements(t *testing.T) { @@ -31,3 +37,87 @@ func TestValidDatacenter(t *testing.T) { } } } + +func TestRetryJoin(t *testing.T) { + dir, agent := makeAgent(t, nextConfig()) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + conf2 := nextConfig() + tmpDir, err := ioutil.TempDir("", "consul") + if err != nil { + t.Fatalf("err: %s", err) + } + defer os.RemoveAll(tmpDir) + + doneCh := make(chan struct{}) + shutdownCh := make(chan struct{}) + + defer func() { + close(shutdownCh) + <-doneCh + }() + + cmd := &Command{ + ShutdownCh: shutdownCh, + Ui: new(cli.MockUi), + } + + serfAddr := fmt.Sprintf( + "%s:%d", + agent.config.BindAddr, + agent.config.Ports.SerfLan) + + args := []string{ + "-data-dir", tmpDir, + "-node", conf2.NodeName, + "-retry-join", serfAddr, + "-retry-interval", "1s", + } + + go func() { + if code := cmd.Run(args); code != 0 { + log.Printf("bad: %d", code) + } + close(doneCh) + }() + + testutil.WaitForResult(func() (bool, error) { + mem := agent.LANMembers() + if len(mem) != 2 { + return false, fmt.Errorf("bad: %#v", mem) + } + return true, nil + }, func(err error) { + t.Fatalf(err.Error()) + }) +} + +func TestRetryJoinFail(t *testing.T) { + conf := nextConfig() + tmpDir, err := ioutil.TempDir("", "consul") + if err != nil { + t.Fatalf("err: %s", err) + } + defer os.RemoveAll(tmpDir) + + shutdownCh := make(chan struct{}) + defer close(shutdownCh) + + cmd := &Command{ + ShutdownCh: shutdownCh, + Ui: new(cli.MockUi), + } + + serfAddr := fmt.Sprintf("%s:%d", conf.BindAddr, conf.Ports.SerfLan) + + args := []string{ + "-data-dir", tmpDir, + "-retry-join", serfAddr, + "-retry-max", "1", + } + + if code := cmd.Run(args); code == 0 { + t.Fatalf("bad: %d", code) + } +} From bec34b0c467cc80122e539cc4d8becaa33ef429d Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 12:31:47 -0700 Subject: [PATCH 5/8] website: document retry join --- website/source/docs/agent/options.html.markdown | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index 077fe64727..f9839c9383 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -91,6 +91,16 @@ The options below are all specified on the command-line. unable to join with any of the specified addresses, agent startup will fail. By default, the agent won't join any nodes when it starts up. +* `-retry-join` - Similar to `-join`, but allows retrying a join if the first + attempt fails. This is useful for cases where we know the address will become + available eventually. + +* `-retry-interval` - Time to wait between join attempts. Defaults to 30s. + +* `-retry-max` - The maximum number of join attempts to be made before exiting + with return code 1. By default, this is set to 0, which will continue to + retry the join indefinitely. + * `-log-level` - The level of logging to show after the Consul agent has started. This defaults to "info". The available log levels are "trace", "debug", "info", "warn", "err". This is the log level that will be shown From 62b4752804f5b2b1f706187e0f453d6fd6bffdf1 Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 12:35:25 -0700 Subject: [PATCH 6/8] command/agent: add help for retry join --- command/agent/command.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/command/agent/command.go b/command/agent/command.go index cc1ccf57a8..ce036089de 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -720,6 +720,11 @@ Options: -encrypt=key Provides the gossip encryption key -join=1.2.3.4 Address of an agent to join at start time. Can be specified multiple times. + -retry-join=1.2.3.4 Address of an agent to join at start time with + retries enabled. Can be specified multiple times. + -retry-interval=30s Time to wait between join attempts. + -retry-max=0 Maximum number of join attempts. Defaults to 0, which + will retry indefinitely. -log-level=info Log level of the agent. -node=hostname Name of this node. Must be unique in the cluster -protocol=N Sets the protocol version. Defaults to latest. From 599d0558e7d2b8aed95c31c64698d72d7892d06a Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 12:40:52 -0700 Subject: [PATCH 7/8] agent: test max retries in config --- command/agent/config_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 291f4022a4..afaf2f668d 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -296,6 +296,17 @@ func TestDecodeConfig(t *testing.T) { t.Fatalf("bad: %#v", config) } + // Retry Max + input = `{"retry_max": 3}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err != nil { + t.Fatalf("err: %s", err) + } + + if config.RetryMaxAttempts != 3 { + t.Fatalf("bad: %#v", config) + } + // UI Dir input = `{"ui_dir": "/opt/consul-ui"}` config, err = DecodeConfig(bytes.NewReader([]byte(input))) From b35578a9178f94643f7d97c28ca89ff366af4c16 Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Sun, 12 Oct 2014 12:45:40 -0700 Subject: [PATCH 8/8] command: formatting --- command/agent/command.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/command/agent/command.go b/command/agent/command.go index ce036089de..5cf54eb5e0 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -369,6 +369,8 @@ func (c *Command) startupJoin(config *Config) error { return nil } +// retryJoin is used to handle retrying a join until it succeeds or all +// retries are exhausted. func (c *Command) retryJoin(config *Config, errCh chan<- struct{}) { if len(config.RetryJoin) == 0 { return