From b023904298ea5361bebbf9eaadc9418f583a69e6 Mon Sep 17 00:00:00 2001 From: Peter Fern Date: Thu, 23 Jul 2015 21:45:08 +1000 Subject: [PATCH 1/2] Add TCP check type Adds the ability to simply check whether a TCP socket accepts connections to determine if it is healthy. This is a light-weight - though less comprehensive than scripting - method of checking network service health. The check parameter `tcp` should be set to the `address:port` combination for the service to be tested. Supports both IPv6 and IPv4, in the case of a hostname that resolves to both, connections will be attempted via both protocol versions, with the first successful connection returning a successful check result. Example check: ```json { "check": { "id": "ssh", "name": "SSH (TCP)", "tcp": "example.com:22", "interval": "10s" } } ``` --- api/agent.go | 1 + command/agent/agent.go | 33 +++++++++++++ command/agent/check.go | 98 +++++++++++++++++++++++++++++++++++-- command/agent/check_test.go | 72 +++++++++++++++++++++++++++ 4 files changed, 200 insertions(+), 4 deletions(-) diff --git a/api/agent.go b/api/agent.go index e56a18dcd2..2b950d0a3e 100644 --- a/api/agent.go +++ b/api/agent.go @@ -68,6 +68,7 @@ type AgentServiceCheck struct { Timeout string `json:",omitempty"` TTL string `json:",omitempty"` HTTP string `json:",omitempty"` + TCP string `json:",omitempty"` Status string `json:",omitempty"` } type AgentServiceChecks []*AgentServiceCheck diff --git a/command/agent/agent.go b/command/agent/agent.go index 449b03c476..397f616a43 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -75,6 +75,9 @@ type Agent struct { // checkHTTPs maps the check ID to an associated HTTP check checkHTTPs map[string]*CheckHTTP + // checkTCPs maps the check ID to an associated TCP check + checkTCPs map[string]*CheckTCP + // checkTTLs maps the check ID to an associated check TTL checkTTLs map[string]*CheckTTL @@ -145,6 +148,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) { 
checkMonitors: make(map[string]*CheckMonitor), checkTTLs: make(map[string]*CheckTTL), checkHTTPs: make(map[string]*CheckHTTP), + checkTCPs: make(map[string]*CheckTCP), eventCh: make(chan serf.UserEvent, 1024), eventBuf: make([]*UserEvent, 256), shutdownCh: make(chan struct{}), @@ -440,6 +444,10 @@ func (a *Agent) Shutdown() error { chk.Stop() } + for _, chk := range a.checkTCPs { + chk.Stop() + } + a.logger.Println("[INFO] agent: requesting shutdown") var err error if a.server != nil { @@ -801,6 +809,27 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist http.Start() a.checkHTTPs[check.CheckID] = http + } else if chkType.IsTCP() { + if existing, ok := a.checkTCPs[check.CheckID]; ok { + existing.Stop() + } + if chkType.Interval < MinInterval { + a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", + check.CheckID, MinInterval)) + chkType.Interval = MinInterval + } + + tcp := &CheckTCP{ + Notify: &a.state, + CheckID: check.CheckID, + TCP: chkType.TCP, + Interval: chkType.Interval, + Timeout: chkType.Timeout, + Logger: a.logger, + } + tcp.Start() + a.checkTCPs[check.CheckID] = tcp + } else { if existing, ok := a.checkMonitors[check.CheckID]; ok { existing.Stop() @@ -857,6 +886,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error { check.Stop() delete(a.checkHTTPs, checkID) } + if check, ok := a.checkTCPs[checkID]; ok { + check.Stop() + delete(a.checkTCPs, checkID) + } if check, ok := a.checkTTLs[checkID]; ok { check.Stop() delete(a.checkTTLs, checkID) diff --git a/command/agent/check.go b/command/agent/check.go index 6677483886..97ac592eb4 100644 --- a/command/agent/check.go +++ b/command/agent/check.go @@ -4,6 +4,7 @@ import ( "fmt" "io/ioutil" "log" + "net" "net/http" "os/exec" "sync" @@ -31,13 +32,14 @@ const ( // CheckType is used to create either the CheckMonitor // or the CheckTTL. 
-// Three types are supported: Script, HTTP, and TTL -// Script and HTTP both require Interval +// Four types are supported: Script, HTTP, TCP and TTL +// Script, HTTP and TCP all require Interval // Only one of the types needs to be provided -// TTL or Script/Interval or HTTP/Interval +// TTL or Script/Interval or HTTP/Interval or TCP/Interval type CheckType struct { Script string HTTP string + TCP string Interval time.Duration Timeout time.Duration @@ -51,7 +53,7 @@ type CheckTypes []*CheckType // Valid checks if the CheckType is valid func (c *CheckType) Valid() bool { - return c.IsTTL() || c.IsMonitor() || c.IsHTTP() + return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP() } // IsTTL checks if this is a TTL type @@ -69,6 +71,11 @@ func (c *CheckType) IsHTTP() bool { return c.HTTP != "" && c.Interval != 0 } +// IsTCP checks if this is a TCP type +func (c *CheckType) IsTCP() bool { + return c.TCP != "" && c.Interval != 0 +} + // CheckNotifier interface is used by the CheckMonitor // to notify when a check has a status update. The update // should take care to be idempotent. @@ -402,3 +409,86 @@ func (c *CheckHTTP) check() { c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result) } } + +// CheckTCP is used to periodically make a TCP connection to +// determine the health of a given check. +// The check is passing if the connection succeeds +// The check is critical if the connection returns an error +type CheckTCP struct { + Notify CheckNotifier + CheckID string + TCP string // address:port to connect to + Interval time.Duration + Timeout time.Duration + Logger *log.Logger + + dialer *net.Dialer + stop bool + stopCh chan struct{} + stopLock sync.Mutex // held by Start and Stop while they change stop/stopCh +} + +// Start is used to start a TCP check. 
+// The check runs until stop is called +func (c *CheckTCP) Start() { + c.stopLock.Lock() + defer c.stopLock.Unlock() + + if c.dialer == nil { + // Create the socket dialer with a 10s default timeout + c.dialer = &net.Dialer{DualStack: true, Timeout: 10 * time.Second} + + // An explicit Timeout shorter than the interval wins; otherwise the + // timeout is the interval, capped at the 10s default. This means that a + // check *should* return before the next check begins. + if c.Timeout > 0 && c.Timeout < c.Interval { + c.dialer.Timeout = c.Timeout + } else if c.Interval < 10*time.Second { + c.dialer.Timeout = c.Interval + } + } + + c.stop = false + c.stopCh = make(chan struct{}) + go c.run() +} + +// Stop is used to stop a TCP check. +func (c *CheckTCP) Stop() { + c.stopLock.Lock() + defer c.stopLock.Unlock() + if !c.stop { + c.stop = true + close(c.stopCh) + } +} + +// run is invoked by a goroutine to run until Stop() is called +func (c *CheckTCP) run() { + // Get the randomized initial pause time + initialPauseTime := randomStagger(c.Interval) + c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP) + next := time.After(initialPauseTime) + for { + select { + case <-next: + c.check() + next = time.After(c.Interval) + case <-c.stopCh: + return + } + } +} + +// check is invoked periodically to perform the TCP check +func (c *CheckTCP) check() { + conn, err := c.dialer.Dial(`tcp`, c.TCP) + if err != nil { + c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err) + c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error()) + return + } + conn.Close() + c.Logger.Printf("[DEBUG] agent: check '%v' is passing", c.CheckID) + c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP)) +} diff --git a/command/agent/check_test.go b/command/agent/check_test.go index 3fbd0ff97a..6b9f59df7e 100644 --- a/command/agent/check_test.go +++ b/command/agent/check_test.go @@ -3,6 +3,7 @@ package agent import ( "fmt" "log" + "net" 
"net/http" "net/http/httptest" "os" @@ -321,3 +322,74 @@ func TestCheckHTTP_disablesKeepAlives(t *testing.T) { t.Fatalf("should have disabled keepalives") } } + +func mockTCPServer(network string) net.Listener { + var ( + addr string + ) + + if network == `tcp6` { + addr = `[::1]:0` + } else { + addr = `127.0.0.1:0` + } + + listener, err := net.Listen(network, addr) + if err != nil { + panic(err) + } + + return listener +} + +func expectTCPStatus(t *testing.T, tcp string, status string) { + mock := &MockNotify{ + state: make(map[string]string), + updates: make(map[string]int), + output: make(map[string]string), + } + check := &CheckTCP{ + Notify: mock, + CheckID: "foo", + TCP: tcp, + Interval: 10 * time.Millisecond, + Logger: log.New(os.Stderr, "", log.LstdFlags), + } + check.Start() + defer check.Stop() + + time.Sleep(50 * time.Millisecond) + + // Should have at least 2 updates + if mock.updates["foo"] < 2 { + t.Fatalf("should have 2 updates %v", mock.updates) + } + + if mock.state["foo"] != status { + t.Fatalf("should be %v %v", status, mock.state) + } +} + +func TestCheckTCPCritical(t *testing.T) { + var ( + tcpServer net.Listener + ) + + tcpServer = mockTCPServer(`tcp`) + expectTCPStatus(t, `127.0.0.1:0`, "critical") + tcpServer.Close() +} + +func TestCheckTCPPassing(t *testing.T) { + var ( + tcpServer net.Listener + ) + + tcpServer = mockTCPServer(`tcp`) + expectTCPStatus(t, tcpServer.Addr().String(), "passing") + tcpServer.Close() + + tcpServer = mockTCPServer(`tcp6`) + expectTCPStatus(t, tcpServer.Addr().String(), "passing") + tcpServer.Close() +} From 916ff7e5fa40684b23ff3a6d9f5f12cc38f98d0e Mon Sep 17 00:00:00 2001 From: Peter Fern Date: Mon, 27 Jul 2015 10:53:52 +1000 Subject: [PATCH 2/2] Document `TCP` check type --- .../source/docs/agent/checks.html.markdown | 30 ++++++++++++++++++- .../docs/agent/http/agent.html.markdown | 17 ++++++++--- .../source/docs/agent/services.html.markdown | 15 +++++----- 3 files changed, 50 insertions(+), 12 deletions(-) diff 
--git a/website/source/docs/agent/checks.html.markdown b/website/source/docs/agent/checks.html.markdown index 19789910e7..336b6e99fb 100644 --- a/website/source/docs/agent/checks.html.markdown +++ b/website/source/docs/agent/checks.html.markdown @@ -31,6 +31,20 @@ There are three different kinds of checks: It is possible to configure a custom HTTP check timeout value by specifying the `timeout` field in the check definition. +* TCP + Interval - These checks make a TCP connection attempt every Interval + (e.g. every 30 seconds) to the specified IP/hostname and port. The status of + the service depends on whether the connection attempt is successful (i.e. the + port is currently accepting connections). If the connection is accepted, the + status is `passing`, otherwise the status is `critical`. In the case of a + hostname that resolves to both IPv4 and IPv6 addresses, an attempt will be + made to both addresses, and the first successful connection attempt will + result in a successful check. This type of check should be preferred over a + script that uses `netcat` or another external process to check a simple socket + operation. By default, TCP checks will be configured with a request timeout + equal to the check interval, with a max of 10 seconds. It is possible to + configure a custom TCP check timeout value by specifying the `timeout` field + in the check definition. + * Time to Live (TTL) - These checks retain their last known state for a given TTL. The state of the check must be updated periodically over the HTTP interface. If an external system fails to update the status within a given TTL, the check is @@ -75,6 +89,20 @@ A HTTP check: } ``` +A TCP check: + +```javascript +{ + "check": { + "id": "ssh", + "name": "SSH TCP on port 22", + "tcp": "localhost:22", + "interval": "10s", + "timeout": "1s" + } +} +``` + A TTL check: ```javascript @@ -102,7 +130,7 @@ Checks may also contain a `token` field to provide an ACL token. 
This token is used for any interaction with the catalog for the check, including [anti-entropy syncs](/docs/internals/anti-entropy.html) and deregistration. -Both script and HTTP checks must include an `interval` field. This field is +Script, TCP and HTTP checks must include an `interval` field. This field is parsed by Go's `time` package, and has the following [formatting specification](http://golang.org/pkg/time/#ParseDuration): > A duration string is a possibly signed sequence of decimal numbers, each with diff --git a/website/source/docs/agent/http/agent.html.markdown b/website/source/docs/agent/http/agent.html.markdown index 0b16d5bc3f..4f3670b715 100644 --- a/website/source/docs/agent/http/agent.html.markdown +++ b/website/source/docs/agent/http/agent.html.markdown @@ -224,8 +224,8 @@ The endpoint always returns 200. The register endpoint is used to add a new check to the local agent. There is more documentation on checks [here](/docs/agent/checks.html). -Checks may be of script, HTTP, or TTL type. The agent is responsible for managing -the status of the check and keeping the Catalog in sync. +Checks may be of script, HTTP, TCP, or TTL type. The agent is responsible for +managing the status of the check and keeping the Catalog in sync. The register endpoint expects a JSON request body to be PUT. The request body must look like: @@ -237,13 +237,14 @@ body must look like: "Notes": "Ensure we don't oversubscribe memory", "Script": "/usr/local/bin/check_mem.py", "HTTP": "http://example.com", + "TCP": "example.com:22", "Interval": "10s", "TTL": "15s" } ``` -The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`. -`Script` and `HTTP` also require that `Interval` be set. +The `Name` field is mandatory, as is one of `Script`, `HTTP`, `TCP` or `TTL`. +`Script`, `TCP` and `HTTP` also require that `Interval` be set. If an `ID` is not provided, it is set to `Name`. You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an `ID`. 
@@ -258,6 +259,14 @@ be a URL) every `Interval`. If the response is any `2xx` code, the check is `pas If the response is `429 Too Many Requests`, the check is `warning`. Otherwise, the check is `critical`. +A `TCP` check will perform a TCP connection attempt against the value of `TCP` +(expected to be an IP/hostname and port combination) every `Interval`. If the +connection attempt is successful, the check is `passing`. If the connection +attempt is unsuccessful, the check is `critical`. In the case of a hostname +that resolves to both IPv4 and IPv6 addresses, an attempt will be made to both +addresses, and the first successful connection attempt will result in a +successful check. + If a `TTL` type is used, then the TTL update endpoint must be used periodically to update the state of the check. diff --git a/website/source/docs/agent/services.html.markdown b/website/source/docs/agent/services.html.markdown index 79327201c6..1b589b31b0 100644 --- a/website/source/docs/agent/services.html.markdown +++ b/website/source/docs/agent/services.html.markdown @@ -62,13 +62,14 @@ the DNS interface as well. If a service is failing its health check or a node has any failing system-level check, the DNS interface will omit that node from any service query. -The check must be of the script, HTTP, or TTL type. If it is a script type, `script` -and `interval` must be provided. If it is a HTTP type, `http` and -`interval` must be provided. If it is a TTL type, then only `ttl` must be -provided. The check name is automatically generated as -`service:`. If there are multiple service checks registered, the -ID will be generated as `service::` where `` is an -incrementing number starting from `1`. +The check must be of the script, HTTP, TCP or TTL type. If it is a script type, +`script` and `interval` must be provided. If it is a HTTP type, `http` and +`interval` must be provided. If it is a TCP type, `tcp` and `interval` must be +provided. 
If it is a TTL type, then only `ttl` must be provided. The check name +is automatically generated as `service:`. If there are multiple +service checks registered, the ID will be generated as +`service::` where `` is an incrementing number starting +from `1`. Note: there is more information about [checks here](/docs/agent/checks.html).