Merge pull request #1130 from pdf/check_socket

Add Socket check type
This commit is contained in:
Armon Dadgar 2015-07-27 14:21:24 -07:00
commit 4a9b91f2a2
7 changed files with 250 additions and 16 deletions

View File

@ -68,6 +68,7 @@ type AgentServiceCheck struct {
Timeout string `json:",omitempty"`
TTL string `json:",omitempty"`
HTTP string `json:",omitempty"`
TCP string `json:",omitempty"`
Status string `json:",omitempty"`
}
type AgentServiceChecks []*AgentServiceCheck

View File

@ -75,6 +75,9 @@ type Agent struct {
// checkHTTPs maps the check ID to an associated HTTP check
checkHTTPs map[string]*CheckHTTP
// checkTCPs maps the check ID to an associated TCP check
checkTCPs map[string]*CheckTCP
// checkTTLs maps the check ID to an associated check TTL
checkTTLs map[string]*CheckTTL
@ -145,6 +148,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
checkMonitors: make(map[string]*CheckMonitor),
checkTTLs: make(map[string]*CheckTTL),
checkHTTPs: make(map[string]*CheckHTTP),
checkTCPs: make(map[string]*CheckTCP),
eventCh: make(chan serf.UserEvent, 1024),
eventBuf: make([]*UserEvent, 256),
shutdownCh: make(chan struct{}),
@ -440,6 +444,10 @@ func (a *Agent) Shutdown() error {
chk.Stop()
}
for _, chk := range a.checkTCPs {
chk.Stop()
}
a.logger.Println("[INFO] agent: requesting shutdown")
var err error
if a.server != nil {
@ -801,6 +809,27 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
http.Start()
a.checkHTTPs[check.CheckID] = http
} else if chkType.IsTCP() {
if existing, ok := a.checkTCPs[check.CheckID]; ok {
existing.Stop()
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}
tcp := &CheckTCP{
Notify: &a.state,
CheckID: check.CheckID,
TCP: chkType.TCP,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
}
tcp.Start()
a.checkTCPs[check.CheckID] = tcp
} else {
if existing, ok := a.checkMonitors[check.CheckID]; ok {
existing.Stop()
@ -857,6 +886,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
check.Stop()
delete(a.checkHTTPs, checkID)
}
if check, ok := a.checkTCPs[checkID]; ok {
check.Stop()
delete(a.checkTCPs, checkID)
}
if check, ok := a.checkTTLs[checkID]; ok {
check.Stop()
delete(a.checkTTLs, checkID)

View File

@ -4,6 +4,7 @@ import (
"fmt"
"io/ioutil"
"log"
"net"
"net/http"
"os/exec"
"sync"
@ -31,13 +32,14 @@ const (
// CheckType is used to create either the CheckMonitor
// or the CheckTTL.
// Three types are supported: Script, HTTP, and TTL
// Script and HTTP both require Interval
// Four types are supported: Script, HTTP, TCP and TTL
// Script, HTTP and TCP all require Interval
// Only one of the types needs to be provided
// TTL or Script/Interval or HTTP/Interval
// TTL or Script/Interval or HTTP/Interval or TCP/Interval
type CheckType struct {
Script string
HTTP string
TCP string
Interval time.Duration
Timeout time.Duration
@ -51,7 +53,7 @@ type CheckTypes []*CheckType
// Valid checks if the CheckType is valid
func (c *CheckType) Valid() bool {
return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP()
}
// IsTTL checks if this is a TTL type
@ -69,6 +71,11 @@ func (c *CheckType) IsHTTP() bool {
return c.HTTP != "" && c.Interval != 0
}
// IsTCP reports whether this CheckType describes a TCP check, i.e. it has
// both a TCP address and a non-zero Interval.
func (c *CheckType) IsTCP() bool {
	if c.TCP == "" {
		return false
	}
	return c.Interval != 0
}
// CheckNotifier interface is used by the CheckMonitor
// to notify when a check has a status update. The update
// should take care to be idempotent.
@ -402,3 +409,86 @@ func (c *CheckHTTP) check() {
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result)
}
}
// CheckTCP is used to periodically make a TCP connection to
// determine the health of a given check.
// The check is passing if the connection succeeds.
// The check is critical if the connection returns an error.
type CheckTCP struct {
	// Notify receives the result of every connection attempt.
	Notify CheckNotifier
	// CheckID is the ID reported to Notify with each result.
	CheckID string
	// TCP is the address handed to the dialer for each attempt.
	TCP string
	// Interval is the pause between consecutive connection attempts.
	Interval time.Duration
	// Timeout is an optional dial timeout; Start only applies it when it is
	// positive and shorter than Interval.
	Timeout time.Duration
	// Logger receives debug and warning output from the check.
	Logger *log.Logger

	// dialer is created on the first call to Start and reused afterwards.
	dialer *net.Dialer
	// stop records whether Stop has already closed stopCh.
	stop bool
	// stopCh is closed by Stop to make the run loop return.
	stopCh chan struct{}
	// stopLock guards stop and stopCh across Start and Stop.
	stopLock sync.Mutex
}
// Start is used to start a TCP check.
// The check runs in its own goroutine until Stop is called.
//
// On the first call the socket dialer is created with a bounded timeout:
//   - Timeout, when it is positive and shorter than Interval;
//   - otherwise Interval, when Interval is shorter than 10s;
//   - otherwise 10s.
//
// Previously the last case left the dialer timeout at zero, so checks with an
// interval of 10s or more could block on a dial for as long as the operating
// system allowed instead of the intended 10s maximum.
func (c *CheckTCP) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.dialer == nil {
		// For long (>=10s) interval checks the socket timeout is 10s, otherwise
		// the timeout is the interval. This means that a check *should* return
		// before the next check begins.
		timeout := 10 * time.Second
		if c.Timeout > 0 && c.Timeout < c.Interval {
			timeout = c.Timeout
		} else if c.Interval < timeout {
			timeout = c.Interval
		}

		// Create the socket dialer
		c.dialer = &net.Dialer{DualStack: true, Timeout: timeout}
	}

	c.stop = false
	c.stopCh = make(chan struct{})
	go c.run()
}
// Stop signals a started TCP check to finish by closing its stop channel.
// Once the check has been marked as stopped, further calls do nothing.
func (c *CheckTCP) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}
// run is the body of the check goroutine. It waits for the initial stagger
// computed from the interval, then performs a check and re-arms the timer for
// one Interval after each check, until the stop channel is closed.
func (c *CheckTCP) run() {
	// Delay the first attempt by the pause randomStagger derives from Interval.
	firstPause := randomStagger(c.Interval)
	c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", firstPause, c.TCP)

	tick := time.After(firstPause)
	for {
		select {
		case <-c.stopCh:
			return
		case <-tick:
			c.check()
			tick = time.After(c.Interval)
		}
	}
}
// check performs a single TCP connection attempt against c.TCP and reports
// the outcome through Notify: passing when the dial succeeds, critical (with
// the dial error as output) when it fails.
func (c *CheckTCP) check() {
	conn, err := c.dialer.Dial("tcp", c.TCP)
	if err == nil {
		// Only reachability matters; the connection is closed right away.
		conn.Close()
		c.Logger.Printf("[DEBUG] agent: check '%v' is passing", c.CheckID)
		output := fmt.Sprintf("TCP connect %s: Success", c.TCP)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, output)
		return
	}

	c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
	c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
}

View File

@ -3,6 +3,7 @@ package agent
import (
"fmt"
"log"
"net"
"net/http"
"net/http/httptest"
"os"
@ -321,3 +322,74 @@ func TestCheckHTTP_disablesKeepAlives(t *testing.T) {
t.Fatalf("should have disabled keepalives")
}
}
// mockTCPServer opens a listener for the given network on port 0 of the
// loopback address ([::1] for "tcp6", 127.0.0.1 for anything else) and
// returns it. It panics if the listener cannot be created.
func mockTCPServer(network string) net.Listener {
	addr := `127.0.0.1:0`
	if network == `tcp6` {
		addr = `[::1]:0`
	}

	ln, err := net.Listen(network, addr)
	if err != nil {
		panic(err)
	}
	return ln
}
// expectTCPStatus runs a CheckTCP with a 10ms interval against the address
// tcp for 50ms, then fails the test unless the mock notifier saw at least two
// updates for the check and its last recorded state equals status.
func expectTCPStatus(t *testing.T, tcp string, status string) {
	notifier := &MockNotify{
		state:   map[string]string{},
		updates: map[string]int{},
		output:  map[string]string{},
	}
	logger := log.New(os.Stderr, "", log.LstdFlags)

	check := &CheckTCP{
		Notify:   notifier,
		CheckID:  "foo",
		TCP:      tcp,
		Interval: 10 * time.Millisecond,
		Logger:   logger,
	}
	check.Start()
	defer check.Stop()

	// Leave room for several intervals to elapse.
	time.Sleep(50 * time.Millisecond)

	// Should have at least 2 updates
	if count := notifier.updates["foo"]; count < 2 {
		t.Fatalf("should have 2 updates %v", notifier.updates)
	}

	if got := notifier.state["foo"]; got != status {
		t.Fatalf("should be %v %v", status, notifier.state)
	}
}
func TestCheckTCPCritical(t *testing.T) {
var (
tcpServer net.Listener
)
tcpServer = mockTCPServer(`tcp`)
expectTCPStatus(t, `127.0.0.1:0`, "critical")
tcpServer.Close()
}
func TestCheckTCPPassing(t *testing.T) {
var (
tcpServer net.Listener
)
tcpServer = mockTCPServer(`tcp`)
expectTCPStatus(t, tcpServer.Addr().String(), "passing")
tcpServer.Close()
tcpServer = mockTCPServer(`tcp6`)
expectTCPStatus(t, tcpServer.Addr().String(), "passing")
tcpServer.Close()
}

View File

@ -31,6 +31,20 @@ There are three different kinds of checks:
It is possible to configure a custom HTTP check timeout value by specifying
the `timeout` field in the check definition.
* TCP + Interval - These checks make a TCP connection attempt every Interval
(e.g. every 30 seconds) to the specified IP/hostname and port. The status of
the service depends on whether the connection attempt is successful (i.e. the
port is currently accepting connections). If the connection is accepted, the
status is `passing`, otherwise the status is `critical`. In the case of a
hostname that resolves to both IPv4 and IPv6 addresses, an attempt will be
made to both addresses, and the first successful connection attempt will
result in a successful check. This type of check should be preferred over a
script that uses `netcat` or another external process to check a simple socket
operation. By default, TCP checks will be configured with a request timeout
equal to the check interval, with a max of 10 seconds. It is possible to
configure a custom TCP check timeout value by specifying the `timeout` field
in the check definition.
* <a name="TTL"></a>Time to Live (TTL) - These checks retain their last known state for a given TTL.
The state of the check must be updated periodically over the HTTP interface. If an
external system fails to update the status within a given TTL, the check is
@ -75,6 +89,20 @@ A HTTP check:
}
```
A TCP check:
```javascript
{
"check": {
"id": "ssh",
"name": "SSH TCP on port 22",
"tcp": "localhost:22",
"interval": "10s",
"timeout": "1s"
}
}
```
A TTL check:
```javascript
@ -102,7 +130,7 @@ Checks may also contain a `token` field to provide an ACL token. This token is
used for any interaction with the catalog for the check, including
[anti-entropy syncs](/docs/internals/anti-entropy.html) and deregistration.
Both script and HTTP checks must include an `interval` field. This field is
Script, TCP and HTTP checks must include an `interval` field. This field is
parsed by Go's `time` package, and has the following
[formatting specification](http://golang.org/pkg/time/#ParseDuration):
> A duration string is a possibly signed sequence of decimal numbers, each with

View File

@ -224,8 +224,8 @@ The endpoint always returns 200.
The register endpoint is used to add a new check to the local agent.
There is more documentation on checks [here](/docs/agent/checks.html).
Checks may be of script, HTTP, or TTL type. The agent is responsible for managing
the status of the check and keeping the Catalog in sync.
Checks may be of script, HTTP, TCP, or TTL type. The agent is responsible for
managing the status of the check and keeping the Catalog in sync.
The register endpoint expects a JSON request body to be PUT. The request
body must look like:
@ -237,13 +237,14 @@ body must look like:
"Notes": "Ensure we don't oversubscribe memory",
"Script": "/usr/local/bin/check_mem.py",
"HTTP": "http://example.com",
"TCP": "example.com:22",
"Interval": "10s",
"TTL": "15s"
}
```
The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
`Script` and `HTTP` also require that `Interval` be set.
The `Name` field is mandatory, as is one of `Script`, `HTTP`, `TCP` or `TTL`.
`Script`, `TCP` and `HTTP` also require that `Interval` be set.
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
`ID` entries per agent, so it may be necessary to provide an `ID`.
@ -258,6 +259,14 @@ be a URL) every `Interval`. If the response is any `2xx` code, the check is `pas
If the response is `429 Too Many Requests`, the check is `warning`. Otherwise, the check
is `critical`.
A `TCP` check will perform a TCP connection attempt against the value of `TCP`
(expected to be an IP/hostname and port combination) every `Interval`. If the
connection attempt is successful, the check is `passing`. If the connection
attempt is unsuccessful, the check is `critical`. In the case of a hostname
that resolves to both IPv4 and IPv6 addresses, an attempt will be made to both
addresses, and the first successful connection attempt will result in a
successful check.
If a `TTL` type is used, then the TTL update endpoint must be used periodically to update
the state of the check.

View File

@ -62,13 +62,14 @@ the DNS interface as well. If a service is failing its health check or a
node has any failing system-level check, the DNS interface will omit that
node from any service query.
The check must be of the script, HTTP, or TTL type. If it is a script type, `script`
and `interval` must be provided. If it is a HTTP type, `http` and
`interval` must be provided. If it is a TTL type, then only `ttl` must be
provided. The check name is automatically generated as
`service:<service-id>`. If there are multiple service checks registered, the
ID will be generated as `service:<service-id>:<num>` where `<num>` is an
incrementing number starting from `1`.
The check must be of the script, HTTP, TCP or TTL type. If it is a script type,
`script` and `interval` must be provided. If it is an HTTP type, `http` and
`interval` must be provided. If it is a TCP type, `tcp` and `interval` must be
provided. If it is a TTL type, then only `ttl` must be provided. The check name
is automatically generated as `service:<service-id>`. If there are multiple
service checks registered, the ID will be generated as
`service:<service-id>:<num>` where `<num>` is an incrementing number starting
from `1`.
Note: there is more information about [checks here](/docs/agent/checks.html).