mirror of https://github.com/status-im/consul.git
command/agent: Add simple HTTP check type
These checks make an `HTTP GET` request every Interval to the specified URL. The status of the service depends on the HTTP Response Code. `200` is passing, `503` is warning and anything else is failing.
This commit is contained in:
parent
8b320b852e
commit
fb5ba8d97d
|
@ -6,6 +6,7 @@ import (
|
||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"net"
|
"net"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
@ -51,11 +52,16 @@ type Agent struct {
|
||||||
state localState
|
state localState
|
||||||
|
|
||||||
// checkMonitors maps the check ID to an associated monitor
|
// checkMonitors maps the check ID to an associated monitor
|
||||||
// checkTTLs maps the check ID to an associated check TTL
|
|
||||||
// checkLock protects updates to either
|
|
||||||
checkMonitors map[string]*CheckMonitor
|
checkMonitors map[string]*CheckMonitor
|
||||||
checkTTLs map[string]*CheckTTL
|
|
||||||
checkLock sync.Mutex
|
// checkHTTPs maps the check ID to an associated HTTP check
|
||||||
|
checkHTTPs map[string]*CheckHTTP
|
||||||
|
|
||||||
|
// checkTTLs maps the check ID to an associated check TTL
|
||||||
|
checkTTLs map[string]*CheckTTL
|
||||||
|
|
||||||
|
// checkLock protects updates to the check* maps
|
||||||
|
checkLock sync.Mutex
|
||||||
|
|
||||||
// eventCh is used to receive user events
|
// eventCh is used to receive user events
|
||||||
eventCh chan serf.UserEvent
|
eventCh chan serf.UserEvent
|
||||||
|
@ -111,6 +117,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
|
||||||
logOutput: logOutput,
|
logOutput: logOutput,
|
||||||
checkMonitors: make(map[string]*CheckMonitor),
|
checkMonitors: make(map[string]*CheckMonitor),
|
||||||
checkTTLs: make(map[string]*CheckTTL),
|
checkTTLs: make(map[string]*CheckTTL),
|
||||||
|
checkHTTPs: make(map[string]*CheckHTTP),
|
||||||
eventCh: make(chan serf.UserEvent, 1024),
|
eventCh: make(chan serf.UserEvent, 1024),
|
||||||
eventBuf: make([]*UserEvent, 256),
|
eventBuf: make([]*UserEvent, 256),
|
||||||
shutdownCh: make(chan struct{}),
|
shutdownCh: make(chan struct{}),
|
||||||
|
@ -382,6 +389,10 @@ func (a *Agent) Shutdown() error {
|
||||||
chk.Stop()
|
chk.Stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, chk := range a.checkHTTPs {
|
||||||
|
chk.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
a.logger.Println("[INFO] agent: requesting shutdown")
|
a.logger.Println("[INFO] agent: requesting shutdown")
|
||||||
var err error
|
var err error
|
||||||
if a.server != nil {
|
if a.server != nil {
|
||||||
|
@ -661,6 +672,29 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
|
||||||
ttl.Start()
|
ttl.Start()
|
||||||
a.checkTTLs[check.CheckID] = ttl
|
a.checkTTLs[check.CheckID] = ttl
|
||||||
|
|
||||||
|
} else if chkType.IsHTTP() {
|
||||||
|
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
|
||||||
|
existing.Stop()
|
||||||
|
}
|
||||||
|
if chkType.Interval < MinInterval {
|
||||||
|
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
||||||
|
check.CheckID, MinInterval))
|
||||||
|
chkType.Interval = MinInterval
|
||||||
|
}
|
||||||
|
|
||||||
|
http := &CheckHTTP{
|
||||||
|
Notify: &a.state,
|
||||||
|
CheckID: check.CheckID,
|
||||||
|
HTTP: chkType.HTTP,
|
||||||
|
Interval: chkType.Interval,
|
||||||
|
Logger: a.logger,
|
||||||
|
httpClient: &http.Client{
|
||||||
|
Timeout: chkType.Interval,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
http.Start()
|
||||||
|
a.checkHTTPs[check.CheckID] = http
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
||||||
existing.Stop()
|
existing.Stop()
|
||||||
|
|
|
@ -5,6 +5,7 @@ import (
|
||||||
"github.com/armon/circbuf"
|
"github.com/armon/circbuf"
|
||||||
"github.com/hashicorp/consul/consul/structs"
|
"github.com/hashicorp/consul/consul/structs"
|
||||||
"log"
|
"log"
|
||||||
|
"net/http"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
@ -23,10 +24,14 @@ const (
|
||||||
)
|
)
|
||||||
|
|
||||||
// CheckType is used to create either the CheckMonitor
|
// CheckType is used to create either the CheckMonitor
|
||||||
// or the CheckTTL. Only one of TTL or Script/Interval
|
// or the CheckTTL.
|
||||||
// needs to be provided
|
// Three types are supported: Script, HTTP, and TTL
|
||||||
|
// Script and HTTP both require Interval
|
||||||
|
// Only one of the types needs to be provided
|
||||||
|
// TTL or Script/Interval or HTTP/Interval
|
||||||
type CheckType struct {
|
type CheckType struct {
|
||||||
Script string
|
Script string
|
||||||
|
HTTP string
|
||||||
Interval time.Duration
|
Interval time.Duration
|
||||||
|
|
||||||
TTL time.Duration
|
TTL time.Duration
|
||||||
|
@ -36,7 +41,7 @@ type CheckType struct {
|
||||||
|
|
||||||
// Valid checks if the CheckType is valid
|
// Valid checks if the CheckType is valid
|
||||||
func (c *CheckType) Valid() bool {
|
func (c *CheckType) Valid() bool {
|
||||||
return c.IsTTL() || c.IsMonitor()
|
return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsTTL checks if this is a TTL type
|
// IsTTL checks if this is a TTL type
|
||||||
|
@ -49,6 +54,11 @@ func (c *CheckType) IsMonitor() bool {
|
||||||
return c.Script != "" && c.Interval != 0
|
return c.Script != "" && c.Interval != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IsHTTP checks if this is a HTTP type
|
||||||
|
func (c *CheckType) IsHTTP() bool {
|
||||||
|
return c.HTTP != "" && c.Interval != 0
|
||||||
|
}
|
||||||
|
|
||||||
// CheckNotifier interface is used by the CheckMonitor
|
// CheckNotifier interface is used by the CheckMonitor
|
||||||
// to notify when a check has a status update. The update
|
// to notify when a check has a status update. The update
|
||||||
// should take care to be idempotent.
|
// should take care to be idempotent.
|
||||||
|
@ -244,3 +254,93 @@ type persistedCheck struct {
|
||||||
Check *structs.HealthCheck
|
Check *structs.HealthCheck
|
||||||
ChkType *CheckType
|
ChkType *CheckType
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CheckHTTP is used to periodically make an HTTP request to
|
||||||
|
// determine the health of a given check.
|
||||||
|
// The check is passing if the response code is 200.
|
||||||
|
// The check is warning if the response code is 503.
|
||||||
|
// The check is critical if the response code is anything else
|
||||||
|
// or if the request returns an error
|
||||||
|
type CheckHTTP struct {
|
||||||
|
Notify CheckNotifier
|
||||||
|
CheckID string
|
||||||
|
HTTP string
|
||||||
|
Interval time.Duration
|
||||||
|
Logger *log.Logger
|
||||||
|
|
||||||
|
httpClient *http.Client
|
||||||
|
stop bool
|
||||||
|
stopCh chan struct{}
|
||||||
|
stopLock sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start is used to start an HTTP check.
|
||||||
|
// The check runs until stop is called
|
||||||
|
func (c *CheckHTTP) Start() {
|
||||||
|
c.stopLock.Lock()
|
||||||
|
defer c.stopLock.Unlock()
|
||||||
|
c.stop = false
|
||||||
|
c.stopCh = make(chan struct{})
|
||||||
|
go c.run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop is used to stop an HTTP check.
|
||||||
|
func (c *CheckHTTP) Stop() {
|
||||||
|
c.stopLock.Lock()
|
||||||
|
defer c.stopLock.Unlock()
|
||||||
|
if !c.stop {
|
||||||
|
c.stop = true
|
||||||
|
close(c.stopCh)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// run is invoked by a goroutine to run until Stop() is called
|
||||||
|
func (c *CheckHTTP) run() {
|
||||||
|
// Get the randomized initial pause time
|
||||||
|
initialPauseTime := randomStagger(c.Interval)
|
||||||
|
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
|
||||||
|
next := time.After(initialPauseTime)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-next:
|
||||||
|
c.check()
|
||||||
|
next = time.After(c.Interval)
|
||||||
|
case <-c.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check is invoked periodically to perform the HTTP check
|
||||||
|
func (c *CheckHTTP) check() {
|
||||||
|
resp, err := c.httpClient.Get(c.HTTP)
|
||||||
|
if err != nil {
|
||||||
|
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
||||||
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resp.Body.Close()
|
||||||
|
|
||||||
|
switch resp.StatusCode {
|
||||||
|
|
||||||
|
// PASSING
|
||||||
|
case http.StatusOK:
|
||||||
|
c.Logger.Printf("[DEBUG] http check '%v' is passing", c.CheckID)
|
||||||
|
result := fmt.Sprintf("%s from %s", resp.Status, c.HTTP)
|
||||||
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result)
|
||||||
|
|
||||||
|
// WARNING
|
||||||
|
// 503 Service Unavailable
|
||||||
|
// The server is currently unable to handle the request due to
|
||||||
|
// a temporary overloading or maintenance of the server.
|
||||||
|
// http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
||||||
|
case http.StatusServiceUnavailable:
|
||||||
|
c.Logger.Printf("[WARN] check '%v' is now warning", c.CheckID)
|
||||||
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, resp.Status)
|
||||||
|
|
||||||
|
// CRITICAL
|
||||||
|
default:
|
||||||
|
c.Logger.Printf("[WARN] check '%v' is now critical", c.CheckID)
|
||||||
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, resp.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application
|
||||||
level if it is associated with a service. A check is defined in a configuration file,
|
level if it is associated with a service. A check is defined in a configuration file,
|
||||||
or added at runtime over the HTTP interface.
|
or added at runtime over the HTTP interface.
|
||||||
|
|
||||||
There are two different kinds of checks:
|
There are three different kinds of checks:
|
||||||
|
|
||||||
* Script + Interval - These checks depend on invoking an external application
|
* Script + Interval - These checks depend on invoking an external application
|
||||||
that does the health check and exits with an appropriate exit code, potentially
|
that does the health check and exits with an appropriate exit code, potentially
|
||||||
generating some output. A script is paired with an invocation interval (e.g.
|
generating some output. A script is paired with an invocation interval (e.g.
|
||||||
every 30 seconds). This is similar to the Nagios plugin system.
|
every 30 seconds). This is similar to the Nagios plugin system.
|
||||||
|
|
||||||
|
* HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g.
|
||||||
|
every 30 seconds) to the specified URL. The status of the service depends on the HTTP Response Code.
|
||||||
|
`200` is passing, `503` is warning and anything else is failing.
|
||||||
|
This type of check should be preferred over a script that, for example, uses `curl`.
|
||||||
|
|
||||||
* Time to Live (TTL) - These checks retain their last known state for a given TTL.
|
* Time to Live (TTL) - These checks retain their last known state for a given TTL.
|
||||||
The state of the check must be updated periodically over the HTTP interface. If an
|
The state of the check must be updated periodically over the HTTP interface. If an
|
||||||
external system fails to update the status within a given TTL, the check is
|
external system fails to update the status within a given TTL, the check is
|
||||||
|
@ -43,6 +48,19 @@ A check definition that is a script looks like:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
An HTTP based check looks like:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
{
|
||||||
|
"check": {
|
||||||
|
"id": "api",
|
||||||
|
"name": "HTTP API on port 5000",
|
||||||
|
"http": "http://localhost:5000/health",
|
||||||
|
"interval": "10s"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
A TTL based check is very similar:
|
A TTL based check is very similar:
|
||||||
|
|
||||||
```javascript
|
```javascript
|
||||||
|
@ -56,7 +74,7 @@ A TTL based check is very similar:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Both types of definitions must include a `name`, and may optionally
|
Each type of definition must include a `name`, and may optionally
|
||||||
provide an `id` and `notes` field. The `id` is set to the `name` if not
|
provide an `id` and `notes` field. The `id` is set to the `name` if not
|
||||||
provided. It is required that all checks have a unique ID per node, so if names
|
provided. It is required that all checks have a unique ID per node, so if names
|
||||||
might conflict then unique IDs should be provided.
|
might conflict then unique IDs should be provided.
|
||||||
|
@ -102,6 +120,12 @@ key in your configuration file.
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "chk2",
|
"id": "chk2",
|
||||||
|
"name": "/health",
|
||||||
|
"http": "http://localhost:5000/health",
|
||||||
|
"interval": "15s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chk3",
|
||||||
"name": "cpu",
|
"name": "cpu",
|
||||||
"script": "/bin/check_cpu",
|
"script": "/bin/check_cpu",
|
||||||
"interval": "10s"
|
"interval": "10s"
|
||||||
|
|
|
@ -422,7 +422,7 @@ The endpoint always returns 200.
|
||||||
|
|
||||||
The register endpoint is used to add a new check to the local agent.
|
The register endpoint is used to add a new check to the local agent.
|
||||||
There is more documentation on checks [here](/docs/agent/checks.html).
|
There is more documentation on checks [here](/docs/agent/checks.html).
|
||||||
Checks are either a script or TTL type. The agent is responsible for managing
|
Checks are of script, HTTP, or TTL type. The agent is responsible for managing
|
||||||
the status of the check and keeping the Catalog in sync.
|
the status of the check and keeping the Catalog in sync.
|
||||||
|
|
||||||
The register endpoint expects a JSON request body to be PUT. The request
|
The register endpoint expects a JSON request body to be PUT. The request
|
||||||
|
@ -434,20 +434,25 @@ body must look like:
|
||||||
"Name": "Memory utilization",
|
"Name": "Memory utilization",
|
||||||
"Notes": "Ensure we don't oversubscribe memory",
|
"Notes": "Ensure we don't oversubscribe memory",
|
||||||
"Script": "/usr/local/bin/check_mem.py",
|
"Script": "/usr/local/bin/check_mem.py",
|
||||||
|
"HTTP": "http://example.com",
|
||||||
"Interval": "10s",
|
"Interval": "10s",
|
||||||
"TTL": "15s"
|
"TTL": "15s"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `Name` field is mandatory, as is either `Script` and `Interval`
|
The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
|
||||||
or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided.
|
`Script` and `HTTP` also require that `Interval` be set.
|
||||||
|
|
||||||
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
|
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
|
||||||
`ID` entries per agent, so it may be necessary to provide an ID. The `Notes`
|
`ID` entries per agent, so it may be necessary to provide an ID. The `Notes`
|
||||||
field is not used by Consul, and is meant to be human readable.
|
field is not used by Consul, and is meant to be human readable.
|
||||||
|
|
||||||
If a `Script` is provided, the check type is a script, and Consul will
|
If a `Script` is provided, the check type is a script, and Consul will
|
||||||
evaluate the script every `Interval` to update the status. If a `TTL` type
|
evaluate the script every `Interval` to update the status.
|
||||||
is used, then the TTL update APIs must be used to periodically update
|
|
||||||
|
An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is `200`, the check is passing; if the response is `503`, the check is warning; otherwise, the check is critical.
|
||||||
|
|
||||||
|
If a `TTL` type is used, then the TTL update APIs must be used to periodically update
|
||||||
the state of the check.
|
the state of the check.
|
||||||
|
|
||||||
The return code is 200 on success.
|
The return code is 200 on success.
|
||||||
|
@ -515,6 +520,7 @@ body must look like:
|
||||||
"Port": 8000,
|
"Port": 8000,
|
||||||
"Check": {
|
"Check": {
|
||||||
"Script": "/usr/local/bin/check_redis.py",
|
"Script": "/usr/local/bin/check_redis.py",
|
||||||
|
"HTTP": "http://localhost:5000/health",
|
||||||
"Interval": "10s",
|
"Interval": "10s",
|
||||||
"TTL": "15s"
|
"TTL": "15s"
|
||||||
}
|
}
|
||||||
|
@ -523,8 +529,10 @@ body must look like:
|
||||||
|
|
||||||
The `Name` field is mandatory. If an `ID` is not provided, it is set to `Name`.
|
The `Name` field is mandatory. If an `ID` is not provided, it is set to `Name`.
|
||||||
You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID.
|
You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID.
|
||||||
`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval`
|
`Tags`, `Address`, `Port` and `Check` are optional.
|
||||||
or `TTL` should be provided. There is more information about checks [here](/docs/agent/checks.html).
|
If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided.
|
||||||
|
`Script` and `HTTP` also require `Interval`.
|
||||||
|
There is more information about checks [here](/docs/agent/checks.html).
|
||||||
The `Address` will default to that of the agent if not provided.
|
The `Address` will default to that of the agent if not provided.
|
||||||
|
|
||||||
The created check will be named "service:\<ServiceId\>".
|
The created check will be named "service:\<ServiceId\>".
|
||||||
|
|
|
@ -55,7 +55,8 @@ a node has any failing system-level check, the DNS interface will omit that
|
||||||
node from any service query.
|
node from any service query.
|
||||||
|
|
||||||
There is more information about [checks here](/docs/agent/checks.html). The
|
There is more information about [checks here](/docs/agent/checks.html). The
|
||||||
check must be of the script or TTL type. If it is a script type, `script` and
|
check must be of the script, HTTP or TTL type. If it is a script type, `script` and
|
||||||
|
`interval` must be provided. If it is an HTTP type, `http` and
|
||||||
`interval` must be provided. If it is a TTL type, then only `ttl` must be
|
`interval` must be provided. If it is a TTL type, then only `ttl` must be
|
||||||
provided. The check name is automatically generated as "service:<service-id>".
|
provided. The check name is automatically generated as "service:<service-id>".
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue