mirror of https://github.com/status-im/consul.git
command/agent: Add simple HTTP check type
These checks make an `HTTP GET` request every Interval to the specified URL. The status of the service depends on the HTTP Response Code. `200` is passing, `503` is warning and anything else is failing.
This commit is contained in:
parent
8b320b852e
commit
fb5ba8d97d
|
@ -6,6 +6,7 @@ import (
|
|||
"io"
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
@ -51,11 +52,16 @@ type Agent struct {
|
|||
state localState
|
||||
|
||||
// checkMonitors maps the check ID to an associated monitor
|
||||
// checkTTLs maps the check ID to an associated check TTL
|
||||
// checkLock protects updates to either
|
||||
checkMonitors map[string]*CheckMonitor
|
||||
checkTTLs map[string]*CheckTTL
|
||||
checkLock sync.Mutex
|
||||
|
||||
// checkHTTPs maps the check ID to an associated HTTP check
|
||||
checkHTTPs map[string]*CheckHTTP
|
||||
|
||||
// checkTTLs maps the check ID to an associated check TTL
|
||||
checkTTLs map[string]*CheckTTL
|
||||
|
||||
// checkLock protects updates to the check* maps
|
||||
checkLock sync.Mutex
|
||||
|
||||
// eventCh is used to receive user events
|
||||
eventCh chan serf.UserEvent
|
||||
|
@ -111,6 +117,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
|
|||
logOutput: logOutput,
|
||||
checkMonitors: make(map[string]*CheckMonitor),
|
||||
checkTTLs: make(map[string]*CheckTTL),
|
||||
checkHTTPs: make(map[string]*CheckHTTP),
|
||||
eventCh: make(chan serf.UserEvent, 1024),
|
||||
eventBuf: make([]*UserEvent, 256),
|
||||
shutdownCh: make(chan struct{}),
|
||||
|
@ -382,6 +389,10 @@ func (a *Agent) Shutdown() error {
|
|||
chk.Stop()
|
||||
}
|
||||
|
||||
for _, chk := range a.checkHTTPs {
|
||||
chk.Stop()
|
||||
}
|
||||
|
||||
a.logger.Println("[INFO] agent: requesting shutdown")
|
||||
var err error
|
||||
if a.server != nil {
|
||||
|
@ -661,6 +672,29 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
|
|||
ttl.Start()
|
||||
a.checkTTLs[check.CheckID] = ttl
|
||||
|
||||
} else if chkType.IsHTTP() {
|
||||
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
|
||||
existing.Stop()
|
||||
}
|
||||
if chkType.Interval < MinInterval {
|
||||
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
||||
check.CheckID, MinInterval))
|
||||
chkType.Interval = MinInterval
|
||||
}
|
||||
|
||||
http := &CheckHTTP{
|
||||
Notify: &a.state,
|
||||
CheckID: check.CheckID,
|
||||
HTTP: chkType.HTTP,
|
||||
Interval: chkType.Interval,
|
||||
Logger: a.logger,
|
||||
httpClient: &http.Client{
|
||||
Timeout: chkType.Interval,
|
||||
},
|
||||
}
|
||||
http.Start()
|
||||
a.checkHTTPs[check.CheckID] = http
|
||||
|
||||
} else {
|
||||
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
||||
existing.Stop()
|
||||
|
|
|
@ -5,6 +5,7 @@ import (
|
|||
"github.com/armon/circbuf"
|
||||
"github.com/hashicorp/consul/consul/structs"
|
||||
"log"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
@ -23,10 +24,14 @@ const (
|
|||
)
|
||||
|
||||
// CheckType is used to create either the CheckMonitor
|
||||
// or the CheckTTL. Only one of TTL or Script/Interval
|
||||
// needs to be provided
|
||||
// or the CheckTTL.
|
||||
// Three types are supported: Script, HTTP, and TTL
|
||||
// Script and HTTP both require Interval
|
||||
// Only one of the types needs to be provided
|
||||
// TTL or Script/Interval or HTTP/Interval
|
||||
type CheckType struct {
|
||||
Script string
|
||||
HTTP string
|
||||
Interval time.Duration
|
||||
|
||||
TTL time.Duration
|
||||
|
@ -36,7 +41,7 @@ type CheckType struct {
|
|||
|
||||
// Valid checks if the CheckType is valid
|
||||
func (c *CheckType) Valid() bool {
|
||||
return c.IsTTL() || c.IsMonitor()
|
||||
return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
|
||||
}
|
||||
|
||||
// IsTTL checks if this is a TTL type
|
||||
|
@ -49,6 +54,11 @@ func (c *CheckType) IsMonitor() bool {
|
|||
return c.Script != "" && c.Interval != 0
|
||||
}
|
||||
|
||||
// IsHTTP checks if this is a HTTP type
|
||||
func (c *CheckType) IsHTTP() bool {
|
||||
return c.HTTP != "" && c.Interval != 0
|
||||
}
|
||||
|
||||
// CheckNotifier interface is used by the CheckMonitor
|
||||
// to notify when a check has a status update. The update
|
||||
// should take care to be idempotent.
|
||||
|
@ -244,3 +254,93 @@ type persistedCheck struct {
|
|||
Check *structs.HealthCheck
|
||||
ChkType *CheckType
|
||||
}
|
||||
|
||||
// CheckHTTP is used to periodically make an HTTP request to
|
||||
// determine the health of a given check.
|
||||
// The check is passing if the response code is 200.
|
||||
// The check is warning if the response code is 503.
|
||||
// The check is critical if the response code is anything else
|
||||
// or if the request returns an error
|
||||
type CheckHTTP struct {
|
||||
Notify CheckNotifier
|
||||
CheckID string
|
||||
HTTP string
|
||||
Interval time.Duration
|
||||
Logger *log.Logger
|
||||
|
||||
httpClient *http.Client
|
||||
stop bool
|
||||
stopCh chan struct{}
|
||||
stopLock sync.Mutex
|
||||
}
|
||||
|
||||
// Start is used to start an HTTP check.
|
||||
// The check runs until stop is called
|
||||
func (c *CheckHTTP) Start() {
|
||||
c.stopLock.Lock()
|
||||
defer c.stopLock.Unlock()
|
||||
c.stop = false
|
||||
c.stopCh = make(chan struct{})
|
||||
go c.run()
|
||||
}
|
||||
|
||||
// Stop is used to stop an HTTP check.
|
||||
func (c *CheckHTTP) Stop() {
|
||||
c.stopLock.Lock()
|
||||
defer c.stopLock.Unlock()
|
||||
if !c.stop {
|
||||
c.stop = true
|
||||
close(c.stopCh)
|
||||
}
|
||||
}
|
||||
|
||||
// run is invoked by a goroutine to run until Stop() is called
|
||||
func (c *CheckHTTP) run() {
|
||||
// Get the randomized initial pause time
|
||||
initialPauseTime := randomStagger(c.Interval)
|
||||
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
|
||||
next := time.After(initialPauseTime)
|
||||
for {
|
||||
select {
|
||||
case <-next:
|
||||
c.check()
|
||||
next = time.After(c.Interval)
|
||||
case <-c.stopCh:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check is invoked periodically to perform the HTTP check
|
||||
func (c *CheckHTTP) check() {
|
||||
resp, err := c.httpClient.Get(c.HTTP)
|
||||
if err != nil {
|
||||
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
switch resp.StatusCode {
|
||||
|
||||
// PASSING
|
||||
case http.StatusOK:
|
||||
c.Logger.Printf("[DEBUG] http check '%v' is passing", c.CheckID)
|
||||
result := fmt.Sprintf("%s from %s", resp.Status, c.HTTP)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result)
|
||||
|
||||
// WARNING
|
||||
// 503 Service Unavailable
|
||||
// The server is currently unable to handle the request due to
|
||||
// a temporary overloading or maintenance of the server.
|
||||
// http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
||||
case http.StatusServiceUnavailable:
|
||||
c.Logger.Printf("[WARN] check '%v' is now warning", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, resp.Status)
|
||||
|
||||
// CRITICAL
|
||||
default:
|
||||
c.Logger.Printf("[WARN] check '%v' is now critical", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, resp.Status)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application
|
|||
level if it is associated with a service. A check is defined in a configuration file,
|
||||
or added at runtime over the HTTP interface.
|
||||
|
||||
There are two different kinds of checks:
|
||||
There are three different kinds of checks:
|
||||
|
||||
* Script + Interval - These checks depend on invoking an external application
|
||||
that does the health check and exits with an appropriate exit code, potentially
|
||||
generating some output. A script is paired with an invocation interval (e.g.
|
||||
every 30 seconds). This is similar to the Nagios plugin system.
|
||||
|
||||
* HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g.
|
||||
every 30 seconds) to the specified URL. The status of the service depends on the HTTP Response Code.
|
||||
`200` is passing, `503` is warning and anything else is failing.
|
||||
This type of check should be preferred over a script that, for example, uses `curl`.
|
||||
|
||||
* Time to Live (TTL) - These checks retain their last known state for a given TTL.
|
||||
The state of the check must be updated periodically over the HTTP interface. If an
|
||||
external system fails to update the status within a given TTL, the check is
|
||||
|
@ -43,6 +48,19 @@ A check definition that is a script looks like:
|
|||
}
|
||||
```
|
||||
|
||||
An HTTP based check looks like:
|
||||
|
||||
```javascript
|
||||
{
|
||||
"check": {
|
||||
"id": "api",
|
||||
"name": "HTTP API on port 5000",
|
||||
"http": "http://localhost:5000/health",
|
||||
"interval": "10s"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
A TTL based check is very similar:
|
||||
|
||||
```javascript
|
||||
|
@ -56,7 +74,7 @@ A TTL based check is very similar:
|
|||
}
|
||||
```
|
||||
|
||||
Both types of definitions must include a `name`, and may optionally
|
||||
Each type of definition must include a `name`, and may optionally
|
||||
provide an `id` and `notes` field. The `id` is set to the `name` if not
|
||||
provided. It is required that all checks have a unique ID per node, so if names
|
||||
might conflict then unique IDs should be provided.
|
||||
|
@ -102,6 +120,12 @@ key in your configuration file.
|
|||
},
|
||||
{
|
||||
"id": "chk2",
|
||||
"name": "/health",
|
||||
"http": "http://localhost:5000/health",
|
||||
"interval": "15s"
|
||||
},
|
||||
{
|
||||
"id": "chk3",
|
||||
"name": "cpu",
|
||||
"script": "/bin/check_cpu",
|
||||
"interval": "10s"
|
||||
|
|
|
@ -422,7 +422,7 @@ The endpoint always returns 200.
|
|||
|
||||
The register endpoint is used to add a new check to the local agent.
|
||||
There is more documentation on checks [here](/docs/agent/checks.html).
|
||||
Checks are either a script or TTL type. The agent is responsible for managing
|
||||
Checks are of script, HTTP, or TTL type. The agent is responsible for managing
|
||||
the status of the check and keeping the Catalog in sync.
|
||||
|
||||
The register endpoint expects a JSON request body to be PUT. The request
|
||||
|
@ -434,20 +434,25 @@ body must look like:
|
|||
"Name": "Memory utilization",
|
||||
"Notes": "Ensure we don't oversubscribe memory",
|
||||
"Script": "/usr/local/bin/check_mem.py",
|
||||
"HTTP": "http://example.com",
|
||||
"Interval": "10s",
|
||||
"TTL": "15s"
|
||||
}
|
||||
```
|
||||
|
||||
The `Name` field is mandatory, as is either `Script` and `Interval`
|
||||
or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided.
|
||||
The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
|
||||
`Script` and `HTTP` also require that `Interval` be set.
|
||||
|
||||
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
|
||||
`ID` entries per agent, so it may be necessary to provide an ID. The `Notes`
|
||||
field is not used by Consul, and is meant to be human readable.
|
||||
|
||||
If a `Script` is provided, the check type is a script, and Consul will
|
||||
evaluate the script every `Interval` to update the status. If a `TTL` type
|
||||
is used, then the TTL update APIs must be used to periodically update
|
||||
evaluate the script every `Interval` to update the status.
|
||||
|
||||
An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is `200` the check is passing, if the response is `503` the check is warning, otherwise the check is critical.
|
||||
|
||||
If a `TTL` type is used, then the TTL update APIs must be used to periodically update
|
||||
the state of the check.
|
||||
|
||||
The return code is 200 on success.
|
||||
|
@ -515,6 +520,7 @@ body must look like:
|
|||
"Port": 8000,
|
||||
"Check": {
|
||||
"Script": "/usr/local/bin/check_redis.py",
|
||||
"HTTP": "http://localhost:5000/health",
|
||||
"Interval": "10s",
|
||||
"TTL": "15s"
|
||||
}
|
||||
|
@ -523,8 +529,10 @@ body must look like:
|
|||
|
||||
The `Name` field is mandatory. If an `ID` is not provided, it is set to `Name`.
|
||||
You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID.
|
||||
`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval`
|
||||
or `TTL` should be provided. There is more information about checks [here](/docs/agent/checks.html).
|
||||
`Tags`, `Address`, `Port` and `Check` are optional.
|
||||
If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided.
|
||||
`Script` and `HTTP` also require `Interval`.
|
||||
There is more information about checks [here](/docs/agent/checks.html).
|
||||
The `Address` will default to that of the agent if not provided.
|
||||
|
||||
The created check will be named "service:\<ServiceId\>".
|
||||
|
|
|
@ -55,7 +55,8 @@ a node has any failing system-level check, the DNS interface will omit that
|
|||
node from any service query.
|
||||
|
||||
There is more information about [checks here](/docs/agent/checks.html). The
|
||||
check must be of the script or TTL type. If it is a script type, `script` and
|
||||
check must be of the script, HTTP or TTL type. If it is a script type, `script` and
|
||||
`interval` must be provided. If it is an HTTP type, `http` and
|
||||
`interval` must be provided. If it is a TTL type, then only `ttl` must be
|
||||
provided. The check name is automatically generated as "service:<service-id>".
|
||||
|
||||
|
|
Loading…
Reference in New Issue