Mirror of https://github.com/status-im/consul.git, synced 2025-01-11 06:16:08 +00:00
Merge pull request #2175 from hashicorp/f-hold-rpc
Gracefully handle short lived outages by holding RPC calls
This commit is contained in: commit c6ef1d8ac0
@@ -34,7 +34,7 @@ func TestCatalogRegister(t *testing.T) {
 	var out struct{}

 	err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -198,7 +198,7 @@ func TestCatalogDeregister(t *testing.T) {
 	var out struct{}

 	err := msgpackrpc.CallWithCodec(codec, "Catalog.Deregister", &arg, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -302,7 +302,7 @@ func TestCatalogListNodes(t *testing.T) {
 	}
 	var out structs.IndexedNodes
 	err := msgpackrpc.CallWithCodec(codec, "Catalog.ListNodes", &args, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -621,7 +621,7 @@ func TestCatalogListServices(t *testing.T) {
 	}
 	var out structs.IndexedServices
 	err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -810,7 +810,7 @@ func TestCatalogListServiceNodes(t *testing.T) {
 	}
 	var out structs.IndexedServiceNodes
 	err := msgpackrpc.CallWithCodec(codec, "Catalog.ServiceNodes", &args, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -857,7 +857,7 @@ func TestCatalogListServiceNodes_DistanceSort(t *testing.T) {
 	}
 	var out structs.IndexedServiceNodes
 	err := msgpackrpc.CallWithCodec(codec, "Catalog.ServiceNodes", &args, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -944,7 +944,7 @@ func TestCatalogNodeServices(t *testing.T) {
 	}
 	var out structs.IndexedNodeServices
 	err := msgpackrpc.CallWithCodec(codec, "Catalog.NodeServices", &args, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

@@ -1001,7 +1001,7 @@ func TestCatalogRegister_FailedCase1(t *testing.T) {
 	var out struct{}

 	err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out)
-	if err == nil || err.Error() != "No cluster leader" {
+	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

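The eight test hunks above all make the same change. These tests issue their first catalog RPC before a cluster leader has been elected; previously they asserted a fast "No cluster leader" failure, but with RPC holding the server parks the call until the election completes, so the same call is now expected to succeed. A self-contained toy sketch of the caller-visible difference (all names here are hypothetical, not Consul's API):

package main

import (
	"errors"
	"fmt"
	"time"
)

var errNoLeader = errors.New("No cluster leader")

// register models the old fail-fast behavior: error immediately
// when no leader is known.
func register(leaderKnown func() bool) error {
	if !leaderKnown() {
		return errNoLeader
	}
	return nil
}

// registerHeld models the new behavior: the call is held until a
// leader appears or the hold timeout expires.
func registerHeld(leaderKnown func() bool, hold time.Duration) error {
	deadline := time.Now().Add(hold)
	for !leaderKnown() {
		if time.Now().After(deadline) {
			return errNoLeader
		}
		time.Sleep(50 * time.Millisecond)
	}
	return nil
}

func main() {
	// Pretend the election completes 200ms from now.
	electedAt := time.Now().Add(200 * time.Millisecond)
	leaderKnown := func() bool { return time.Now().After(electedAt) }

	fmt.Println("fail-fast:", register(leaderKnown))               // No cluster leader
	fmt.Println("held:", registerHeld(leaderKnown, 5*time.Second)) // <nil>
}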
@@ -224,6 +224,13 @@ type Config struct {
 	// are willing to apply in one period. After this limit we will issue a
 	// warning and discard the remaining updates.
 	CoordinateUpdateMaxBatches int
+
+	// RPCHoldTimeout is how long an RPC can be "held" before it is errored.
+	// This is used to paper over a loss of leadership by instead holding RPCs,
+	// so that the caller experiences a slow response rather than an error.
+	// This period is meant to be long enough for a leader election to take
+	// place, and a small jitter is applied to avoid a thundering herd.
+	RPCHoldTimeout time.Duration
 }

 // CheckVersion is used to check if the ProtocolVersion is valid
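For code embedding the server, the new knob is set like the existing timing fields. A minimal sketch, assuming the consul package import path of this era; RPCHoldTimeout and DefaultConfig are taken from the hunks, the rest is illustrative:

package main

import (
	"time"

	"github.com/hashicorp/consul/consul"
)

func main() {
	// Start from the defaults (5s hold) and give a slower environment
	// more time to elect a leader before callers see "No cluster leader".
	conf := consul.DefaultConfig()
	conf.RPCHoldTimeout = 10 * time.Second
	_ = conf
}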
@@ -286,6 +293,9 @@ func DefaultConfig() *Config {
 		CoordinateUpdatePeriod:     5 * time.Second,
 		CoordinateUpdateBatchSize:  128,
 		CoordinateUpdateMaxBatches: 5,
+
+		// Hold an RPC for up to 5 seconds by default
+		RPCHoldTimeout: 5 * time.Second,
 	}

 	// Increase our reap interval to 3 days instead of 24h.
@@ -10,6 +10,7 @@ import (
 	"time"

 	"github.com/armon/go-metrics"
+	"github.com/hashicorp/consul/consul/agent"
 	"github.com/hashicorp/consul/consul/state"
 	"github.com/hashicorp/consul/consul/structs"
 	"github.com/hashicorp/consul/lib"
@@ -39,7 +40,8 @@ const (

 	// jitterFraction is a the limit to the amount of jitter we apply
 	// to a user specified MaxQueryTime. We divide the specified time by
-	// the fraction. So 16 == 6.25% limit of jitter
+	// the fraction. So 16 == 6.25% limit of jitter. This same fraction
+	// is applied to the RPCHoldTimeout
 	jitterFraction = 16

 	// Warn if the Raft command is larger than this.
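Worked numbers: with the default 5-second RPCHoldTimeout, a jitterFraction of 16 caps each wait between leader checks at 5s / 16 = 312.5ms. A small sketch of the arithmetic, using a stand-in for lib.RandomStagger (assumed here to return a uniformly random duration below its argument):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// randomStagger stands in for lib.RandomStagger, assumed to return
// a uniformly random duration in [0, intv).
func randomStagger(intv time.Duration) time.Duration {
	return time.Duration(rand.Int63n(int64(intv)))
}

func main() {
	const jitterFraction = 16
	hold := 5 * time.Second

	// Upper bound on one wait between leader checks: 5s / 16 = 312.5ms.
	fmt.Println("max jitter:", hold/jitterFraction)

	// Each retry sleeps a random slice of that bound.
	fmt.Println("sample wait:", randomStagger(hold/jitterFraction))
}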
@@ -189,6 +191,8 @@ func (s *Server) handleConsulConn(conn net.Conn) {
 // forward is used to forward to a remote DC or to forward to the local leader
 // Returns a bool of if forwarding was performed, as well as any error
 func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) {
+	var firstCheck time.Time
+
 	// Handle DC forwarding
 	dc := info.RequestDatacenter()
 	if dc != s.config.Datacenter {
@@ -201,20 +205,51 @@ func (s *Server) forward(method string, info structs.RPCInfo, args interface{},
 		return false, nil
 	}

-	// Handle leader forwarding
-	if !s.IsLeader() {
-		err := s.forwardLeader(method, args, reply)
+CHECK_LEADER:
+	// Find the leader
+	isLeader, remoteServer := s.getLeader()
+
+	// Handle the case we are the leader
+	if isLeader {
+		return false, nil
+	}
+
+	// Handle the case of a known leader
+	if remoteServer != nil {
+		err := s.forwardLeader(remoteServer, method, args, reply)
 		return true, err
 	}
-	return false, nil
+
+	// Gate the request until there is a leader
+	if firstCheck.IsZero() {
+		firstCheck = time.Now()
+	}
+	if time.Now().Sub(firstCheck) < s.config.RPCHoldTimeout {
+		jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
+		select {
+		case <-time.After(jitter):
+			goto CHECK_LEADER
+		case <-s.shutdownCh:
+		}
+	}
+
+	// No leader found and hold time exceeded
+	return true, structs.ErrNoLeader
 }

-// forwardLeader is used to forward an RPC call to the leader, or fail if no leader
-func (s *Server) forwardLeader(method string, args interface{}, reply interface{}) error {
+// getLeader returns if the current node is the leader, and if not
+// then it returns the leader which is potentially nil if the cluster
+// has not yet elected a leader.
+func (s *Server) getLeader() (bool, *agent.Server) {
+	// Check if we are the leader
+	if s.IsLeader() {
+		return true, nil
+	}
+
 	// Get the leader
 	leader := s.raft.Leader()
 	if leader == "" {
-		return structs.ErrNoLeader
+		return false, nil
 	}

 	// Lookup the server
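The CHECK_LEADER loop above is a general gate-until-ready pattern: re-check a condition after a jittered wait, give up once a hold deadline passes, and stay responsive to shutdown. The jitter matters because many RPCs may be held at once; without it they would all wake and hammer the freshly elected leader in lockstep. A self-contained sketch of the same shape (names hypothetical, not Consul's API):

package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

var errNotReady = errors.New("no cluster leader")

// holdUntilReady retries ready() with jittered waits until holdTimeout
// has elapsed since the first check, or shutdownCh closes. This mirrors
// the shape of forward()'s CHECK_LEADER loop.
func holdUntilReady(ready func() bool, holdTimeout time.Duration, shutdownCh <-chan struct{}) error {
	const jitterFraction = 16
	firstCheck := time.Now()
	for {
		if ready() {
			return nil
		}
		if time.Since(firstCheck) >= holdTimeout {
			// Hold time exceeded with no leader found.
			return errNotReady
		}
		// Wait a random slice of holdTimeout/jitterFraction, then re-check
		// (the loop iteration plays the role of goto CHECK_LEADER).
		jitter := time.Duration(rand.Int63n(int64(holdTimeout / jitterFraction)))
		select {
		case <-time.After(jitter):
		case <-s<-shutdownCh:
			return errNotReady
		}
	}
}

func main() {
	electedAt := time.Now().Add(300 * time.Millisecond)
	ready := func() bool { return time.Now().After(electedAt) }
	shutdownCh := make(chan struct{})

	fmt.Println(holdUntilReady(ready, 5*time.Second, shutdownCh)) // <nil>
}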
@@ -222,6 +257,12 @@ func (s *Server) forwardLeader(method string, args interface{}, reply interface{
 	server := s.localConsuls[leader]
 	s.localLock.RUnlock()

+	// Server could be nil
+	return false, server
+}
+
+// forwardLeader is used to forward an RPC call to the leader, or fail if no leader
+func (s *Server) forwardLeader(server *agent.Server, method string, args interface{}, reply interface{}) error {
 	// Handle a missing server
 	if server == nil {
 		return structs.ErrNoLeader
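getLeader deliberately returns a pair with three meaningful states: this node is the leader, a remote leader is known, or no leader is known (nil server). forwardLeader now tolerates the nil case instead of assuming the localConsuls lookup always succeeds, since raft can report a leader address before its metadata is in the local member map. A toy sketch of the three-way routing decision (types and names hypothetical):

package main

import "fmt"

// server stands in for *agent.Server; nil means the leader's RPC
// endpoint is not known locally even if raft has elected one.
type server struct{ addr string }

// routeRPC models forward()'s decision after getLeader: serve locally,
// forward to a known leader, or signal that the caller must hold.
func routeRPC(isLeader bool, leader *server) string {
	switch {
	case isLeader:
		return "serve locally"
	case leader != nil:
		return "forward to " + leader.addr
	default:
		return "hold: no leader known"
	}
}

func main() {
	fmt.Println(routeRPC(true, nil))
	fmt.Println(routeRPC(false, &server{addr: "10.0.0.1:8300"}))
	fmt.Println(routeRPC(false, nil))
}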