mirror of https://github.com/status-im/consul.git

RPC Timeout/Retries account for blocking requests (#8978)

This commit is contained in:
parent b130f355ee
commit f785c5b332
@@ -0,0 +1,4 @@
+```release-note:bug
+use the MaxQueryTime instead of RPCHoldTimeout for blocking RPC queries
+[[GH-8978](https://github.com/hashicorp/consul/pull/8978)].
+```
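The shape of the fix: instead of measuring every retried RPC against the flat RPCHoldTimeout, the retry paths now ask the request itself whether its time budget is spent, through the HasTimedOut method added to the RPCInfo interface further down in this diff. The following is a minimal, self-contained sketch of that idea with illustrative names (it is not Consul's code, and it omits the small jitter allowance the real QueryOptions.HasTimedOut adds):

```go
package main

import (
	"fmt"
	"time"
)

// rpcInfo is a stand-in for the RPCInfo interface in this commit: the request
// itself reports whether its retry budget is exhausted.
type rpcInfo interface {
	HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool
}

// writeRequest models a plain (non-blocking) RPC: only the hold timeout applies.
type writeRequest struct{}

func (writeRequest) HasTimedOut(start time.Time, rpcHoldTimeout, _, _ time.Duration) bool {
	return time.Since(start) > rpcHoldTimeout
}

// blockingQuery models a blocking read: its budget is its (bounded) wait time
// plus the hold timeout, mirroring the QueryOptions.HasTimedOut rules below.
type blockingQuery struct{ maxQueryTime time.Duration }

func (q blockingQuery) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
	wait := q.maxQueryTime
	if wait > maxQueryTime {
		wait = maxQueryTime
	} else if wait <= 0 {
		wait = defaultQueryTime
	}
	return time.Since(start) > wait+rpcHoldTimeout
}

func main() {
	start := time.Now().Add(-10 * time.Second) // pretend the request started 10s ago
	hold, max, def := 7*time.Second, 10*time.Minute, 5*time.Minute

	var w rpcInfo = writeRequest{}
	var b rpcInfo = blockingQuery{maxQueryTime: 30 * time.Second}

	fmt.Println(w.HasTimedOut(start, hold, max, def)) // true: 10s exceeds the 7s hold timeout
	fmt.Println(b.HasTimedOut(start, hold, max, def)) // false: 10s is within 30s + 7s
}
```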
@@ -1665,7 +1665,7 @@ func TestCatalog_ListServices_Stale(t *testing.T) {
     defer codec.Close()
 
     // Run the query, do not wait for leader, never any contact with leader, should fail
-    if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err == nil || err.Error() != structs.ErrNoLeader.Error() {
+    if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err.Error() != structs.ErrNoLeader.Error() {
         t.Fatalf("expected %v but got err: %v and %v", structs.ErrNoLeader, err, out)
     }
 
@@ -1677,6 +1677,7 @@ func TestCatalog_ListServices_Stale(t *testing.T) {
     testrpc.WaitForLeader(t, s2.RPC, "dc1")
 
     retry.Run(t, func(r *retry.R) {
+        out = structs.IndexedServices{}
         if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err != nil {
             r.Fatalf("err: %v", err)
         }
@@ -1696,24 +1697,25 @@ func TestCatalog_ListServices_Stale(t *testing.T) {
 
     args.AllowStale = false
     // Since the leader is now down, non-stale query should fail now
-    if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err == nil || err.Error() != structs.ErrNoLeader.Error() {
+    out = structs.IndexedServices{}
+    if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err.Error() != structs.ErrLeaderNotTracked.Error() {
         t.Fatalf("expected %v but got err: %v and %v", structs.ErrNoLeader, err, out)
     }
+    if out.KnownLeader {
+        t.Fatalf("should not have a leader anymore: %#v", out)
+    }
 
     // With stale, request should still work
     args.AllowStale = true
+    retry.Run(t, func(r *retry.R) {
+        out = structs.IndexedServices{}
         if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err != nil {
-            t.Fatalf("err: %v", err)
+            r.Fatalf("err: %v", err)
         }
-
-    // Should find old service
-    if len(out.Services) != 1 {
-        t.Fatalf("bad: %#v", out)
-    }
-
-    if out.KnownLeader {
-        t.Fatalf("should not have a leader anymore: %#v", out)
+        if out.KnownLeader || len(out.Services) != 1 {
+            r.Fatalf("got %t nodes want %d", out.KnownLeader, len(out.Services))
         }
+    })
 }
 
 func TestCatalog_ListServiceNodes(t *testing.T) {
@@ -285,19 +285,17 @@ TRY:
 
     // Use the zero value for RPCInfo if the request doesn't implement RPCInfo
     info, _ := args.(structs.RPCInfo)
-    if retry := canRetry(info, rpcErr); !retry {
+    if retry := canRetry(info, rpcErr, firstCheck, c.config); !retry {
         return rpcErr
     }
 
     // We can wait a bit and retry!
-    if time.Since(firstCheck) < c.config.RPCHoldTimeout {
-        jitter := lib.RandomStagger(c.config.RPCHoldTimeout / jitterFraction)
+    jitter := lib.RandomStagger(c.config.RPCHoldTimeout / structs.JitterFraction)
     select {
     case <-time.After(jitter):
         goto TRY
     case <-c.shutdownCh:
     }
-    }
     return rpcErr
 }
 
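For reference, the TRY loop this hunk edits has the following overall client-side shape: issue the RPC, let canRetry decide (now given the elapsed time and config), then wait a jittered interval before going back to TRY. Below is a rough, self-contained sketch of the same control flow with a plain for loop instead of goto; the names and the jitter helper are illustrative, not Consul's actual code:

```go
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// rpcWithRetry sketches the retry pattern: attempt the call, ask canRetry
// whether the failure is retryable given the elapsed time, then wait a
// jittered interval and try again.
func rpcWithRetry(call func() error, canRetry func(elapsed time.Duration, err error) bool,
	holdTimeout time.Duration, shutdownCh <-chan struct{}) error {

	const jitterFraction = 16 // the same 1/16 (~6.25%) stagger this commit keeps using
	start := time.Now()
	for {
		err := call()
		if err == nil {
			return nil
		}
		if !canRetry(time.Since(start), err) {
			return err
		}
		// Wait a small random fraction of the hold timeout before retrying.
		jitter := time.Duration(rand.Int63n(int64(holdTimeout / jitterFraction)))
		select {
		case <-time.After(jitter):
		case <-shutdownCh:
			return err
		}
	}
}

func main() {
	attempts := 0
	err := rpcWithRetry(
		func() error {
			attempts++
			if attempts < 3 {
				return errors.New("no leader")
			}
			return nil
		},
		func(elapsed time.Duration, err error) bool { return elapsed < 7*time.Second },
		7*time.Second,
		make(chan struct{}),
	)
	fmt.Println(attempts, err) // typically prints "3 <nil>"
}
```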
@@ -75,12 +75,6 @@ var RPCSummaries = []prometheus.SummaryDefinition{
 }
 
 const (
-    // jitterFraction is a the limit to the amount of jitter we apply
-    // to a user specified MaxQueryTime. We divide the specified time by
-    // the fraction. So 16 == 6.25% limit of jitter. This same fraction
-    // is applied to the RPCHoldTimeout
-    jitterFraction = 16
-
     // Warn if the Raft command is larger than this.
     // If it's over 1MB something is probably being abusive.
     raftWarnSize = 1024 * 1024
@@ -526,7 +520,14 @@ func (c *limitedConn) Read(b []byte) (n int, err error) {
 }
 
 // canRetry returns true if the request and error indicate that a retry is safe.
-func canRetry(info structs.RPCInfo, err error) bool {
+func canRetry(info structs.RPCInfo, err error, start time.Time, config *Config) bool {
+    if info != nil && info.HasTimedOut(start, config.RPCHoldTimeout, config.MaxQueryTime, config.DefaultQueryTime) {
+        // RPCInfo timeout may include extra time for MaxQueryTime
+        return false
+    } else if info == nil && time.Since(start) > config.RPCHoldTimeout {
+        // When not RPCInfo, timeout is only RPCHoldTimeout
+        return false
+    }
     // No leader errors are always safe to retry since no state could have
     // been changed.
     if structs.IsErrNoLeader(err) {
@@ -545,16 +546,16 @@ func canRetry(info structs.RPCInfo, err error) bool {
 
 // ForwardRPC is used to forward an RPC request to a remote DC or to the local leader
 // Returns a bool of if forwarding was performed, as well as any error
-func (s *Server) ForwardRPC(method string, req structs.RPCInfo, reply interface{}) (bool, error) {
-    var firstCheck time.Time
+func (s *Server) ForwardRPC(method string, info structs.RPCInfo, reply interface{}) (bool, error) {
+    firstCheck := time.Now()
 
     // Handle DC forwarding
-    dc := req.RequestDatacenter()
+    dc := info.RequestDatacenter()
     if dc != s.config.Datacenter {
         // Local tokens only work within the current datacenter. Check to see
         // if we are attempting to forward one to a remote datacenter and strip
         // it, falling back on the anonymous token on the other end.
-        if token := req.TokenSecret(); token != "" {
+        if token := info.TokenSecret(); token != "" {
             done, ident, err := s.ResolveIdentityFromToken(token)
             if done {
                 if err != nil && !acl.IsErrNotFound(err) {
@@ -562,18 +563,18 @@ func (s *Server) ForwardRPC(method string, req structs.RPCInfo, reply interface{
             }
             if ident != nil && ident.IsLocal() {
                 // Strip it from the request.
-                req.SetTokenSecret("")
-                defer req.SetTokenSecret(token)
+                info.SetTokenSecret("")
+                defer info.SetTokenSecret(token)
             }
         }
     }
 
-    err := s.forwardDC(method, dc, req, reply)
+    err := s.forwardDC(method, dc, info, reply)
     return true, err
 }
 
 // Check if we can allow a stale read, ensure our local DB is initialized
-    if req.IsRead() && req.AllowStaleRead() && !s.raft.LastContact().IsZero() {
+    if info.IsRead() && info.AllowStaleRead() && !s.raft.LastContact().IsZero() {
         return false, nil
     }
 
@@ -596,20 +597,15 @@ CHECK_LEADER:
     // Handle the case of a known leader
     if leader != nil {
         rpcErr = s.connPool.RPC(s.config.Datacenter, leader.ShortName, leader.Addr,
-            method, req, reply)
-        if rpcErr != nil && canRetry(req, rpcErr) {
-            goto RETRY
+            method, info, reply)
+        if rpcErr == nil {
+            return true, nil
         }
-        return true, rpcErr
     }
 
-RETRY:
+    if retry := canRetry(info, rpcErr, firstCheck, s.config); retry {
         // Gate the request until there is a leader
-    if firstCheck.IsZero() {
-        firstCheck = time.Now()
-    }
-    if time.Since(firstCheck) < s.config.RPCHoldTimeout {
-        jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
+        jitter := lib.RandomStagger(s.config.RPCHoldTimeout / structs.JitterFraction)
         select {
         case <-time.After(jitter):
             goto CHECK_LEADER
@@ -832,7 +828,7 @@ func (s *Server) blockingQuery(queryOpts structs.QueryOptionsCompat, queryMeta s
     }
 
     // Apply a small amount of jitter to the request.
-    queryTimeout += lib.RandomStagger(queryTimeout / jitterFraction)
+    queryTimeout += lib.RandomStagger(queryTimeout / structs.JitterFraction)
 
     // wrap the base context with a deadline
     ctx, cancel = context.WithDeadline(ctx, time.Now().Add(queryTimeout))
@@ -933,7 +929,7 @@ func (s *Server) consistentRead() error {
     if s.isReadyForConsistentReads() {
         return nil
     }
-    jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
+    jitter := lib.RandomStagger(s.config.RPCHoldTimeout / structs.JitterFraction)
     deadline := time.Now().Add(s.config.RPCHoldTimeout)
 
     for time.Now().Before(deadline) {
@@ -962,10 +962,17 @@ func TestCanRetry(t *testing.T) {
         req structs.RPCInfo
         err error
         expected bool
+        timeout time.Time
     }
+    config := DefaultConfig()
+    now := time.Now()
+    config.RPCHoldTimeout = 7 * time.Second
     run := func(t *testing.T, tc testCase) {
-        require.Equal(t, tc.expected, canRetry(tc.req, tc.err))
+        timeOutValue := tc.timeout
+        if timeOutValue.IsZero() {
+            timeOutValue = now
+        }
+        require.Equal(t, tc.expected, canRetry(tc.req, tc.err, timeOutValue, config))
     }
 
     var testCases = []testCase{
@@ -990,6 +997,46 @@ func TestCanRetry(t *testing.T) {
            err: io.EOF,
            expected: true,
        },
+        {
+            name: "EOF error",
+            req: &structs.DCSpecificRequest{},
+            err: io.EOF,
+            expected: true,
+        },
+        {
+            name: "HasTimedOut implementation with no error",
+            req: &structs.DCSpecificRequest{},
+            err: nil,
+            expected: false,
+        },
+        {
+            name: "HasTimedOut implementation timedOut with no error",
+            req: &structs.DCSpecificRequest{},
+            err: nil,
+            expected: false,
+            timeout: now.Add(-(config.RPCHoldTimeout + time.Second)),
+        },
+        {
+            name: "HasTimedOut implementation timedOut (with EOF error)",
+            req: &structs.DCSpecificRequest{},
+            err: io.EOF,
+            expected: false,
+            timeout: now.Add(-(config.RPCHoldTimeout + time.Second)),
+        },
+        {
+            name: "HasTimedOut implementation timedOut blocking call",
+            req: &structs.DCSpecificRequest{QueryOptions: structs.QueryOptions{MaxQueryTime: 300, MinQueryIndex: 1}},
+            err: nil,
+            expected: false,
+            timeout: now.Add(-(config.RPCHoldTimeout + config.MaxQueryTime + time.Second)),
+        },
+        {
+            name: "HasTimedOut implementation timedOut blocking call (MaxQueryTime not set)",
+            req: &structs.DCSpecificRequest{QueryOptions: structs.QueryOptions{MinQueryIndex: 1}},
+            err: nil,
+            expected: false,
+            timeout: now.Add(-(config.RPCHoldTimeout + config.MaxQueryTime + time.Second)),
+        },
        {
            name: "EOF on write request",
            err: io.EOF,
@@ -1011,3 +1058,7 @@ type isReadRequest struct {
 func (r isReadRequest) IsRead() bool {
     return true
 }
+
+func (r isReadRequest) HasTimedOut(since time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
+    return false
+}
@@ -159,6 +159,12 @@ const (
     // we multiply by time.Second
     lockDelayMinThreshold = 1000
 
+    // JitterFraction is a the limit to the amount of jitter we apply
+    // to a user specified MaxQueryTime. We divide the specified time by
+    // the fraction. So 16 == 6.25% limit of jitter. This same fraction
+    // is applied to the RPCHoldTimeout
+    JitterFraction = 16
+
     // WildcardSpecifier is the string which should be used for specifying a wildcard
     // The exact semantics of the wildcard is left up to the code where its used.
     WildcardSpecifier = "*"
@@ -193,6 +199,7 @@ type RPCInfo interface {
     AllowStaleRead() bool
     TokenSecret() string
     SetTokenSecret(string)
+    HasTimedOut(since time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool
 }
 
 // QueryOptions is used to specify various flags for read queries
@@ -291,6 +298,20 @@ func (q *QueryOptions) SetTokenSecret(s string) {
     q.Token = s
 }
 
+func (q QueryOptions) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
+    if q.MinQueryIndex > 0 {
+        if q.MaxQueryTime > maxQueryTime {
+            q.MaxQueryTime = maxQueryTime
+        } else if q.MaxQueryTime <= 0 {
+            q.MaxQueryTime = defaultQueryTime
+        }
+        q.MaxQueryTime += lib.RandomStagger(q.MaxQueryTime / JitterFraction)
+
+        return time.Since(start) > (q.MaxQueryTime + rpcHoldTimeout)
+    }
+    return time.Since(start) > rpcHoldTimeout
+}
+
 type WriteRequest struct {
     // Token is the ACL token ID. If not provided, the 'anonymous'
     // token is assumed for backwards compatibility.
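Rough numbers for the budget that QueryOptions.HasTimedOut grants, assuming Consul's commonly cited defaults of about 5 minutes for DefaultQueryTime, 10 minutes for MaxQueryTime and 7 seconds for RPCHoldTimeout (assumptions for illustration, not values set in this diff): a blocking query that leaves MaxQueryTime unset may now be retried for roughly 5 minutes plus up to 1/16 jitter plus 7 seconds, while a plain write or non-blocking read keeps only the 7-second window. A small self-contained sketch of that arithmetic, omitting the random jitter term:

```go
package main

import (
	"fmt"
	"time"
)

// effectiveBudget mirrors the rules in QueryOptions.HasTimedOut above (minus
// the jitter, whose upper bound is wait/16): bound the client-requested wait
// by the server maximum, fall back to the default, then add the hold timeout.
func effectiveBudget(minQueryIndex uint64, requestedWait, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) time.Duration {
	if minQueryIndex == 0 {
		// Not a blocking query: only the hold timeout applies.
		return rpcHoldTimeout
	}
	wait := requestedWait
	if wait > maxQueryTime {
		wait = maxQueryTime
	} else if wait <= 0 {
		wait = defaultQueryTime
	}
	return wait + rpcHoldTimeout
}

func main() {
	// Assumed defaults, for illustration only.
	hold := 7 * time.Second
	maxQT := 10 * time.Minute
	defQT := 5 * time.Minute

	fmt.Println(effectiveBudget(0, 0, hold, maxQT, defQT))              // 7s: non-blocking request
	fmt.Println(effectiveBudget(1, 0, hold, maxQT, defQT))              // 5m7s: blocking, MaxQueryTime unset
	fmt.Println(effectiveBudget(1, 30*time.Second, hold, maxQT, defQT)) // 37s: blocking with a 30s wait
	fmt.Println(effectiveBudget(1, time.Hour, hold, maxQT, defQT))      // 10m7s: capped at the server max
}
```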
@@ -314,6 +335,10 @@ func (w *WriteRequest) SetTokenSecret(s string) {
     w.Token = s
 }
 
+func (w WriteRequest) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
+    return time.Since(start) > rpcHoldTimeout
+}
+
 // QueryMeta allows a query response to include potentially
 // useful metadata about a query
 type QueryMeta struct {
@@ -16,6 +16,9 @@ var yamuxSessionShutdown = yamux.ErrSessionShutdown.Error()
 // IsErrEOF returns true if we get an EOF error from the socket itself, or
 // an EOF equivalent error from yamux.
 func IsErrEOF(err error) bool {
+    if err == nil {
+        return false
+    }
     if errors.Is(err, io.EOF) {
         return true
     }
@@ -1,5 +1,7 @@
 package pbautoconf
 
+import "time"
+
 func (req *AutoConfigRequest) RequestDatacenter() string {
     return req.Datacenter
 }
@@ -19,3 +21,7 @@ func (req *AutoConfigRequest) TokenSecret() string {
 func (req *AutoConfigRequest) SetTokenSecret(token string) {
     req.ConsulToken = token
 }
+
+func (req *AutoConfigRequest) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
+    return time.Since(start) > rpcHoldTimeout
+}