increase protobuf size limit for cluster peering (#14976)

Author: malizz (committed via GitHub)
Date:   2022-10-13 13:46:51 -07:00
Parent: e04c56a3a1
Commit: b0b0cbb8ee
5 changed files with 57 additions and 19 deletions

Changed file 1 of 5

@@ -338,9 +338,7 @@ func (s *Server) establishStream(ctx context.Context, logger hclog.Logger, ws me
         // Try a new address on each iteration by advancing the ring buffer on errors.
         addr := <-nextServerAddr

-        logger.Trace("dialing peer", "addr", addr)
-        conn, err := grpc.DialContext(streamCtx, addr,
-            // TODO(peering): use a grpc.WithStatsHandler here?)
+        opts := []grpc.DialOption{
             tlsOption,
             // For keep alive parameters there is a larger comment in ClientConnPool.dial about that.
             grpc.WithKeepaliveParams(keepalive.ClientParameters{
@@ -349,7 +347,12 @@ func (s *Server) establishStream(ctx context.Context, logger hclog.Logger, ws me
                 // send keepalive pings even if there is no active streams
                 PermitWithoutStream: true,
             }),
-        )
+            grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(50 * 1024 * 1024)),
+        }
+
+        // TODO(peering): use a grpc.WithStatsHandler here?)
+        logger.Trace("dialing peer", "addr", addr)
+        conn, err := grpc.DialContext(streamCtx, addr, opts...)
         if err != nil {
             return fmt.Errorf("failed to dial: %w", err)
         }
@@ -397,10 +400,14 @@ func (s *Server) establishStream(ctx context.Context, logger hclog.Logger, ws me
         }, func(err error) {
             // TODO(peering): why are we using TrackSendError here? This could also be a receive error.
             streamStatus.TrackSendError(err.Error())
-            if isFailedPreconditionErr(err) {
+            if isErrCode(err, codes.FailedPrecondition) {
                 logger.Debug("stream disconnected due to 'failed precondition' error; reconnecting",
                     "error", err)
                 return
+            } else if isErrCode(err, codes.ResourceExhausted) {
+                logger.Debug("stream disconnected due to 'resource exhausted' error; reconnecting",
+                    "error", err)
+                return
             }
             logger.Error("error managing peering stream", "error", err)
         }, peeringRetryTimeout)
@@ -680,7 +687,7 @@ func retryLoopBackoffPeering(ctx context.Context, logger hclog.Logger, loopFn fu
 // quickly. For example in the case of connecting with a follower through a load balancer, we just need to retry
 // until our request lands on a leader.
 func peeringRetryTimeout(failedAttempts uint, loopErr error) time.Duration {
-    if loopErr != nil && isFailedPreconditionErr(loopErr) {
+    if loopErr != nil && isErrCode(loopErr, codes.FailedPrecondition) {
         // Wait a constant time for the first number of retries.
         if failedAttempts <= maxFastConnRetries {
             return fastConnRetryTimeout
@@ -697,6 +704,11 @@ func peeringRetryTimeout(failedAttempts uint, loopErr error) time.Duration {
         return ms
     }

+    // if the message sent is too large probably should not retry at all
+    if loopErr != nil && isErrCode(loopErr, codes.ResourceExhausted) {
+        return maxFastRetryBackoff
+    }
+
     // Else we go with the default backoff from retryLoopBackoff.
     if (1 << failedAttempts) < maxRetryBackoff {
         return (1 << failedAttempts) * time.Second
@@ -704,23 +716,22 @@ func peeringRetryTimeout(failedAttempts uint, loopErr error) time.Duration {
     return time.Duration(maxRetryBackoff) * time.Second
 }

-// isFailedPreconditionErr returns true if err is a gRPC error with code FailedPrecondition.
-func isFailedPreconditionErr(err error) bool {
+// isErrCode returns true if err is a gRPC error with given error code.
+func isErrCode(err error, code codes.Code) bool {
     if err == nil {
         return false
     }

     // Handle wrapped errors, since status.FromError does a naive assertion.
     var statusErr interface {
         GRPCStatus() *grpcstatus.Status
     }
     if errors.As(err, &statusErr) {
-        return statusErr.GRPCStatus().Code() == codes.FailedPrecondition
+        return statusErr.GRPCStatus().Code() == code
     }

     grpcErr, ok := grpcstatus.FromError(err)
     if !ok {
         return false
     }
-    return grpcErr.Code() == codes.FailedPrecondition
+    return grpcErr.Code() == code
 }
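For context on how these branches shape the retry schedule, here is a small, self-contained sketch of the same idea. The constant values and the `retryTimeout` helper below are placeholders for illustration only; Consul's real constants (`maxFastConnRetries`, `fastConnRetryTimeout`, `maxFastRetryBackoff`, `maxRetryBackoff`) are defined elsewhere in the package and are not shown in this diff.

```go
// backoff_sketch.go — a minimal illustration of the retry shape used by
// peeringRetryTimeout after this change. Constant values are placeholders.
package main

import (
	"fmt"
	"time"
)

const (
	maxFastConnRetries   uint          = 5
	fastConnRetryTimeout time.Duration = 8 * time.Millisecond
	maxFastRetryBackoff  time.Duration = 8 * time.Second
	maxRetryBackoff                    = 256 // seconds
)

// retryTimeout sketches the three branches: FailedPrecondition retries fast
// (a follower should hand us off to the leader soon), ResourceExhausted waits
// at a fixed ceiling (an oversized message will not shrink on its own), and
// every other error falls back to exponential backoff capped at maxRetryBackoff.
func retryTimeout(failedAttempts uint, failedPrecondition, resourceExhausted bool) time.Duration {
	if failedPrecondition && failedAttempts <= maxFastConnRetries {
		return fastConnRetryTimeout
	}
	if resourceExhausted {
		return maxFastRetryBackoff
	}
	if (1 << failedAttempts) < maxRetryBackoff {
		return (1 << failedAttempts) * time.Second
	}
	return time.Duration(maxRetryBackoff) * time.Second
}

func main() {
	for attempt := uint(1); attempt <= 10; attempt++ {
		fmt.Printf("attempt %2d: failed-precondition=%-10v resource-exhausted=%-10v other=%v\n",
			attempt,
			retryTimeout(attempt, true, false),
			retryTimeout(attempt, false, true),
			retryTimeout(attempt, false, false))
	}
}
```

The point of the new branch is visible in the output: a ResourceExhausted failure never enters the millisecond fast-retry path, since resending the same oversized message quickly would only fail again.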

Changed file 2 of 5

@@ -1686,14 +1686,35 @@ func TestLeader_Peering_retryLoopBackoffPeering_cancelContext(t *testing.T) {
     }, allErrors)
 }

-func Test_isFailedPreconditionErr(t *testing.T) {
-    st := grpcstatus.New(codes.FailedPrecondition, "cannot establish a peering stream on a follower node")
-    err := st.Err()
-    assert.True(t, isFailedPreconditionErr(err))
-
-    // test that wrapped errors are checked correctly
-    werr := fmt.Errorf("wrapped: %w", err)
-    assert.True(t, isFailedPreconditionErr(werr))
+func Test_isErrCode(t *testing.T) {
+    tests := []struct {
+        name         string
+        expectedCode codes.Code
+    }{
+        {
+            name:         "cannot establish a peering stream on a follower node",
+            expectedCode: codes.FailedPrecondition,
+        },
+        {
+            name:         "received message larger than max ",
+            expectedCode: codes.ResourceExhausted,
+        },
+        {
+            name:         "deadline exceeded",
+            expectedCode: codes.DeadlineExceeded,
+        },
+    }
+    for _, tc := range tests {
+        t.Run(tc.name, func(t *testing.T) {
+            st := grpcstatus.New(tc.expectedCode, tc.name)
+            err := st.Err()
+            assert.True(t, isErrCode(err, tc.expectedCode))
+
+            // test that wrapped errors are checked correctly
+            werr := fmt.Errorf("wrapped: %w", err)
+            assert.True(t, isErrCode(werr, tc.expectedCode))
+        })
+    }
 }

 func Test_Leader_PeeringSync_ServerAddressUpdates(t *testing.T) {

Changed file 3 of 5

@@ -29,6 +29,7 @@ func NewServer(logger agentmiddleware.Logger, metricsObj *metrics.Metrics) *grpc
     opts := []grpc.ServerOption{
         grpc.MaxConcurrentStreams(2048),
+        grpc.MaxRecvMsgSize(50 * 1024 * 1024),
         grpc.StatsHandler(agentmiddleware.NewStatsHandler(metricsObj, metricsLabels)),
         middleware.WithUnaryServerChain(
             // Add middlware interceptors to recover in case of panics.
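The one-line server change above is the receiving half of the limit raised in the first file: gRPC-Go servers reject inbound messages larger than 4 MB by default, surfacing `codes.ResourceExhausted` to the sender, while the new dial option caps what the peering client will send. A minimal sketch of that pairing, assuming nothing from Consul beyond the 50 MB figure (the helper names, address, and insecure credentials are for illustration only):

```go
// sizelimit_sketch.go — pairing a raised server receive limit with a matching
// client send limit, as this commit does for peering traffic.
package main

import (
	"log"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

const maxMsgSize = 50 * 1024 * 1024 // 50 MB, the figure used by this commit

func newServer() *grpc.Server {
	// Raise the default 4 MB receive ceiling so large replicated resources
	// are not rejected with codes.ResourceExhausted.
	return grpc.NewServer(
		grpc.MaxRecvMsgSize(maxMsgSize),
	)
}

func dial(addr string) (*grpc.ClientConn, error) {
	// Cap the client's send size at the same figure so both ends of the
	// stream agree on how large a message may be.
	return grpc.Dial(addr,
		grpc.WithTransportCredentials(insecure.NewCredentials()), // illustration only
		grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(maxMsgSize)),
	)
}

func main() {
	srv := newServer()
	defer srv.Stop()

	conn, err := dial("127.0.0.1:8502") // placeholder address
	if err != nil {
		log.Fatalf("failed to dial: %v", err)
	}
	defer conn.Close()
}
```

Setting the same figure on both ends, as the commit does, keeps the send and receive ceilings for peering traffic in agreement.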

Changed file 4 of 5

@@ -367,6 +367,11 @@ func (s *Server) realHandleStream(streamReq HandleStreamRequest) error {
             // resources, not request/ack messages.
             if msg.GetResponse() != nil {
                 if err != nil {
+                    if id := msg.GetResponse().GetResourceID(); id != "" {
+                        logger.Error("failed to send resource", "resourceID", id, "error", err)
+                        status.TrackSendError(err.Error())
+                        return nil
+                    }
                     status.TrackSendError(err.Error())
                 } else {
                     status.TrackSendSuccess()
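The hunk above changes how a failed send of a replicated resource is handled: when the error can be tied to a specific resource ID, it is logged and tracked and nil is returned instead of the error, so that, as far as this hunk shows, one problematic resource is recorded rather than propagated as a fatal stream error. A rough sketch of that "record and keep going" pattern, with every name here (`resource`, `replicate`, `sendFn`, `trackError`) invented for the example:

```go
// skip_failed_send_sketch.go — records a per-resource send failure and keeps
// replicating instead of aborting the whole stream. Not Consul code.
package main

import (
	"errors"
	"fmt"
)

type resource struct {
	ID      string
	Payload []byte
}

// replicate tries to send every resource; when one fails it records the error
// by resource ID and keeps going rather than returning a stream-fatal error.
func replicate(resources []resource, sendFn func(resource) error, trackError func(string)) {
	for _, r := range resources {
		if err := sendFn(r); err != nil {
			fmt.Printf("failed to send resource %q: %v\n", r.ID, err)
			trackError(err.Error())
			continue // drop this resource; the rest still replicate
		}
	}
}

func main() {
	errTooLarge := errors.New("received message larger than max")
	send := func(r resource) error {
		if len(r.Payload) > 8 { // stand-in for a real message-size limit
			return errTooLarge
		}
		return nil
	}

	var lastErr string
	replicate([]resource{
		{ID: "web-sidecar-proxy", Payload: make([]byte, 4)},
		{ID: "huge-service", Payload: make([]byte, 64)},
		{ID: "api", Payload: make([]byte, 2)},
	}, send, func(msg string) { lastErr = msg })

	fmt.Println("last tracked send error:", lastErr)
}
```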

Changed file 5 of 5

@@ -55,7 +55,7 @@ The cluster peering beta release includes the following features and functionali
 Not all features and functionality are available in the beta release. In particular, consider the following technical constraints:

 - Mesh gateways for _server to server traffic_ are not available.
-- Services with node, instance, and check definitions totaling more than 4MB cannot be exported to a peer.
+- Services with node, instance, and check definitions totaling more than 50MB cannot be exported to a peer.
 - The `service-splitter` and `service-router` configuration entry kinds cannot directly target a peer. To split or route traffic to a service instance on a peer, you must supplement your desired dynamic routing definition with a `service-resolver` config entry on the dialing cluster. Refer to [L7 traffic management between peers](/docs/connect/cluster-peering/create-manage-peering#L7-traffic) for more information.
 - The `consul intention` CLI command is not supported. To manage intentions that specify services in peered clusters, use [configuration entries](/docs/connect/config-entries/service-intentions).
 - Accessing key/value stores across peers is not supported.