From 5587296464a6fdd9f91c1b6d4240900cae546616 Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Thu, 8 Apr 2021 11:05:38 +0100 Subject: [PATCH] Merge pull request #9977 from hashicorp/grpc-tuning streaming: Grpc tuning --- .../cache-types/streaming_health_services.go | 10 +++++++--- agent/grpc/client.go | 19 ++++++++++++++++++- agent/grpc/handler.go | 5 +++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/agent/cache-types/streaming_health_services.go b/agent/cache-types/streaming_health_services.go index d150a43f48..2ee2e0bd39 100644 --- a/agent/cache-types/streaming_health_services.go +++ b/agent/cache-types/streaming_health_services.go @@ -112,9 +112,13 @@ func newMaterializer( Logger: deps.Logger, Waiter: &retry.Waiter{ MinFailures: 1, - MinWait: 0, - MaxWait: 60 * time.Second, - Jitter: retry.NewJitter(100), + // Start backing off with small increments (200-400ms) which will double + // each attempt. (200-400, 400-800, 800-1600, 1600-3200, 3200-6400, and so + // on up to MaxWait). (retry.Wait applies Max limit after jitter right now). + Factor: 200 * time.Millisecond, + MinWait: 0, + MaxWait: 60 * time.Second, + Jitter: retry.NewJitter(100), }, Request: newRequestFn, }), nil diff --git a/agent/grpc/client.go b/agent/grpc/client.go index 9fdfa54e62..8e43873828 100644 --- a/agent/grpc/client.go +++ b/agent/grpc/client.go @@ -5,8 +5,10 @@ import ( "fmt" "net" "sync" + "time" "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/pool" @@ -64,7 +66,22 @@ func (c *ClientConnPool) ClientConn(datacenter string) (*grpc.ClientConn, error) grpc.WithDisableRetry(), grpc.WithStatsHandler(newStatsHandler(defaultMetrics())), // nolint:staticcheck // there is no other supported alternative to WithBalancerName - grpc.WithBalancerName("pick_first")) + grpc.WithBalancerName("pick_first"), + // Keep alive parameters are based on the same default ones we used for + // Yamux. 
These are somewhat arbitrary but we did observe in scale testing + // that the gRPC defaults (servers send keepalives only every 2 hours, + // clients never) seemed to result in TCP drops going undetected until + // actual updates needed to be sent which caused unnecessary delays for + // deliveries. These settings should be no more work for servers than + // existing yamux clients but hopefully allow TCP drops to be detected + // earlier and so have a smaller chance of going unnoticed until there are + // actual updates to send out from the servers. The servers have a policy to + // not accept pings any faster than once every 15 seconds to protect against + // abuse. + grpc.WithKeepaliveParams(keepalive.ClientParameters{ + Time: 30 * time.Second, + Timeout: 10 * time.Second, + })) if err != nil { return nil, err } diff --git a/agent/grpc/handler.go b/agent/grpc/handler.go index 388954dc45..53705b4dde 100644 --- a/agent/grpc/handler.go +++ b/agent/grpc/handler.go @@ -6,8 +6,10 @@ package grpc import ( "fmt" "net" + "time" "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" ) // NewHandler returns a gRPC server that accepts connections from Handle(conn). @@ -20,6 +22,9 @@ func NewHandler(addr net.Addr, register func(server *grpc.Server)) *Handler { srv := grpc.NewServer( grpc.StatsHandler(newStatsHandler(metrics)), grpc.StreamInterceptor((&activeStreamCounter{metrics: metrics}).Intercept), + grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{ + MinTime: 15 * time.Second, + }), ) register(srv)