Anton Evangelatov 7c9314f231 swarm: integrate OpenTracing; propagate ctx to internal APIs (#17169)
* swarm: propagate ctx, enable opentracing

* swarm/tracing: log error when tracing is misconfigured
2018-07-13 17:40:28 +02:00

557 lines
18 KiB
Go

// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package jaeger
import (
"fmt"
"math"
"net/url"
"sync"
"sync/atomic"
"time"
"github.com/uber/jaeger-client-go/log"
"github.com/uber/jaeger-client-go/thrift-gen/sampling"
"github.com/uber/jaeger-client-go/utils"
)
const (
defaultSamplingServerURL = "http://localhost:5778/sampling"
defaultSamplingRefreshInterval = time.Minute
defaultMaxOperations = 2000
)
// Sampler decides whether a new trace should be sampled or not.
type Sampler interface {
// IsSampled decides whether a trace with given `id` and `operation`
// should be sampled. This function will also return the tags that
// can be used to identify the type of sampling that was applied to
// the root span. Most simple samplers would return two tags,
// sampler.type and sampler.param, similar to those used in the Configuration
IsSampled(id TraceID, operation string) (sampled bool, tags []Tag)
// Close does a clean shutdown of the sampler, stopping any background
// go-routines it may have started.
Close()
// Equal checks if the `other` sampler is functionally equivalent
// to this sampler.
// TODO remove this function. This function is used to determine if 2 samplers are equivalent
// which does not bode well with the adaptive sampler which has to create all the composite samplers
// for the comparison to occur. This is expensive to do if only one sampler has changed.
Equal(other Sampler) bool
}
// -----------------------
// ConstSampler is a sampler that always makes the same decision.
type ConstSampler struct {
Decision bool
tags []Tag
}
// NewConstSampler creates a ConstSampler.
func NewConstSampler(sample bool) Sampler {
tags := []Tag{
{key: SamplerTypeTagKey, value: SamplerTypeConst},
{key: SamplerParamTagKey, value: sample},
}
return &ConstSampler{Decision: sample, tags: tags}
}
// IsSampled implements IsSampled() of Sampler.
func (s *ConstSampler) IsSampled(id TraceID, operation string) (bool, []Tag) {
return s.Decision, s.tags
}
// Close implements Close() of Sampler.
func (s *ConstSampler) Close() {
// nothing to do
}
// Equal implements Equal() of Sampler.
func (s *ConstSampler) Equal(other Sampler) bool {
if o, ok := other.(*ConstSampler); ok {
return s.Decision == o.Decision
}
return false
}
// -----------------------
// ProbabilisticSampler is a sampler that randomly samples a certain percentage
// of traces.
type ProbabilisticSampler struct {
samplingRate float64
samplingBoundary uint64
tags []Tag
}
const maxRandomNumber = ^(uint64(1) << 63) // i.e. 0x7fffffffffffffff
// NewProbabilisticSampler creates a sampler that randomly samples a certain percentage of traces specified by the
// samplingRate, in the range between 0.0 and 1.0.
//
// It relies on the fact that new trace IDs are 63bit random numbers themselves, thus making the sampling decision
// without generating a new random number, but simply calculating if traceID < (samplingRate * 2^63).
// TODO remove the error from this function for next major release
func NewProbabilisticSampler(samplingRate float64) (*ProbabilisticSampler, error) {
if samplingRate < 0.0 || samplingRate > 1.0 {
return nil, fmt.Errorf("Sampling Rate must be between 0.0 and 1.0, received %f", samplingRate)
}
return newProbabilisticSampler(samplingRate), nil
}
func newProbabilisticSampler(samplingRate float64) *ProbabilisticSampler {
samplingRate = math.Max(0.0, math.Min(samplingRate, 1.0))
tags := []Tag{
{key: SamplerTypeTagKey, value: SamplerTypeProbabilistic},
{key: SamplerParamTagKey, value: samplingRate},
}
return &ProbabilisticSampler{
samplingRate: samplingRate,
samplingBoundary: uint64(float64(maxRandomNumber) * samplingRate),
tags: tags,
}
}
// SamplingRate returns the sampling probability this sampled was constructed with.
func (s *ProbabilisticSampler) SamplingRate() float64 {
return s.samplingRate
}
// IsSampled implements IsSampled() of Sampler.
func (s *ProbabilisticSampler) IsSampled(id TraceID, operation string) (bool, []Tag) {
return s.samplingBoundary >= id.Low, s.tags
}
// Close implements Close() of Sampler.
func (s *ProbabilisticSampler) Close() {
// nothing to do
}
// Equal implements Equal() of Sampler.
func (s *ProbabilisticSampler) Equal(other Sampler) bool {
if o, ok := other.(*ProbabilisticSampler); ok {
return s.samplingBoundary == o.samplingBoundary
}
return false
}
// -----------------------
type rateLimitingSampler struct {
maxTracesPerSecond float64
rateLimiter utils.RateLimiter
tags []Tag
}
// NewRateLimitingSampler creates a sampler that samples at most maxTracesPerSecond. The distribution of sampled
// traces follows burstiness of the service, i.e. a service with uniformly distributed requests will have those
// requests sampled uniformly as well, but if requests are bursty, especially sub-second, then a number of
// sequential requests can be sampled each second.
func NewRateLimitingSampler(maxTracesPerSecond float64) Sampler {
tags := []Tag{
{key: SamplerTypeTagKey, value: SamplerTypeRateLimiting},
{key: SamplerParamTagKey, value: maxTracesPerSecond},
}
return &rateLimitingSampler{
maxTracesPerSecond: maxTracesPerSecond,
rateLimiter: utils.NewRateLimiter(maxTracesPerSecond, math.Max(maxTracesPerSecond, 1.0)),
tags: tags,
}
}
// IsSampled implements IsSampled() of Sampler.
func (s *rateLimitingSampler) IsSampled(id TraceID, operation string) (bool, []Tag) {
return s.rateLimiter.CheckCredit(1.0), s.tags
}
func (s *rateLimitingSampler) Close() {
// nothing to do
}
func (s *rateLimitingSampler) Equal(other Sampler) bool {
if o, ok := other.(*rateLimitingSampler); ok {
return s.maxTracesPerSecond == o.maxTracesPerSecond
}
return false
}
// -----------------------
// GuaranteedThroughputProbabilisticSampler is a sampler that leverages both probabilisticSampler and
// rateLimitingSampler. The rateLimitingSampler is used as a guaranteed lower bound sampler such that
// every operation is sampled at least once in a time interval defined by the lowerBound. ie a lowerBound
// of 1.0 / (60 * 10) will sample an operation at least once every 10 minutes.
//
// The probabilisticSampler is given higher priority when tags are emitted, ie. if IsSampled() for both
// samplers return true, the tags for probabilisticSampler will be used.
type GuaranteedThroughputProbabilisticSampler struct {
probabilisticSampler *ProbabilisticSampler
lowerBoundSampler Sampler
tags []Tag
samplingRate float64
lowerBound float64
}
// NewGuaranteedThroughputProbabilisticSampler returns a delegating sampler that applies both
// probabilisticSampler and rateLimitingSampler.
func NewGuaranteedThroughputProbabilisticSampler(
lowerBound, samplingRate float64,
) (*GuaranteedThroughputProbabilisticSampler, error) {
return newGuaranteedThroughputProbabilisticSampler(lowerBound, samplingRate), nil
}
func newGuaranteedThroughputProbabilisticSampler(lowerBound, samplingRate float64) *GuaranteedThroughputProbabilisticSampler {
s := &GuaranteedThroughputProbabilisticSampler{
lowerBoundSampler: NewRateLimitingSampler(lowerBound),
lowerBound: lowerBound,
}
s.setProbabilisticSampler(samplingRate)
return s
}
func (s *GuaranteedThroughputProbabilisticSampler) setProbabilisticSampler(samplingRate float64) {
if s.probabilisticSampler == nil || s.samplingRate != samplingRate {
s.probabilisticSampler = newProbabilisticSampler(samplingRate)
s.samplingRate = s.probabilisticSampler.SamplingRate()
s.tags = []Tag{
{key: SamplerTypeTagKey, value: SamplerTypeLowerBound},
{key: SamplerParamTagKey, value: s.samplingRate},
}
}
}
// IsSampled implements IsSampled() of Sampler.
func (s *GuaranteedThroughputProbabilisticSampler) IsSampled(id TraceID, operation string) (bool, []Tag) {
if sampled, tags := s.probabilisticSampler.IsSampled(id, operation); sampled {
s.lowerBoundSampler.IsSampled(id, operation)
return true, tags
}
sampled, _ := s.lowerBoundSampler.IsSampled(id, operation)
return sampled, s.tags
}
// Close implements Close() of Sampler.
func (s *GuaranteedThroughputProbabilisticSampler) Close() {
s.probabilisticSampler.Close()
s.lowerBoundSampler.Close()
}
// Equal implements Equal() of Sampler.
func (s *GuaranteedThroughputProbabilisticSampler) Equal(other Sampler) bool {
// NB The Equal() function is expensive and will be removed. See adaptiveSampler.Equal() for
// more information.
return false
}
// this function should only be called while holding a Write lock
func (s *GuaranteedThroughputProbabilisticSampler) update(lowerBound, samplingRate float64) {
s.setProbabilisticSampler(samplingRate)
if s.lowerBound != lowerBound {
s.lowerBoundSampler = NewRateLimitingSampler(lowerBound)
s.lowerBound = lowerBound
}
}
// -----------------------
type adaptiveSampler struct {
sync.RWMutex
samplers map[string]*GuaranteedThroughputProbabilisticSampler
defaultSampler *ProbabilisticSampler
lowerBound float64
maxOperations int
}
// NewAdaptiveSampler returns a delegating sampler that applies both probabilisticSampler and
// rateLimitingSampler via the guaranteedThroughputProbabilisticSampler. This sampler keeps track of all
// operations and delegates calls to the respective guaranteedThroughputProbabilisticSampler.
func NewAdaptiveSampler(strategies *sampling.PerOperationSamplingStrategies, maxOperations int) (Sampler, error) {
return newAdaptiveSampler(strategies, maxOperations), nil
}
func newAdaptiveSampler(strategies *sampling.PerOperationSamplingStrategies, maxOperations int) Sampler {
samplers := make(map[string]*GuaranteedThroughputProbabilisticSampler)
for _, strategy := range strategies.PerOperationStrategies {
sampler := newGuaranteedThroughputProbabilisticSampler(
strategies.DefaultLowerBoundTracesPerSecond,
strategy.ProbabilisticSampling.SamplingRate,
)
samplers[strategy.Operation] = sampler
}
return &adaptiveSampler{
samplers: samplers,
defaultSampler: newProbabilisticSampler(strategies.DefaultSamplingProbability),
lowerBound: strategies.DefaultLowerBoundTracesPerSecond,
maxOperations: maxOperations,
}
}
func (s *adaptiveSampler) IsSampled(id TraceID, operation string) (bool, []Tag) {
s.RLock()
sampler, ok := s.samplers[operation]
if ok {
defer s.RUnlock()
return sampler.IsSampled(id, operation)
}
s.RUnlock()
s.Lock()
defer s.Unlock()
// Check if sampler has already been created
sampler, ok = s.samplers[operation]
if ok {
return sampler.IsSampled(id, operation)
}
// Store only up to maxOperations of unique ops.
if len(s.samplers) >= s.maxOperations {
return s.defaultSampler.IsSampled(id, operation)
}
newSampler := newGuaranteedThroughputProbabilisticSampler(s.lowerBound, s.defaultSampler.SamplingRate())
s.samplers[operation] = newSampler
return newSampler.IsSampled(id, operation)
}
func (s *adaptiveSampler) Close() {
s.Lock()
defer s.Unlock()
for _, sampler := range s.samplers {
sampler.Close()
}
s.defaultSampler.Close()
}
func (s *adaptiveSampler) Equal(other Sampler) bool {
// NB The Equal() function is overly expensive for adaptiveSampler since it's composed of multiple
// samplers which all need to be initialized before this function can be called for a comparison.
// Therefore, adaptiveSampler uses the update() function to only alter the samplers that need
// changing. Hence this function always returns false so that the update function can be called.
// Once the Equal() function is removed from the Sampler API, this will no longer be needed.
return false
}
func (s *adaptiveSampler) update(strategies *sampling.PerOperationSamplingStrategies) {
s.Lock()
defer s.Unlock()
for _, strategy := range strategies.PerOperationStrategies {
operation := strategy.Operation
samplingRate := strategy.ProbabilisticSampling.SamplingRate
lowerBound := strategies.DefaultLowerBoundTracesPerSecond
if sampler, ok := s.samplers[operation]; ok {
sampler.update(lowerBound, samplingRate)
} else {
sampler := newGuaranteedThroughputProbabilisticSampler(
lowerBound,
samplingRate,
)
s.samplers[operation] = sampler
}
}
s.lowerBound = strategies.DefaultLowerBoundTracesPerSecond
if s.defaultSampler.SamplingRate() != strategies.DefaultSamplingProbability {
s.defaultSampler = newProbabilisticSampler(strategies.DefaultSamplingProbability)
}
}
// -----------------------
// RemotelyControlledSampler is a delegating sampler that polls a remote server
// for the appropriate sampling strategy, constructs a corresponding sampler and
// delegates to it for sampling decisions.
type RemotelyControlledSampler struct {
// These fields must be first in the struct because `sync/atomic` expects 64-bit alignment.
// Cf. https://github.com/uber/jaeger-client-go/issues/155, https://goo.gl/zW7dgq
closed int64 // 0 - not closed, 1 - closed
sync.RWMutex
samplerOptions
serviceName string
manager sampling.SamplingManager
doneChan chan *sync.WaitGroup
}
type httpSamplingManager struct {
serverURL string
}
func (s *httpSamplingManager) GetSamplingStrategy(serviceName string) (*sampling.SamplingStrategyResponse, error) {
var out sampling.SamplingStrategyResponse
v := url.Values{}
v.Set("service", serviceName)
if err := utils.GetJSON(s.serverURL+"?"+v.Encode(), &out); err != nil {
return nil, err
}
return &out, nil
}
// NewRemotelyControlledSampler creates a sampler that periodically pulls
// the sampling strategy from an HTTP sampling server (e.g. jaeger-agent).
func NewRemotelyControlledSampler(
serviceName string,
opts ...SamplerOption,
) *RemotelyControlledSampler {
options := applySamplerOptions(opts...)
sampler := &RemotelyControlledSampler{
samplerOptions: options,
serviceName: serviceName,
manager: &httpSamplingManager{serverURL: options.samplingServerURL},
doneChan: make(chan *sync.WaitGroup),
}
go sampler.pollController()
return sampler
}
func applySamplerOptions(opts ...SamplerOption) samplerOptions {
options := samplerOptions{}
for _, option := range opts {
option(&options)
}
if options.sampler == nil {
options.sampler = newProbabilisticSampler(0.001)
}
if options.logger == nil {
options.logger = log.NullLogger
}
if options.maxOperations <= 0 {
options.maxOperations = defaultMaxOperations
}
if options.samplingServerURL == "" {
options.samplingServerURL = defaultSamplingServerURL
}
if options.metrics == nil {
options.metrics = NewNullMetrics()
}
if options.samplingRefreshInterval <= 0 {
options.samplingRefreshInterval = defaultSamplingRefreshInterval
}
return options
}
// IsSampled implements IsSampled() of Sampler.
func (s *RemotelyControlledSampler) IsSampled(id TraceID, operation string) (bool, []Tag) {
s.RLock()
defer s.RUnlock()
return s.sampler.IsSampled(id, operation)
}
// Close implements Close() of Sampler.
func (s *RemotelyControlledSampler) Close() {
if swapped := atomic.CompareAndSwapInt64(&s.closed, 0, 1); !swapped {
s.logger.Error("Repeated attempt to close the sampler is ignored")
return
}
var wg sync.WaitGroup
wg.Add(1)
s.doneChan <- &wg
wg.Wait()
}
// Equal implements Equal() of Sampler.
func (s *RemotelyControlledSampler) Equal(other Sampler) bool {
// NB The Equal() function is expensive and will be removed. See adaptiveSampler.Equal() for
// more information.
if o, ok := other.(*RemotelyControlledSampler); ok {
s.RLock()
o.RLock()
defer s.RUnlock()
defer o.RUnlock()
return s.sampler.Equal(o.sampler)
}
return false
}
func (s *RemotelyControlledSampler) pollController() {
ticker := time.NewTicker(s.samplingRefreshInterval)
defer ticker.Stop()
s.pollControllerWithTicker(ticker)
}
func (s *RemotelyControlledSampler) pollControllerWithTicker(ticker *time.Ticker) {
for {
select {
case <-ticker.C:
s.updateSampler()
case wg := <-s.doneChan:
wg.Done()
return
}
}
}
func (s *RemotelyControlledSampler) getSampler() Sampler {
s.Lock()
defer s.Unlock()
return s.sampler
}
func (s *RemotelyControlledSampler) setSampler(sampler Sampler) {
s.Lock()
defer s.Unlock()
s.sampler = sampler
}
func (s *RemotelyControlledSampler) updateSampler() {
res, err := s.manager.GetSamplingStrategy(s.serviceName)
if err != nil {
s.metrics.SamplerQueryFailure.Inc(1)
return
}
s.Lock()
defer s.Unlock()
s.metrics.SamplerRetrieved.Inc(1)
if strategies := res.GetOperationSampling(); strategies != nil {
s.updateAdaptiveSampler(strategies)
} else {
err = s.updateRateLimitingOrProbabilisticSampler(res)
}
if err != nil {
s.metrics.SamplerUpdateFailure.Inc(1)
s.logger.Infof("Unable to handle sampling strategy response %+v. Got error: %v", res, err)
return
}
s.metrics.SamplerUpdated.Inc(1)
}
// NB: this function should only be called while holding a Write lock
func (s *RemotelyControlledSampler) updateAdaptiveSampler(strategies *sampling.PerOperationSamplingStrategies) {
if adaptiveSampler, ok := s.sampler.(*adaptiveSampler); ok {
adaptiveSampler.update(strategies)
} else {
s.sampler = newAdaptiveSampler(strategies, s.maxOperations)
}
}
// NB: this function should only be called while holding a Write lock
func (s *RemotelyControlledSampler) updateRateLimitingOrProbabilisticSampler(res *sampling.SamplingStrategyResponse) error {
var newSampler Sampler
if probabilistic := res.GetProbabilisticSampling(); probabilistic != nil {
newSampler = newProbabilisticSampler(probabilistic.SamplingRate)
} else if rateLimiting := res.GetRateLimitingSampling(); rateLimiting != nil {
newSampler = NewRateLimitingSampler(float64(rateLimiting.MaxTracesPerSecond))
} else {
return fmt.Errorf("Unsupported sampling strategy type %v", res.GetStrategyType())
}
if !s.sampler.Equal(newSampler) {
s.sampler = newSampler
}
return nil
}