consul/agent/hcp/telemetry_provider.go
Melissa Kam c112a6632d
[CC-7042] Update and enable the HCP metrics sink in the HCP manager (#20072)
* Option to set HCP client at runtime

Allows us to initially set a nil HCP client for the
telemetry provider and update it later.

* Set telemetry provider HCP client in HCP manager

Set the telemetry provider as a dependency and pass it to
the manager. Update the telemetry provider's HCP client
when the HCP manager starts.

* Add a provider interface for the metrics client

This provider will allow us to configure and reconfigure the
retryable HTTP client and the headers for the metrics client.

* Move HTTP retryable client to separate file

Copied directly from the metrics client.

* Abstract HCP specific values in HTTP client

Remove HCP specific references and instead initiate with
a generic TLS configuration and authentication source.

* Set up HTTP client and headers in the provider

Move setup from the metrics client to the HCP telemetry
provider.

* Update the telemetry provider in the HCP manager

Initialize the provider without the HCP configs and then update
it in the HCP manager to enable it.

* Improve test assertion, fix method comment

* Move client provider to metrics client

* Stop the manager on setup error

* Add separate lock for http configuration

* Start telemetry provider in HCP manager

* Update HCP client and config as part of Run

* Remove option to set config at initialization

* Simplify and clean up setting HCP configs

* Add test for telemetry provider Run method

* Fix race condition

* Use clone of HTTP headers

* Only allow initial update and run once
2024-01-16 10:46:12 -06:00

323 lines
9.4 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package hcp
import (
"context"
"errors"
"fmt"
"net/http"
"net/url"
"reflect"
"regexp"
"sync"
"time"
"github.com/armon/go-metrics"
"github.com/go-openapi/runtime"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-retryablehttp"
"github.com/hashicorp/consul/agent/hcp/client"
"github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/hcp/telemetry"
"github.com/hashicorp/consul/version"
)
var (
// internalMetricRefreshFailure is a metric to monitor refresh failures.
internalMetricRefreshFailure []string = []string{"hcp", "telemetry_config_provider", "refresh", "failure"}
// internalMetricRefreshSuccess is a metric to monitor refresh successes.
internalMetricRefreshSuccess []string = []string{"hcp", "telemetry_config_provider", "refresh", "success"}
// defaultTelemetryConfigRefreshInterval is a default fallback in case the first HCP fetch fails.
defaultTelemetryConfigRefreshInterval = 1 * time.Minute
)
// Ensure hcpProviderImpl implements telemetry provider interfaces.
var _ telemetry.ConfigProvider = &hcpProviderImpl{}
var _ telemetry.EndpointProvider = &hcpProviderImpl{}
var _ client.MetricsClientProvider = &hcpProviderImpl{}
// hcpProviderImpl holds telemetry configuration and settings for continuous fetch of new config from HCP.
// it updates configuration, if changes are detected.
type hcpProviderImpl struct {
// cfg holds configuration that can be dynamically updated.
cfg *dynamicConfig
// httpCfg holds configuration for the HTTP client
httpCfg *httpCfg
// Reader-writer mutexes are used as the provider is read heavy.
// OTEL components access telemetryConfig during metrics collection and export (read).
// Meanwhile, configs are only updated when there are changes (write).
rw sync.RWMutex
httpCfgRW sync.RWMutex
// running indicates if the HCP telemetry config provider has been started
running bool
// hcpClient is an authenticated client used to make HTTP requests to HCP.
hcpClient client.Client
// logger is the HCP logger for the provider
logger hclog.Logger
// testUpdateConfigCh is used by unit tests to signal when an update config has occurred
testUpdateConfigCh chan struct{}
}
// dynamicConfig is a set of configurable settings for metrics collection, processing and export.
// fields MUST be exported to compute hash for equals method.
type dynamicConfig struct {
disabled bool
endpoint *url.URL
labels map[string]string
filters *regexp.Regexp
// refreshInterval controls the interval at which configuration is fetched from HCP to refresh config.
refreshInterval time.Duration
}
// defaultDisabledCfg disables metric collection and contains default config values.
func defaultDisabledCfg() *dynamicConfig {
return &dynamicConfig{
labels: map[string]string{},
filters: client.DefaultMetricFilters,
refreshInterval: defaultTelemetryConfigRefreshInterval,
endpoint: nil,
disabled: true,
}
}
// httpCfg is a set of configurable settings for the HTTP client used to export metrics
type httpCfg struct {
header *http.Header
client *retryablehttp.Client
}
type HCPProviderCfg struct {
HCPClient client.Client
HCPConfig config.CloudConfigurer
}
// NewHCPProvider initializes and starts a HCP Telemetry provider.
func NewHCPProvider(ctx context.Context) *hcpProviderImpl {
h := &hcpProviderImpl{
// Initialize with default config values.
cfg: defaultDisabledCfg(),
httpCfg: &httpCfg{},
logger: hclog.FromContext(ctx),
}
return h
}
// Run starts a process that continuously checks for updates to the telemetry configuration
// by making a request to HCP. It only starts running if it's not already running.
func (h *hcpProviderImpl) Run(ctx context.Context, c *HCPProviderCfg) error {
if h.isRunning() {
return nil
}
h.rw.Lock()
h.running = true
h.rw.Unlock()
// Update the provider with the HCP configurations
h.hcpClient = c.HCPClient
err := h.updateHTTPConfig(c.HCPConfig)
if err != nil {
return fmt.Errorf("failed to initialize HCP telemetry provider: %v", err)
}
go h.run(ctx)
return nil
}
// run continuously checks for updates to the telemetry configuration by making a request to HCP.
func (h *hcpProviderImpl) run(ctx context.Context) error {
h.logger.Debug("starting telemetry config provider")
// Try to initialize config once before starting periodic fetch.
h.updateConfig(ctx)
ticker := time.NewTicker(h.cfg.refreshInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if newRefreshInterval := h.updateConfig(ctx); newRefreshInterval > 0 {
ticker.Reset(newRefreshInterval)
}
case <-ctx.Done():
return nil
}
}
}
// updateConfig makes a HTTP request to HCP to update metrics configuration held in the provider.
func (h *hcpProviderImpl) updateConfig(ctx context.Context) time.Duration {
logger := h.logger.Named("telemetry_config_provider")
if h.testUpdateConfigCh != nil {
defer func() {
select {
case h.testUpdateConfigCh <- struct{}{}:
default:
}
}()
}
if h.hcpClient == nil || reflect.ValueOf(h.hcpClient).IsNil() {
// Disable metrics if HCP client is not configured
disabledMetricsCfg := defaultDisabledCfg()
h.modifyDynamicCfg(disabledMetricsCfg)
return disabledMetricsCfg.refreshInterval
}
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
logger.Trace("fetching telemetry config")
telemetryCfg, err := h.hcpClient.FetchTelemetryConfig(ctx)
if err != nil {
// Only disable metrics on 404 or 401 to handle the case of an unlinked cluster.
// For other errors such as 5XX ones, we continue metrics collection, as these are potentially transient server-side errors.
apiErr, ok := err.(*runtime.APIError)
if ok && apiErr.IsClientError() {
disabledMetricsCfg := defaultDisabledCfg()
h.modifyDynamicCfg(disabledMetricsCfg)
return disabledMetricsCfg.refreshInterval
}
logger.Error("failed to fetch telemetry config from HCP", "error", err)
metrics.IncrCounter(internalMetricRefreshFailure, 1)
return 0
}
logger.Trace("successfully fetched telemetry config")
// newRefreshInterval of 0 or less can cause ticker Reset() panic.
newRefreshInterval := telemetryCfg.RefreshConfig.RefreshInterval
if newRefreshInterval <= 0 {
logger.Error("invalid refresh interval duration", "refreshInterval", newRefreshInterval)
metrics.IncrCounter(internalMetricRefreshFailure, 1)
return 0
}
newCfg := &dynamicConfig{
filters: telemetryCfg.MetricsConfig.Filters,
endpoint: telemetryCfg.MetricsConfig.Endpoint,
labels: telemetryCfg.MetricsConfig.Labels,
refreshInterval: telemetryCfg.RefreshConfig.RefreshInterval,
disabled: telemetryCfg.MetricsConfig.Disabled,
}
h.modifyDynamicCfg(newCfg)
return newCfg.refreshInterval
}
// modifyDynamicCfg acquires a write lock to update new configuration and emits a success metric.
func (h *hcpProviderImpl) modifyDynamicCfg(newCfg *dynamicConfig) {
h.rw.Lock()
h.cfg = newCfg
h.rw.Unlock()
metrics.IncrCounter(internalMetricRefreshSuccess, 1)
}
// GetEndpoint acquires a read lock to return endpoint configuration for consumers.
func (h *hcpProviderImpl) GetEndpoint() *url.URL {
h.rw.RLock()
defer h.rw.RUnlock()
return h.cfg.endpoint
}
// GetFilters acquires a read lock to return filters configuration for consumers.
func (h *hcpProviderImpl) GetFilters() *regexp.Regexp {
h.rw.RLock()
defer h.rw.RUnlock()
return h.cfg.filters
}
// GetLabels acquires a read lock to return labels configuration for consumers.
func (h *hcpProviderImpl) GetLabels() map[string]string {
h.rw.RLock()
defer h.rw.RUnlock()
return h.cfg.labels
}
// IsDisabled acquires a read lock and return true if metrics are enabled.
func (h *hcpProviderImpl) IsDisabled() bool {
h.rw.RLock()
defer h.rw.RUnlock()
return h.cfg.disabled
}
// updateHTTPConfig updates the HTTP configuration values that rely on the HCP configuration.
func (h *hcpProviderImpl) updateHTTPConfig(cfg config.CloudConfigurer) error {
h.httpCfgRW.Lock()
defer h.httpCfgRW.Unlock()
if cfg == nil {
return errors.New("must provide valid HCP configuration")
}
// Update headers
r, err := cfg.Resource()
if err != nil {
return fmt.Errorf("failed set telemetry client headers: %v", err)
}
header := make(http.Header)
header.Set("content-type", "application/x-protobuf")
header.Set("x-hcp-resource-id", r.String())
header.Set("x-channel", fmt.Sprintf("consul/%s", version.GetHumanVersion()))
h.httpCfg.header = &header
// Update HTTP client
hcpCfg, err := cfg.HCPConfig()
if err != nil {
return fmt.Errorf("failed to configure telemetry HTTP client: %v", err)
}
h.httpCfg.client = client.NewHTTPClient(
hcpCfg.APITLSConfig(),
hcpCfg,
h.logger.Named("hcp_telemetry_client"))
return nil
}
// GetHeader acquires a read lock to return the HTTP request headers needed
// to export metrics.
func (h *hcpProviderImpl) GetHeader() http.Header {
h.httpCfgRW.RLock()
defer h.httpCfgRW.RUnlock()
if h.httpCfg.header == nil {
return nil
}
return h.httpCfg.header.Clone()
}
// GetHTTPClient acquires a read lock to return the retryable HTTP client needed
// to export metrics.
func (h *hcpProviderImpl) GetHTTPClient() *retryablehttp.Client {
h.httpCfgRW.RLock()
defer h.httpCfgRW.RUnlock()
return h.httpCfg.client
}
// isRunning acquires a read lock to return whether the provider is running.
func (h *hcpProviderImpl) isRunning() bool {
h.rw.RLock()
defer h.rw.RUnlock()
return h.running
}