mirror of https://github.com/status-im/consul.git
243 lines
6.8 KiB
Go
243 lines
6.8 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package sprawl
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/hashicorp/consul/api"
|
|
"github.com/hashicorp/go-hclog"
|
|
|
|
"github.com/hashicorp/consul/testing/deployer/topology"
|
|
)
|
|
|
|
// TODO: this is definitely a grpc resolver/balancer issue to look into
|
|
const grpcWeirdError = `transport: Error while dialing failed to find Consul server for global address`
|
|
|
|
func isWeirdGRPCError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
return strings.Contains(err.Error(), grpcWeirdError)
|
|
}
|
|
|
|
func (s *Sprawl) initPeerings() error {
|
|
// TODO: wait until services are healthy? wait until mesh gateways work?
|
|
// if err := s.generator.Generate(tfgen.StepPeering); err != nil {
|
|
// return fmt.Errorf("generator[peering]: %w", err)
|
|
// }
|
|
|
|
var (
|
|
logger = s.logger.Named("peering")
|
|
_ = logger
|
|
)
|
|
|
|
for _, peering := range s.topology.Peerings {
|
|
dialingCluster, ok := s.topology.Clusters[peering.Dialing.Name]
|
|
if !ok {
|
|
return fmt.Errorf("peering references dialing cluster that does not exist: %s", peering.String())
|
|
}
|
|
acceptingCluster, ok := s.topology.Clusters[peering.Accepting.Name]
|
|
if !ok {
|
|
return fmt.Errorf("peering references accepting cluster that does not exist: %s", peering.String())
|
|
}
|
|
|
|
var (
|
|
dialingClient = s.clients[dialingCluster.Name]
|
|
acceptingClient = s.clients[acceptingCluster.Name]
|
|
)
|
|
|
|
// TODO: allow for use of ServerExternalAddresses
|
|
|
|
req1 := api.PeeringGenerateTokenRequest{
|
|
PeerName: peering.Accepting.PeerName,
|
|
}
|
|
if acceptingCluster.Enterprise {
|
|
req1.Partition = peering.Accepting.Partition
|
|
}
|
|
|
|
s.awaitMeshGateways()
|
|
|
|
GENTOKEN:
|
|
resp, _, err := acceptingClient.Peerings().GenerateToken(context.Background(), req1, nil)
|
|
if err != nil {
|
|
if isWeirdGRPCError(err) {
|
|
time.Sleep(50 * time.Millisecond)
|
|
goto GENTOKEN
|
|
}
|
|
return fmt.Errorf("error generating peering token for %q: %w", peering.String(), err)
|
|
}
|
|
|
|
peeringToken := resp.PeeringToken
|
|
logger.Debug("generated peering token", "peering", peering.String())
|
|
|
|
req2 := api.PeeringEstablishRequest{
|
|
PeerName: peering.Dialing.PeerName,
|
|
PeeringToken: peeringToken,
|
|
}
|
|
if dialingCluster.Enterprise {
|
|
req2.Partition = peering.Dialing.Partition
|
|
}
|
|
|
|
logger.Info("registering peering with token", "peering", peering.String())
|
|
ESTABLISH:
|
|
_, _, err = dialingClient.Peerings().Establish(context.Background(), req2, nil)
|
|
if err != nil {
|
|
if isWeirdGRPCError(err) {
|
|
time.Sleep(50 * time.Millisecond)
|
|
goto ESTABLISH
|
|
}
|
|
// Establish and friends return an api.StatusError value, not pointer
|
|
// not sure if this is weird
|
|
var asStatusError api.StatusError
|
|
if errors.As(err, &asStatusError) && asStatusError.Code == http.StatusGatewayTimeout {
|
|
time.Sleep(50 * time.Millisecond)
|
|
goto ESTABLISH
|
|
}
|
|
return fmt.Errorf("error establishing peering with token for %q: %#v", peering.String(), err)
|
|
}
|
|
|
|
logger.Info("peering registered", "peering", peering.String())
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s *Sprawl) waitForPeeringEstablishment() error {
|
|
s.awaitMeshGateways()
|
|
var (
|
|
logger = s.logger.Named("peering")
|
|
)
|
|
logger.Info("awaiting peering establishment")
|
|
startTimeTotal := time.Now()
|
|
|
|
for _, peering := range s.topology.Peerings {
|
|
dialingCluster, ok := s.topology.Clusters[peering.Dialing.Name]
|
|
if !ok {
|
|
return fmt.Errorf("peering references dialing cluster that does not exist: %s", peering.String())
|
|
}
|
|
acceptingCluster, ok := s.topology.Clusters[peering.Accepting.Name]
|
|
if !ok {
|
|
return fmt.Errorf("peering references accepting cluster that does not exist: %s", peering.String())
|
|
}
|
|
|
|
var (
|
|
dialingClient = s.clients[dialingCluster.Name]
|
|
acceptingClient = s.clients[acceptingCluster.Name]
|
|
|
|
dialingLogger = logger.With(
|
|
"cluster", dialingCluster.Name,
|
|
"peering", peering.String(),
|
|
)
|
|
acceptingLogger = logger.With(
|
|
"cluster", acceptingCluster.Name,
|
|
"peering", peering.String(),
|
|
)
|
|
)
|
|
|
|
s.checkPeeringDirection(dialingLogger, dialingClient, peering.Dialing, dialingCluster.Enterprise)
|
|
s.checkPeeringDirection(acceptingLogger, acceptingClient, peering.Accepting, acceptingCluster.Enterprise)
|
|
}
|
|
logger.Info("peering established", "dur", time.Since(startTimeTotal).Round(time.Second))
|
|
return nil
|
|
}
|
|
|
|
func (s *Sprawl) checkPeeringDirection(logger hclog.Logger, client *api.Client, pc topology.PeerCluster, enterprise bool) {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
|
|
startTime := time.Now()
|
|
|
|
for {
|
|
opts := &api.QueryOptions{}
|
|
logger2 := logger.With("dur", time.Since(startTime).Round(time.Second))
|
|
if enterprise {
|
|
opts.Partition = pc.Partition
|
|
}
|
|
res, _, err := client.Peerings().Read(ctx, pc.PeerName, opts)
|
|
if isWeirdGRPCError(err) {
|
|
time.Sleep(50 * time.Millisecond)
|
|
continue
|
|
}
|
|
if err != nil {
|
|
logger2.Debug("error looking up peering", "error", err)
|
|
time.Sleep(100 * time.Millisecond)
|
|
continue
|
|
}
|
|
if res == nil {
|
|
logger2.Debug("peering not found")
|
|
time.Sleep(100 * time.Millisecond)
|
|
continue
|
|
}
|
|
|
|
if res.State == api.PeeringStateActive {
|
|
break
|
|
}
|
|
logger2.Debug("peering not active yet", "state", res.State)
|
|
time.Sleep(500 * time.Millisecond)
|
|
}
|
|
logger.Debug("peering is active", "dur", time.Since(startTime).Round(time.Second))
|
|
}
|
|
|
|
func (s *Sprawl) awaitMeshGateways() {
|
|
startTime := time.Now()
|
|
s.logger.Info("awaiting mesh gateways")
|
|
// TODO: maybe a better way to do this
|
|
mgws := []*topology.Workload{}
|
|
for _, clu := range s.topology.Clusters {
|
|
for _, node := range clu.Nodes {
|
|
for _, wrk := range node.Workloads {
|
|
if wrk.IsMeshGateway {
|
|
mgws = append(mgws, wrk)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO: parallel
|
|
for _, mgw := range mgws {
|
|
cl := s.clients[mgw.Node.Cluster]
|
|
logger := s.logger.With("cluster", mgw.Node.Cluster, "sid", mgw.ID, "nid", mgw.Node.ID())
|
|
logger.Info("awaiting MGW readiness")
|
|
RETRY:
|
|
// TODO: not sure if there's a better way to check if the MGW is ready
|
|
svcs, _, err := cl.Catalog().Service(mgw.ID.Name, "", mgw.ID.QueryOptions())
|
|
if err != nil {
|
|
logger.Debug("fetching MGW service", "err", err)
|
|
time.Sleep(time.Second)
|
|
goto RETRY
|
|
}
|
|
if len(svcs) < 1 {
|
|
logger.Debug("no MGW service in catalog yet")
|
|
time.Sleep(time.Second)
|
|
goto RETRY
|
|
}
|
|
if len(svcs) > 1 {
|
|
// not sure when this would happen
|
|
log.Fatalf("expected 1 MGW service, actually: %#v", svcs)
|
|
}
|
|
|
|
entries, _, err := cl.Health().Service(mgw.ID.Name, "", true, mgw.ID.QueryOptions())
|
|
if err != nil {
|
|
logger.Debug("fetching MGW checks", "err", err)
|
|
time.Sleep(time.Second)
|
|
goto RETRY
|
|
}
|
|
if len(entries) != 1 {
|
|
logger.Debug("expected 1 MGW entry", "entries", entries)
|
|
time.Sleep(time.Second)
|
|
goto RETRY
|
|
}
|
|
|
|
logger.Debug("MGW ready", "entry", *(entries[0]), "dur", time.Since(startTime).Round(time.Second))
|
|
}
|
|
s.logger.Info("mesh gateways ready", "dur", time.Since(startTime).Round(time.Second))
|
|
}
|