diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index 02caa4516b..c53b88730d 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -314,13 +314,40 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index return fmt.Errorf("primary datacenter does not have an active root CA for Connect") } + newIntermediate := false + // Get a signed intermediate from the primary DC if the provider + // hasn't been initialized yet or if the primary's root has changed. + if activeIntermediate == "" || storedRootID != roots.ActiveRootID { + csr, err := provider.GenerateIntermediateCSR() + if err != nil { + return err + } + + var intermediatePEM string + if err := s.forwardDC("ConnectCA.SignIntermediate", s.config.PrimaryDatacenter, s.generateCASignRequest(csr), &intermediatePEM); err != nil { + // this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership + s.logger.Printf("[WARN] connect: Primary datacenter refused to sign our intermediate CA certificate: %v", err) + return nil + } + + if err := provider.SetIntermediate(intermediatePEM, newActiveRoot.RootCert); err != nil { + return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err) + } + + // Append the new intermediate to our local active root entry. + newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM) + newIntermediate = true + + s.logger.Printf("[INFO] connect: received new intermediate certificate from primary datacenter") + } + // Update the roots list in the state store if there's a new active root. 
state := s.fsm.State() _, activeRoot, err := state.CARootActive(nil) if err != nil { return err } - if activeRoot == nil || activeRoot.ID != newActiveRoot.ID { + if activeRoot == nil || activeRoot.ID != newActiveRoot.ID || newIntermediate { idx, oldRoots, err := state.CARoots(nil) if err != nil { return err @@ -372,31 +399,6 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index s.logger.Printf("[INFO] connect: updated root certificates from primary datacenter") } - // Get a signed intermediate from the primary DC if the provider - // hasn't been initialized yet or if the primary's root has changed. - if activeIntermediate == "" || storedRootID != roots.ActiveRootID { - csr, err := provider.GenerateIntermediateCSR() - if err != nil { - return err - } - - var intermediatePEM string - if err := s.forwardDC("ConnectCA.SignIntermediate", s.config.PrimaryDatacenter, s.generateCASignRequest(csr), &intermediatePEM); err != nil { - // this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership - s.logger.Printf("[WARN] connect: Primary datacenter refused to sign our intermediate CA certificate: %v", err) - return nil - } - - if err := provider.SetIntermediate(intermediatePEM, newActiveRoot.RootCert); err != nil { - return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err) - } - - // Append the new intermediate to our local active root entry. 
- newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM) - - s.logger.Printf("[INFO] connect: received new intermediate certificate from primary datacenter") - } - s.setCAProvider(provider, newActiveRoot) return nil } diff --git a/agent/consul/leader_connect_test.go b/agent/consul/leader_connect_test.go index 42a54fe454..3a30c7c1a0 100644 --- a/agent/consul/leader_connect_test.go +++ b/agent/consul/leader_connect_test.go @@ -78,6 +78,8 @@ func TestLeader_SecondaryCA_Initialize(t *testing.T) { require.Equal(roots1[0].RootCert, roots2[0].RootCert) require.Equal(1, len(roots1)) require.Equal(len(roots1), len(roots2)) + require.Empty(roots1[0].IntermediateCerts) + require.NotEmpty(roots2[0].IntermediateCerts) // Have secondary sign a leaf cert and make sure the chain is correct. spiffeService := &connect.SpiffeIDService{ diff --git a/agent/local/state_test.go b/agent/local/state_test.go index c7090ccdb2..31640fd343 100644 --- a/agent/local/state_test.go +++ b/agent/local/state_test.go @@ -1417,21 +1417,34 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { // Update the check output! Should be deferred a.State.UpdateCheck("web", api.HealthPassing, "output") - // Should not update for 500 milliseconds - time.Sleep(250 * time.Millisecond) - if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { - t.Fatalf("err: %v", err) + // We are going to wait up to 850ms for the deferred check update to run. The update + // can happen any time within: check_update_interval / 2 + random(min: 0, max: check_update_interval) + // For this test that means it will get deferred for 250ms - 750ms. We add up to 100ms on top of that to + // account for potentially slow tests on an overloaded system. 
+ timer := &retry.Timer{Timeout: 850 * time.Millisecond, Wait: 50 * time.Millisecond} + start := time.Now() + retry.RunWith(timer, t, func(r *retry.R) { + cs := a.State.CheckState("web") + if cs == nil { + r.Fatalf("check is not registered") + } + + if cs.DeferCheck != nil { + r.Fatalf("Deferred Check timeout not removed yet") + } + }) + elapsed := time.Since(start) + + // ensure the check deferral didn't update too fast + if elapsed < 240*time.Millisecond { + t.Fatalf("early update: elapsed %v\n\n%+v", elapsed, checks) } - // Verify not updated - for _, chk := range checks.HealthChecks { - switch chk.CheckID { - case "web": - if chk.Output != "" { - t.Fatalf("early update: %v", chk) - } - } + // ensure the check deferral didn't update too late + if elapsed > 850*time.Millisecond { + t.Fatalf("late update: elapsed: %v\n\n%+v", elapsed, checks) } + // Wait for a deferred update. TODO (slackpad) This isn't a great test // because we might be stuck in the random stagger from the full sync // after the leader election (~3 seconds) so it's easy to exceed the @@ -1441,10 +1454,14 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { // good news is that the later update below should be well past the full // sync so we are getting some coverage. We should rethink this a bit and // rework the deferred update stuff to be more testable. - timer := &retry.Timer{Timeout: 6 * time.Second, Wait: 100 * time.Millisecond} + // + // TODO - figure out why after the deferred check calls TriggerSyncChanges that this + // takes so long to happen. I have seen it take upwards of 1.5s before the check gets + // synced. + timer = &retry.Timer{Timeout: 6 * time.Second, Wait: 100 * time.Millisecond} retry.RunWith(timer, t, func(r *retry.R) { if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil { - r.Fatal(err) + r.Fatalf("err: %v", err) } // Verify updated