acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls (#9505)

This way we only have to wait for the serf barrier to pass once before
we can upgrade to v2 acls. Without this patch every restart needs to
re-compute the change, and potentially if a stray older node joins after
a migration it might regress back to v1 mode which would be problematic.
This commit is contained in:
R.B. Boyer 2021-01-05 17:04:27 -06:00 committed by GitHub
parent 2d11fc6746
commit 19baf4bc25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 107 additions and 0 deletions

3
.changelog/9505.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls
```

View File

@ -108,6 +108,16 @@ func (s *Server) canUpgradeToNewACLs(isLeader bool) bool {
return false
}
// Check to see if we already upgraded the last time we ran by seeing if we
// have a copy of any global management policy stored locally. This should
// always be true because policies always replicate.
_, mgmtPolicy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
if err != nil {
s.logger.Warn("Failed to get the builtin global-management policy to check for a completed ACL upgrade; skipping this optimization", "error", err)
} else if mgmtPolicy != nil {
return true
}
if !s.InACLDatacenter() {
foundServers, mode, _ := ServersGetACLMode(s, "", s.config.ACLDatacenter)
if mode != structs.ACLModeEnabled || !foundServers {

View File

@ -10,6 +10,7 @@ import (
"time"
"github.com/hashicorp/consul/agent/structs"
tokenStore "github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
@ -1272,6 +1273,99 @@ func TestLeader_ACLUpgrade(t *testing.T) {
})
}
func TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
// We test this by having two datacenters with one server each. They
// initially come up and complete the migration, then we power them both
// off. We leave the primary off permanently, and then we stand up the
// secondary. Hopefully it should transition to ENABLED instead of being
// stuck in LEGACY.
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
c.ACLDatacenter = "dc1"
c.ACLsEnabled = true
c.ACLMasterToken = "root"
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
waitForLeaderEstablishment(t, s1)
dir2, s2 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc2"
c.ACLDatacenter = "dc1"
c.ACLsEnabled = true
c.ACLTokenReplication = false
c.ACLReplicationRate = 100
c.ACLReplicationBurst = 100
c.ACLReplicationApplyLimit = 1000000
})
defer os.RemoveAll(dir2)
defer s2.Shutdown()
codec2 := rpcClient(t, s2)
defer codec2.Close()
s2.tokens.UpdateReplicationToken("root", tokenStore.TokenSourceConfig)
testrpc.WaitForLeader(t, s2.RPC, "dc2")
waitForLeaderEstablishment(t, s2)
// Create the WAN link
joinWAN(t, s2, s1)
waitForLeaderEstablishment(t, s1)
waitForLeaderEstablishment(t, s2)
waitForNewACLs(t, s1)
waitForNewACLs(t, s2)
waitForNewACLReplication(t, s2, structs.ACLReplicatePolicies, 1, 0, 0)
// Everybody has the management policy.
retry.Run(t, func(r *retry.R) {
_, policy1, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
require.NoError(r, err)
require.NotNil(r, policy1)
_, policy2, err := s2.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
require.NoError(r, err)
require.NotNil(r, policy2)
})
// Shutdown s1 and s2.
s1.Shutdown()
s2.Shutdown()
// Restart just s2
dir2new, s2new := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc2"
c.ACLDatacenter = "dc1"
c.ACLsEnabled = true
c.ACLTokenReplication = false
c.ACLReplicationRate = 100
c.ACLReplicationBurst = 100
c.ACLReplicationApplyLimit = 1000000
c.DataDir = s2.config.DataDir
c.NodeName = s2.config.NodeName
c.NodeID = s2.config.NodeID
})
defer os.RemoveAll(dir2new)
defer s2new.Shutdown()
waitForLeaderEstablishment(t, s2new)
// It should be able to transition without connectivity to the primary.
waitForNewACLs(t, s2new)
}
func TestLeader_ConfigEntryBootstrap(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")