Use agent token for service/check deregistration during anti-entropy (#16097)

Use only the agent token for deregistration during anti-entropy

The previous behavior had the agent attempt to use the "service" token
(i.e. from the `token` field in a service definition file), and if that
was not set then it would use the agent token.

The previous behavior was problematic because, if the service token had
been deleted, the deregistration request would fail. The agent would
retry the deregistration during each anti-entropy sync, and the
situation would never resolve.

The new behavior is to only/always use the agent token for service and
check deregistration during anti-entropy. This approach is:

* Simpler: No fallback logic to try different tokens
* Faster (slightly): No time spent attempting the service token
* Correct: The agent token is able to deregister services on that
  agent's node, because:
  * node:write permissions allow deregistration of services/checks on
    that node.
  * The agent token must have node:write permission, or else the agent
    is not be able to (de)register itself into the catalog

Co-authored-by: Vesa Hagström <weeezes@gmail.com>
This commit is contained in:
Paul Glass 2023-02-03 08:45:11 -06:00 committed by GitHub
parent e40b731a52
commit a884d0d7c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 9 deletions

3
.changelog/16097.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug-fix
agent: Only use the `agent` token for internal deregistration of checks and services during anti-entropy syncing. The service token specified in the `token` field of the check or service definition is no longer used for deregistration. This fixes an issue where the agent failed to ever deregister a service or check if the service token was deleted.
```

View File

@ -1293,7 +1293,12 @@ func (l *State) deleteService(key structs.ServiceID) error {
return fmt.Errorf("ServiceID missing")
}
st := l.aclTokenForServiceSync(key, l.tokens.AgentToken)
// Always use the agent token to delete without trying the service token.
// This works because the agent token really must have node:write
// permission and node:write allows deregistration of services/checks on
// that node. Because the service token may have been deleted, using the
// agent token without fallback logic is a bit faster, simpler, and safer.
st := l.tokens.AgentToken()
req := structs.DeregisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
@ -1344,7 +1349,9 @@ func (l *State) deleteCheck(key structs.CheckID) error {
return fmt.Errorf("CheckID missing")
}
ct := l.aclTokenForCheckSync(key, l.tokens.AgentToken)
// Always use the agent token for deletion. Refer to deleteService() for
// an explanation.
ct := l.tokens.AgentToken()
req := structs.DeregisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,

View File

@ -829,10 +829,6 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) {
}
var testRegisterRules = `
node "" {
policy = "write"
}
service "api" {
policy = "write"
}
@ -863,6 +859,9 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) {
defer a.Shutdown()
testrpc.WaitForLeader(t, a.RPC, "dc1")
// The agent token is the only token used for deleteService.
setAgentToken(t, a)
token := createToken(t, a, testRegisterRules)
// Create service (disallowed)
@ -1153,15 +1152,19 @@ type RPC interface {
func createToken(t *testing.T, rpc RPC, policyRules string) string {
t.Helper()
uniqueId, err := uuid.GenerateUUID()
require.NoError(t, err)
policyName := "the-policy-" + uniqueId
reqPolicy := structs.ACLPolicySetRequest{
Datacenter: "dc1",
Policy: structs.ACLPolicy{
Name: "the-policy",
Name: policyName,
Rules: policyRules,
},
WriteRequest: structs.WriteRequest{Token: "root"},
}
err := rpc.RPC(context.Background(), "ACL.PolicySet", &reqPolicy, &structs.ACLPolicy{})
err = rpc.RPC(context.Background(), "ACL.PolicySet", &reqPolicy, &structs.ACLPolicy{})
require.NoError(t, err)
token, err := uuid.GenerateUUID()
@ -1171,7 +1174,7 @@ func createToken(t *testing.T, rpc RPC, policyRules string) string {
Datacenter: "dc1",
ACLToken: structs.ACLToken{
SecretID: token,
Policies: []structs.ACLTokenPolicyLink{{Name: "the-policy"}},
Policies: []structs.ACLTokenPolicyLink{{Name: policyName}},
},
WriteRequest: structs.WriteRequest{Token: "root"},
}
@ -1180,6 +1183,27 @@ func createToken(t *testing.T, rpc RPC, policyRules string) string {
return token
}
// setAgentToken sets the 'agent' token for this agent. It creates a new token
// with node:write for the agent's node name, and service:write for any
// service.
func setAgentToken(t *testing.T, a *agent.TestAgent) {
var policy = fmt.Sprintf(`
node "%s" {
policy = "write"
}
service_prefix "" {
policy = "read"
}
`, a.Config.NodeName)
token := createToken(t, a, policy)
_, err := a.Client().Agent().UpdateAgentACLToken(token, &api.WriteOptions{Token: "root"})
if err != nil {
t.Fatalf("setting agent token: %v", err)
}
}
func TestAgentAntiEntropy_Checks(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
@ -1481,6 +1505,9 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) {
testrpc.WaitForLeader(t, a.RPC, dc)
// The agent token is the only token used for deleteCheck.
setAgentToken(t, a)
token := createToken(t, a, testRegisterRules)
// Create services using the root token