mirror of
https://github.com/logos-storage/logos-storage-nim-cs-dist-tests.git
synced 2026-06-01 05:59:26 +00:00
refactor: replace scheduling affinity with explicit node pool label selection
Replace the indirect `SetSchedulingAffinity(notIn: "false")` / `allow-tests-pods` mechanism with `ScheduleInPoolsWithLabel(key, value)` and `AddToleration(key, value, effect)` in `ContainerRecipeFactory`.

This is much more readable from an API perspective. `SetSchedulingAffinity(notIn: "false")` was a double negative (hard to reason about), and it was not clear that it was meant to schedule pods on pools labelled `allow-tests-pods=true`.

Previously, pods were steered to the spot node pool via a node affinity exclusion on a boolean label (`allow-tests-pods NotIn ["false"]`), and the spot taint toleration was added implicitly by using the `system-node-critical` priority class. The priority class was removed earlier because it caused a ResourceQuota admission error in GCP; removing it also removed the implicit toleration, which silently broke spot node scheduling.

The new API is explicit: recipes call `ScheduleInPoolsWithLabel` to set a nodeSelector label that targets the intended pool, and `AddToleration` to tolerate any taints the pool carries. Tolerations are set at the recipe level so that a recipe can be moved back to Digital Ocean if needed, simply by removing the toleration it no longer needs. All four recipes (storage, prometheus, discord bot, rewarder bot) now call both.

Cleanup applied alongside:

- `PodToleration` converted to a record for structural equality and simpler deduplication
- `ExposedPorts`, `InternalPorts`, `EnvVars`, `Volumes` on `ContainerRecipe` changed to `IReadOnlyList<T>` for consistent immutable typing
- `SetCriticalPriority` property renamed to `IsCriticalPriority`
- `GetPriorityClassName` now declares a `string?` return type and returns plain `null` instead of declaring `string` and returning `null!`
- `Reset()` extracted in `ContainerRecipeFactory` to consolidate post-create state reset
- Fixed bug: `nodePoolLabels` and `tolerations` were passed by reference and then cleared, leaving the recipe with empty collections; they are now snapshotted before clearing
- `SchedulingAffinity.cs` deleted (no remaining callers)
This commit is contained in:
parent
8d83c7e66c
commit
3c58ee3777
@ -371,8 +371,8 @@ namespace KubernetesWorkflow
|
||||
Spec = new V1PodSpec
|
||||
{
|
||||
PriorityClassName = GetPriorityClassName(containerRecipes),
|
||||
Affinity = CreatePodAffinity(containerRecipes),
|
||||
NodeSelector = CreateNodeSelector(location),
|
||||
NodeSelector = CreateNodeSelector(location, containerRecipes),
|
||||
Tolerations = CreateTolerations(containerRecipes),
|
||||
Containers = CreateDeploymentContainers(containerRecipes),
|
||||
Volumes = CreateVolumes(containerRecipes)
|
||||
}
|
||||
@ -392,51 +392,32 @@ namespace KubernetesWorkflow
|
||||
WaitUntilDeploymentOffline(deployment.Name);
|
||||
}
|
||||
|
||||
private IDictionary<string, string> CreateNodeSelector(ILocation location)
|
||||
private IDictionary<string, string> CreateNodeSelector(ILocation location, ContainerRecipe[] recipes)
|
||||
{
|
||||
var nodeLabel = GetNodeLabelForLocation(location);
|
||||
if (nodeLabel == null) return new Dictionary<string, string>();
|
||||
var result = new Dictionary<string, string>();
|
||||
|
||||
return new Dictionary<string, string>
|
||||
{
|
||||
{ nodeLabel.Key, nodeLabel.Value }
|
||||
};
|
||||
var nodeLabel = GetNodeLabelForLocation(location);
|
||||
if (nodeLabel != null) result[nodeLabel.Key] = nodeLabel.Value;
|
||||
|
||||
foreach (var recipe in recipes)
|
||||
foreach (var kvp in recipe.NodePoolLabels)
|
||||
result[kvp.Key] = kvp.Value;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private V1Affinity? CreatePodAffinity(ContainerRecipe[] recipes)
|
||||
private IList<V1Toleration>? CreateTolerations(ContainerRecipe[] recipes)
|
||||
{
|
||||
var notIns = recipes
|
||||
.Select(r => r.SchedulingAffinity.NotIn)
|
||||
.Where(n => !string.IsNullOrEmpty(n))
|
||||
.Distinct()
|
||||
.ToList();
|
||||
var distinct = recipes.SelectMany(r => r.Tolerations).Distinct().ToList();
|
||||
if (!distinct.Any()) return null;
|
||||
|
||||
if (!notIns.Any()) return null;
|
||||
|
||||
return new V1Affinity
|
||||
return distinct.Select(t => new V1Toleration
|
||||
{
|
||||
NodeAffinity = new V1NodeAffinity
|
||||
{
|
||||
RequiredDuringSchedulingIgnoredDuringExecution = new V1NodeSelector
|
||||
{
|
||||
NodeSelectorTerms = new List<V1NodeSelectorTerm>
|
||||
{
|
||||
new V1NodeSelectorTerm
|
||||
{
|
||||
MatchExpressions = new List<V1NodeSelectorRequirement>
|
||||
{
|
||||
new V1NodeSelectorRequirement
|
||||
{
|
||||
Key = "allow-tests-pods",
|
||||
OperatorProperty = "NotIn",
|
||||
Values = notIns
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
Key = t.Key,
|
||||
OperatorProperty = "Equal",
|
||||
Value = t.Value,
|
||||
Effect = t.Effect
|
||||
}).ToList();
|
||||
}
|
||||
|
||||
private K8sNodeLabel? GetNodeLabelForLocation(ILocation location)
|
||||
@ -445,13 +426,10 @@ namespace KubernetesWorkflow
|
||||
return l.NodeLabel;
|
||||
}
|
||||
|
||||
private string GetPriorityClassName(ContainerRecipe[] containerRecipes)
|
||||
private string? GetPriorityClassName(ContainerRecipe[] containerRecipes)
|
||||
{
|
||||
if (containerRecipes.Any(c => c.SetCriticalPriority))
|
||||
{
|
||||
return "system-node-critical";
|
||||
}
|
||||
return null!;
|
||||
if (containerRecipes.Any(c => c.IsCriticalPriority)) return "system-node-critical";
|
||||
return null;
|
||||
}
|
||||
|
||||
private IDictionary<string, string> GetSelector(ContainerRecipe[] containerRecipes)
|
||||
@ -759,7 +737,7 @@ namespace KubernetesWorkflow
|
||||
|
||||
private RunningService? CreateInternalService(ContainerRecipe[] recipes)
|
||||
{
|
||||
return CreateService(recipes, r => r.InternalPorts.Concat(r.ExposedPorts).ToArray(), "ClusterIP", "int", false);
|
||||
return CreateService(recipes, r => r.InternalPorts.Concat(r.ExposedPorts), "ClusterIP", "int", false);
|
||||
}
|
||||
|
||||
private RunningService? CreateExternalService(ContainerRecipe[] recipes)
|
||||
@ -767,7 +745,7 @@ namespace KubernetesWorkflow
|
||||
return CreateService(recipes, r => r.ExposedPorts, "NodePort", "ext", true);
|
||||
}
|
||||
|
||||
private RunningService? CreateService(ContainerRecipe[] recipes, Func<ContainerRecipe, Port[]> portSelector, string serviceType, string namePostfix, bool isNodePort)
|
||||
private RunningService? CreateService(ContainerRecipe[] recipes, Func<ContainerRecipe, IEnumerable<Port>> portSelector, string serviceType, string namePostfix, bool isNodePort)
|
||||
{
|
||||
var ports = CreateServicePorts(recipes, portSelector, isNodePort);
|
||||
if (!ports.Any()) return null;
|
||||
@ -843,7 +821,7 @@ namespace KubernetesWorkflow
|
||||
};
|
||||
}
|
||||
|
||||
private List<V1ServicePort> CreateServicePorts(ContainerRecipe[] recipes, Func<ContainerRecipe, Port[]> portSelector, bool isNodePort)
|
||||
private List<V1ServicePort> CreateServicePorts(ContainerRecipe[] recipes, Func<ContainerRecipe, IEnumerable<Port>> portSelector, bool isNodePort)
|
||||
{
|
||||
var result = new List<V1ServicePort>();
|
||||
foreach (var recipe in recipes)
|
||||
|
||||
@ -2,16 +2,17 @@
|
||||
{
|
||||
public class ContainerRecipe
|
||||
{
|
||||
public ContainerRecipe(DateTime recipeCreatedUtc, int number, string? nameOverride, string image, ContainerResources resources, SchedulingAffinity schedulingAffinity, CommandOverride commandOverride, bool setCriticalPriority, Port[] exposedPorts, Port[] internalPorts, EnvVar[] envVars, PodLabels podLabels, PodAnnotations podAnnotations, VolumeMount[] volumes, ContainerAdditionals additionals)
|
||||
public ContainerRecipe(DateTime recipeCreatedUtc, int number, string? nameOverride, string image, ContainerResources resources, IReadOnlyDictionary<string, string> nodePoolLabels, IReadOnlyList<PodToleration> tolerations, CommandOverride commandOverride, bool isCriticalPriority, IReadOnlyList<Port> exposedPorts, IReadOnlyList<Port> internalPorts, IReadOnlyList<EnvVar> envVars, PodLabels podLabels, PodAnnotations podAnnotations, IReadOnlyList<VolumeMount> volumes, ContainerAdditionals additionals)
|
||||
{
|
||||
RecipeCreatedUtc = recipeCreatedUtc;
|
||||
Number = number;
|
||||
NameOverride = nameOverride;
|
||||
Image = image;
|
||||
Resources = resources;
|
||||
SchedulingAffinity = schedulingAffinity;
|
||||
NodePoolLabels = nodePoolLabels;
|
||||
Tolerations = tolerations;
|
||||
CommandOverride = commandOverride;
|
||||
SetCriticalPriority = setCriticalPriority;
|
||||
IsCriticalPriority = isCriticalPriority;
|
||||
ExposedPorts = exposedPorts;
|
||||
InternalPorts = internalPorts;
|
||||
EnvVars = envVars;
|
||||
@ -37,16 +38,17 @@
|
||||
public int Number { get; }
|
||||
public string? NameOverride { get; }
|
||||
public ContainerResources Resources { get; }
|
||||
public SchedulingAffinity SchedulingAffinity { get; }
|
||||
public IReadOnlyDictionary<string, string> NodePoolLabels { get; }
|
||||
public IReadOnlyList<PodToleration> Tolerations { get; }
|
||||
public CommandOverride CommandOverride { get; }
|
||||
public bool SetCriticalPriority { get; }
|
||||
public bool IsCriticalPriority { get; }
|
||||
public string Image { get; }
|
||||
public Port[] ExposedPorts { get; }
|
||||
public Port[] InternalPorts { get; }
|
||||
public EnvVar[] EnvVars { get; }
|
||||
public IReadOnlyList<Port> ExposedPorts { get; }
|
||||
public IReadOnlyList<Port> InternalPorts { get; }
|
||||
public IReadOnlyList<EnvVar> EnvVars { get; }
|
||||
public PodLabels PodLabels { get; }
|
||||
public PodAnnotations PodAnnotations { get; }
|
||||
public VolumeMount[] Volumes { get; }
|
||||
public IReadOnlyList<VolumeMount> Volumes { get; }
|
||||
public ContainerAdditionals Additionals { get; }
|
||||
|
||||
public Port? GetPortByTag(string tag)
|
||||
@ -61,7 +63,8 @@
|
||||
$"internalPorts: {string.Join(",", InternalPorts.Select(p => p.Number))}, " +
|
||||
$"envVars: {string.Join(",", EnvVars.Select(v => v.ToString()))}, " +
|
||||
$"limits: {Resources}, " +
|
||||
$"affinity: {SchedulingAffinity}, " +
|
||||
$"nodePoolLabels: [{string.Join(",", NodePoolLabels.Select(kvp => $"{kvp.Key}={kvp.Value}"))}], " +
|
||||
$"tolerations: [{string.Join(",", Tolerations.Select(t => $"{t.Key}={t.Value}:{t.Effect}"))}], " +
|
||||
$"volumes: {string.Join(",", Volumes.Select(v => $"'{v.MountPath}'"))}";
|
||||
}
|
||||
}
|
||||
@ -107,6 +110,8 @@
|
||||
UDP
|
||||
}
|
||||
|
||||
public record PodToleration(string Key, string Value, string Effect);
|
||||
|
||||
public class EnvVar
|
||||
{
|
||||
public EnvVar(string name, string value)
|
||||
|
||||
@ -14,7 +14,8 @@ namespace KubernetesWorkflow.Recipe
|
||||
private readonly List<object> additionals = new List<object>();
|
||||
private RecipeComponentFactory factory = null!;
|
||||
private ContainerResources resources = new ContainerResources();
|
||||
private SchedulingAffinity schedulingAffinity = new SchedulingAffinity();
|
||||
private readonly Dictionary<string, string> nodePoolLabels = new Dictionary<string, string>();
|
||||
private readonly List<PodToleration> tolerations = new List<PodToleration>();
|
||||
private CommandOverride commandOverride = new CommandOverride();
|
||||
private bool setCriticalPriority;
|
||||
|
||||
@ -26,7 +27,10 @@ namespace KubernetesWorkflow.Recipe
|
||||
|
||||
Initialize(config);
|
||||
|
||||
var recipe = new ContainerRecipe(DateTime.UtcNow, containerNumber, config.NameOverride, Image, resources, schedulingAffinity, commandOverride, setCriticalPriority,
|
||||
var recipe = new ContainerRecipe(DateTime.UtcNow, containerNumber, config.NameOverride, Image, resources,
|
||||
new Dictionary<string, string>(nodePoolLabels),
|
||||
tolerations.ToArray(),
|
||||
commandOverride, setCriticalPriority,
|
||||
exposedPorts.ToArray(),
|
||||
internalPorts.ToArray(),
|
||||
envVars.ToArray(),
|
||||
@ -35,18 +39,7 @@ namespace KubernetesWorkflow.Recipe
|
||||
volumeMounts.ToArray(),
|
||||
ContainerAdditionals.CreateFromUserData(additionals));
|
||||
|
||||
exposedPorts.Clear();
|
||||
internalPorts.Clear();
|
||||
envVars.Clear();
|
||||
podLabels.Clear();
|
||||
podAnnotations.Clear();
|
||||
volumeMounts.Clear();
|
||||
additionals.Clear();
|
||||
this.factory = null!;
|
||||
resources = new ContainerResources();
|
||||
schedulingAffinity = new SchedulingAffinity();
|
||||
commandOverride = new CommandOverride();
|
||||
setCriticalPriority = false;
|
||||
Reset();
|
||||
|
||||
return recipe;
|
||||
}
|
||||
@ -133,9 +126,14 @@ namespace KubernetesWorkflow.Recipe
|
||||
SetResourcesRequest(new ContainerResourceSet(milliCPUs, memory));
|
||||
}
|
||||
|
||||
protected void SetSchedulingAffinity(string notIn)
|
||||
protected void ScheduleInPoolsWithLabel(string key, string value)
|
||||
{
|
||||
schedulingAffinity = new SchedulingAffinity(notIn);
|
||||
nodePoolLabels[key] = value;
|
||||
}
|
||||
|
||||
protected void AddToleration(string key, string value, string effect)
|
||||
{
|
||||
tolerations.Add(new PodToleration(key, value, effect));
|
||||
}
|
||||
|
||||
protected void OverrideCommand(params string[] command)
|
||||
@ -165,6 +163,23 @@ namespace KubernetesWorkflow.Recipe
|
||||
resources.Limits = limits;
|
||||
}
|
||||
|
||||
private void Reset()
|
||||
{
|
||||
exposedPorts.Clear();
|
||||
internalPorts.Clear();
|
||||
envVars.Clear();
|
||||
podLabels.Clear();
|
||||
podAnnotations.Clear();
|
||||
volumeMounts.Clear();
|
||||
additionals.Clear();
|
||||
nodePoolLabels.Clear();
|
||||
tolerations.Clear();
|
||||
factory = null!;
|
||||
resources = new ContainerResources();
|
||||
commandOverride = new CommandOverride();
|
||||
setCriticalPriority = false;
|
||||
}
|
||||
|
||||
private Port AddExposedPort(Port port)
|
||||
{
|
||||
exposedPorts.Add(port);
|
||||
|
||||
@ -1,18 +0,0 @@
|
||||
namespace KubernetesWorkflow.Recipe
|
||||
{
|
||||
public class SchedulingAffinity
|
||||
{
|
||||
public SchedulingAffinity(string? notIn = null)
|
||||
{
|
||||
NotIn = notIn;
|
||||
}
|
||||
|
||||
public string? NotIn { get; }
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
if (string.IsNullOrEmpty(NotIn)) return "none";
|
||||
return "notIn:" + NotIn;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -15,7 +15,8 @@ namespace LogosStorageDiscordBotPlugin
|
||||
{
|
||||
var config = startupConfig.Get<DiscordBotStartupConfig>();
|
||||
|
||||
SetSchedulingAffinity(notIn: "false");
|
||||
ScheduleInPoolsWithLabel("workload-type", "tests-pods");
|
||||
AddToleration("cloud.google.com/gke-provisioning", "spot", "NoSchedule");
|
||||
|
||||
AddEnvVar("TOKEN", config.Token);
|
||||
AddEnvVar("SERVERNAME", config.ServerName);
|
||||
|
||||
@ -13,7 +13,8 @@ namespace LogosStorageDiscordBotPlugin
|
||||
{
|
||||
var config = startupConfig.Get<RewarderBotStartupConfig>();
|
||||
|
||||
SetSchedulingAffinity(notIn: "false");
|
||||
ScheduleInPoolsWithLabel("workload-type", "tests-pods");
|
||||
AddToleration("cloud.google.com/gke-provisioning", "spot", "NoSchedule");
|
||||
|
||||
AddEnvVar("DISCORDBOTHOST", config.DiscordBotHost);
|
||||
AddEnvVar("DISCORDBOTPORT", config.DiscordBotPort.ToString());
|
||||
|
||||
@ -14,7 +14,8 @@ namespace MetricsPlugin
|
||||
{
|
||||
var config = startupConfig.Get<PrometheusStartupConfig>();
|
||||
|
||||
SetSchedulingAffinity(notIn: "false");
|
||||
ScheduleInPoolsWithLabel("workload-type", "tests-pods");
|
||||
AddToleration("cloud.google.com/gke-provisioning", "spot", "NoSchedule");
|
||||
|
||||
AddExposedPortAndVar("PROM_PORT", PortTag);
|
||||
AddEnvVar("PROM_CONFIG", config.PrometheusConfigBase64);
|
||||
|
||||
@ -29,7 +29,9 @@ namespace StoragePlugin
|
||||
SetResourcesRequest(milliCPUs: 100, memory: 100.MB());
|
||||
//SetResourceLimits(milliCPUs: 4000, memory: 12.GB());
|
||||
|
||||
SetSchedulingAffinity(notIn: "false");
|
||||
// Schedule storage nodes on the spot node pool, away from the test runner.
|
||||
ScheduleInPoolsWithLabel("workload-type", "tests-pods");
|
||||
AddToleration("cloud.google.com/gke-provisioning", "spot", "NoSchedule");
|
||||
|
||||
var config = startupConfig.Get<LogosStorageStartupConfig>();
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user