190 lines
6.6 KiB
C#
Raw Normal View History

using System.Globalization;
using Utils;
2023-09-08 09:39:56 +02:00
namespace KubernetesWorkflow.Recipe
2023-04-12 13:53:55 +02:00
{
public abstract class ContainerRecipeFactory
{
// Per-recipe state. Everything below is filled while Initialize runs and is wiped by Reset()
// once CreateRecipe has built its recipe.

// Ports, environment variables, pod metadata and volume mounts registered through the Add* helpers.
private readonly List<Port> exposedPorts = new();
private readonly List<Port> internalPorts = new();
private readonly List<EnvVar> envVars = new();
private readonly PodLabels podLabels = new();
private readonly PodAnnotations podAnnotations = new();
private readonly List<VolumeMount> volumeMounts = new();

// Free-form user data registered through Additional(object).
private readonly List<object> additionals = new();

// Only assigned for the duration of a CreateRecipe call; Reset() puts it back to null.
private RecipeComponentFactory factory = null!;
private ContainerResources resources = new();

// Scheduling hints: node pool labels set by ScheduleInPoolsWithLabel and tolerations added by AddToleration.
private readonly Dictionary<string, string> nodePoolLabels = new();
private readonly List<PodToleration> tolerations = new();

private CommandOverride commandOverride = new();
private bool setCriticalPriority;
2023-04-12 13:53:55 +02:00
2023-04-14 10:51:35 +02:00
/// <summary>
/// Builds a <see cref="ContainerRecipe"/>: stores the call arguments, lets the subclass register its
/// settings through <see cref="Initialize"/>, snapshots everything that was registered, and finally
/// clears the factory's per-recipe state so the instance can be used again.
/// </summary>
/// <param name="index">Value made available to the subclass through <see cref="Index"/>.</param>
/// <param name="containerNumber">Value made available through <see cref="ContainerNumber"/> and passed on to the recipe.</param>
/// <param name="factory">Component factory used by the Add* helpers while <see cref="Initialize"/> runs.</param>
/// <param name="config">Startup configuration handed to <see cref="Initialize"/>; its <c>NameOverride</c> is passed on to the recipe.</param>
/// <returns>The recipe, stamped with the current UTC time.</returns>
public ContainerRecipe CreateRecipe(int index, int containerNumber, RecipeComponentFactory factory, StartupConfig config)
{
    this.factory = factory;
    ContainerNumber = containerNumber;
    Index = index;

    try
    {
        Initialize(config);

        // Every mutable collection is copied (ToArray / Clone / new Dictionary) because Reset() clears
        // the originals; handing them over by reference would leave the recipe with empty collections.
        // NOTE(review): 'additionals' is passed without a copy - confirm that CreateFromUserData
        // materializes it before Reset() clears the list.
        return new ContainerRecipe(DateTime.UtcNow, containerNumber, config.NameOverride, Image, resources,
            new Dictionary<string, string>(nodePoolLabels),
            tolerations.ToArray(),
            commandOverride, setCriticalPriority,
            exposedPorts.ToArray(),
            internalPorts.ToArray(),
            envVars.ToArray(),
            podLabels.Clone(),
            podAnnotations.Clone(),
            volumeMounts.ToArray(),
            ContainerAdditionals.CreateFromUserData(additionals));
    }
    finally
    {
        // Reset even when Initialize or the recipe constructor throws; otherwise partially
        // registered ports, env vars, volumes, etc. would leak into the next recipe built by this factory.
        Reset();
    }
}
2023-08-07 15:51:44 +02:00
/// <summary>Application name supplied by the concrete recipe factory. It is not read by the code in this class.</summary>
public abstract string AppName { get; }
/// <summary>Image value supplied by the concrete recipe factory; CreateRecipe passes it to the <see cref="ContainerRecipe"/> it builds.</summary>
public abstract string Image { get; }
2023-04-12 13:53:55 +02:00
/// <summary>Container number given to the most recent CreateRecipe call; zero until the first call.</summary>
protected int ContainerNumber { get; private set; }

/// <summary>Index given to the most recent CreateRecipe call; zero until the first call.</summary>
protected int Index { get; private set; }
2023-04-12 13:53:55 +02:00
/// <summary>
/// Implemented by each concrete recipe factory. CreateRecipe calls it once per recipe, after the
/// component factory, <see cref="ContainerNumber"/> and <see cref="Index"/> have been assigned.
/// The component factory used by the Add* helpers is only assigned while this call is running.
/// </summary>
/// <param name="config">Startup configuration that was passed to CreateRecipe.</param>
protected abstract void Initialize(StartupConfig config);
/// <summary>
/// Asks the component factory for an external port with the given tag and protocol,
/// registers it as an exposed port and returns it.
/// </summary>
protected Port AddExposedPort(string tag, PortProtocol protocol = PortProtocol.TCP)
{
    var externalPort = factory.CreateExternalPort(tag, protocol);
    return AddExposedPort(externalPort);
}
/// <summary>
/// Asks the component factory for an external port with an explicit port number,
/// registers it as an exposed port and returns it.
/// </summary>
protected Port AddExposedPort(int number, string tag, PortProtocol protocol = PortProtocol.TCP)
{
    var externalPort = factory.CreateExternalPort(number, tag, protocol);
    return AddExposedPort(externalPort);
}
/// <summary>
/// Asks the component factory for an internal port, registers it in the internal port list and returns it.
/// </summary>
protected Port AddInternalPort(string tag = "", PortProtocol protocol = PortProtocol.TCP)
{
    var internalPort = factory.CreateInternalPort(tag, protocol);
    internalPorts.Add(internalPort);
    return internalPort;
}
/// <summary>
/// Adds an exposed port and an environment variable called <paramref name="name"/> that carries that port.
/// </summary>
protected void AddExposedPortAndVar(string name, string tag, PortProtocol protocol = PortProtocol.TCP)
{
    var exposedPort = AddExposedPort(tag, protocol);
    AddEnvVar(name, exposedPort);
}
/// <summary>
/// Adds an internal port and an environment variable called <paramref name="name"/> that carries that port.
/// </summary>
protected void AddInternalPortAndVar(string name, string tag = "", PortProtocol protocol = PortProtocol.TCP)
{
    var internalPort = AddInternalPort(tag, protocol);
    AddEnvVar(name, internalPort);
}
/// <summary>Registers an environment variable with a string value, created through the component factory.</summary>
protected void AddEnvVar(string name, string value)
    => envVars.Add(factory.CreateEnvVar(name, value));
/// <summary>
/// Registers an environment variable whose value is the integer formatted with the invariant culture.
/// </summary>
protected void AddEnvVar(string name, int value)
{
    var formatted = value.ToString(CultureInfo.InvariantCulture);
    envVars.Add(factory.CreateEnvVar(name, formatted));
}
2023-04-12 13:53:55 +02:00
/// <summary>Registers an environment variable whose value is the <c>Number</c> of the given port.</summary>
protected void AddEnvVar(string name, Port value)
    => envVars.Add(factory.CreateEnvVar(name, value.Number));
2023-04-14 12:37:05 +02:00
/// <summary>Adds a name/value pair to the pod labels of the recipe being built.</summary>
protected void AddPodLabel(string name, string value)
    => podLabels.Add(name, value);
/// <summary>Adds a name/value pair to the pod annotations of the recipe being built.</summary>
protected void AddPodAnnotation(string name, string value)
    => podAnnotations.Add(name, value);
/// <summary>
/// Registers a named volume mount. The size handed to <see cref="VolumeMount"/> is always 10 MB,
/// expressed as a byte count.
/// </summary>
/// <param name="name">Name of the volume.</param>
/// <param name="mountPath">Mount path passed to the volume mount.</param>
/// <param name="subPath">Optional sub-path passed to the volume mount.</param>
/// <param name="secret">Optional secret passed to the volume mount.</param>
/// <param name="hostPath">Optional host path passed to the volume mount.</param>
protected void AddVolume(string name, string mountPath, string? subPath = null, string? secret = null, string? hostPath = null)
{
    // Machine-readable quantity: format culture-independently, like AddEnvVar(string, int) does.
    var size = 10.MB().SizeInBytes.ToString(CultureInfo.InvariantCulture);
    volumeMounts.Add(new VolumeMount(name, mountPath, subPath, size, secret, hostPath));
}
2023-09-08 09:39:56 +02:00
/// <summary>
/// Registers a volume mount of the requested size under a generated, unique
/// <c>autovolume-&lt;guid&gt;</c> name.
/// </summary>
/// <param name="mountPath">Mount path passed to the volume mount.</param>
/// <param name="volumeSize">Size of the volume; its byte count becomes the mount's resource quantity.</param>
protected void AddVolume(string mountPath, ByteSize volumeSize)
{
    var generatedName = $"autovolume-{Guid.NewGuid().ToString().ToLowerInvariant()}";

    // Machine-readable quantity: format culture-independently, like AddEnvVar(string, int) does.
    var quantity = volumeSize.SizeInBytes.ToString(CultureInfo.InvariantCulture);

    volumeMounts.Add(new VolumeMount(
        generatedName,
        mountPath,
        resourceQuantity: quantity));
}
2023-04-14 12:37:05 +02:00
/// <summary>
/// Stores an arbitrary user-data object; CreateRecipe hands the collected objects to
/// <c>ContainerAdditionals.CreateFromUserData</c>.
/// </summary>
protected void Additional(object userData)
    => additionals.Add(userData);
2023-09-15 15:52:02 +02:00
/// <summary>Convenience overload: wraps the CPU and memory values in a <see cref="ContainerResourceSet"/> and sets it as the resource request.</summary>
protected void SetResourcesRequest(int milliCPUs, ByteSize memory)
{
    var requests = new ContainerResourceSet(milliCPUs, memory);
    SetResourcesRequest(requests);
}
refactor: replace scheduling affinity with explicit node pool label selection Replace the indirect `SetSchedulingAffinity(notIn: "false")` / `allow-tests-pods` mechanism with `ScheduleInPoolsWithLabel(key, value)` and `AddToleration(key, value, effect)` in ContainerRecipeFactory. This is much more readable from an API perspective. `SetSchedulingAffinity(notIn: "false")` was a double-negative (hard to reason about) and it was not clear that this was meant to schedule on pools with labels `allow-tests-pods=true`. Previously, pods were steered to the spot node pool via a node affinity exclusion on a boolean label (`allow-tests-pods NotIn ["false"]`), and spot taint toleration was added implicitly by using the `system-node-critical` priority class. The priority class was removed earlier because it caused a ResourceQuota admission error in GCP, which silently broke spot node scheduling. The new API is explicit: recipes call `ScheduleInPoolsWithLabel` to set a nodeSelector label that targets the intended pool, and `AddToleration` to declare any taints the pool carries. Tolerations are set at the recipe level to allow for the recipe to move back to Digital Ocean if needed (removing the unneeded toleration). All four recipes (storage, prometheus, discord bot, rewarder bot) now call both. Cleanup applied alongside: - `PodToleration` converted to a record for structural equality and simpler deduplication - `ExposedPorts`, `InternalPorts`, `EnvVars`, `Volumes` on `ContainerRecipe` changed to `IReadOnlyList<T>` for consistent immutable typing - `SetCriticalPriority` property renamed to `IsCriticalPriority` - `GetPriorityClassName` returns `string?` instead of `null!` - `Reset()` extracted in `ContainerRecipeFactory` to consolidate post-create state reset - Fixed bug: `nodePoolLabels` and `tolerations` were passed by reference and then cleared, leaving the recipe with empty collections; now snapshotted before clearing - `SchedulingAffinity.cs` deleted (no remaining callers)
2026-04-29 16:45:55 +10:00
/// <summary>
/// Records a node pool label for the recipe. Setting the same key again overwrites the earlier value.
/// </summary>
protected void ScheduleInPoolsWithLabel(string key, string value)
    => nodePoolLabels[key] = value;
/// <summary>Appends a toleration built from the given key, value and effect to the recipe's tolerations.</summary>
protected void AddToleration(string key, string value, string effect)
{
    var toleration = new PodToleration(key, value, effect);
    tolerations.Add(toleration);
}
/// <summary>Replaces the current command override with one built from the given command parts.</summary>
protected void OverrideCommand(params string[] command)
    => commandOverride = new CommandOverride(command);
/// <summary>Flags the recipe being built as critical priority; the flag is passed to the recipe by CreateRecipe.</summary>
protected void SetSystemCriticalPriority()
    => setCriticalPriority = true;
2023-11-07 09:25:51 +01:00
// Disabled because of a suspected bug in the k8s cluster: containers that have resource limits
// defined are throttled far more than those limits call for.
//protected void SetResourceLimits(int milliCPUs, ByteSize memory)
//{
// SetResourceLimits(new ContainerResourceSet(milliCPUs, memory));
//}
/// <summary>Sets the resource requests of the recipe being built.</summary>
protected void SetResourcesRequest(ContainerResourceSet requests)
    => resources.Requests = requests;
/// <summary>Sets the resource limits of the recipe being built.</summary>
protected void SetResourceLimits(ContainerResourceSet limits)
    => resources.Limits = limits;
refactor: replace scheduling affinity with explicit node pool label selection Replace the indirect `SetSchedulingAffinity(notIn: "false")` / `allow-tests-pods` mechanism with `ScheduleInPoolsWithLabel(key, value)` and `AddToleration(key, value, effect)` in ContainerRecipeFactory. This is much more readable from an API perspective. `SetSchedulingAffinity(notIn: "false")` was a double-negative (hard to reason about) and it was not clear that this was meant to schedule on pools with labels `allow-tests-pods=true`. Previously, pods were steered to the spot node pool via a node affinity exclusion on a boolean label (`allow-tests-pods NotIn ["false"]`), and spot taint toleration was added implicitly by using the `system-node-critical` priority class. The priority class was removed earlier because it caused a ResourceQuota admission error in GCP, which silently broke spot node scheduling. The new API is explicit: recipes call `ScheduleInPoolsWithLabel` to set a nodeSelector label that targets the intended pool, and `AddToleration` to declare any taints the pool carries. Tolerations are set at the recipe level to allow for the recipe to move back to Digital Ocean if needed (removing the unneeded toleration). All four recipes (storage, prometheus, discord bot, rewarder bot) now call both. Cleanup applied alongside: - `PodToleration` converted to a record for structural equality and simpler deduplication - `ExposedPorts`, `InternalPorts`, `EnvVars`, `Volumes` on `ContainerRecipe` changed to `IReadOnlyList<T>` for consistent immutable typing - `SetCriticalPriority` property renamed to `IsCriticalPriority` - `GetPriorityClassName` returns `string?` instead of `null!` - `Reset()` extracted in `ContainerRecipeFactory` to consolidate post-create state reset - Fixed bug: `nodePoolLabels` and `tolerations` were passed by reference and then cleared, leaving the recipe with empty collections; now snapshotted before clearing - `SchedulingAffinity.cs` deleted (no remaining callers)
2026-04-29 16:45:55 +10:00
/// <summary>
/// Returns the factory to its pristine state so the next CreateRecipe call starts from scratch.
/// </summary>
private void Reset()
{
    // Collections are emptied in place; recipes only ever receive copies of them.
    exposedPorts.Clear();
    internalPorts.Clear();
    envVars.Clear();
    podLabels.Clear();
    podAnnotations.Clear();
    volumeMounts.Clear();
    additionals.Clear();
    nodePoolLabels.Clear();
    tolerations.Clear();

    // The component factory is only assigned for the duration of a CreateRecipe call.
    factory = null!;

    // These objects are handed to the recipe as-is, so they are replaced with fresh instances
    // rather than modified.
    resources = new ContainerResources();
    commandOverride = new CommandOverride();

    setCriticalPriority = false;
}
2023-09-15 15:52:02 +02:00
/// <summary>
/// Shared tail of the protected AddExposedPort overloads: records the already-created port
/// in the exposed port list and returns the same instance.
/// </summary>
private Port AddExposedPort(Port port)
{
exposedPorts.Add(port);
return port;
}
2023-04-12 13:53:55 +02:00
}
}