diff --git a/Framework/KubernetesWorkflow/K8sController.cs b/Framework/KubernetesWorkflow/K8sController.cs index 5b657d2..27f477a 100644 --- a/Framework/KubernetesWorkflow/K8sController.cs +++ b/Framework/KubernetesWorkflow/K8sController.cs @@ -360,6 +360,7 @@ namespace KubernetesWorkflow }, Spec = new V1PodSpec { + Affinity = CreatePodAffinity(containerRecipes), NodeSelector = CreateNodeSelector(location), Containers = CreateDeploymentContainers(containerRecipes), Volumes = CreateVolumes(containerRecipes) @@ -392,6 +393,42 @@ namespace KubernetesWorkflow }; } + private V1Affinity? CreatePodAffinity(ContainerRecipe[] recipes) + { + var notIns = recipes + .Select(r => r.SchedulingAffinity.NotIn) + .Where(n => !string.IsNullOrEmpty(n)) + .Distinct() + .ToList(); + + if (!notIns.Any()) return null; + + return new V1Affinity + { + NodeAffinity = new V1NodeAffinity + { + RequiredDuringSchedulingIgnoredDuringExecution = new V1NodeSelector + { + NodeSelectorTerms = new List + { + new V1NodeSelectorTerm + { + MatchExpressions = new List + { + new V1NodeSelectorRequirement + { + Key = "workload-type", + OperatorProperty = "NotIn", + Values = notIns + } + } + } + } + } + } + }; + } + private K8sNodeLabel? GetNodeLabelForLocation(ILocation location) { var l = (Location)location; diff --git a/Framework/KubernetesWorkflow/Recipe/ContainerRecipe.cs b/Framework/KubernetesWorkflow/Recipe/ContainerRecipe.cs index b66bd42..1865dc7 100644 --- a/Framework/KubernetesWorkflow/Recipe/ContainerRecipe.cs +++ b/Framework/KubernetesWorkflow/Recipe/ContainerRecipe.cs @@ -2,12 +2,13 @@ { public class ContainerRecipe { - public ContainerRecipe(int number, string? nameOverride, string image, ContainerResources resources, Port[] exposedPorts, Port[] internalPorts, EnvVar[] envVars, PodLabels podLabels, PodAnnotations podAnnotations, VolumeMount[] volumes, ContainerAdditionals additionals) + public ContainerRecipe(int number, string? 
nameOverride, string image, ContainerResources resources, SchedulingAffinity schedulingAffinity, Port[] exposedPorts, Port[] internalPorts, EnvVar[] envVars, PodLabels podLabels, PodAnnotations podAnnotations, VolumeMount[] volumes, ContainerAdditionals additionals) { Number = number; NameOverride = nameOverride; Image = image; Resources = resources; + SchedulingAffinity = schedulingAffinity; ExposedPorts = exposedPorts; InternalPorts = internalPorts; EnvVars = envVars; @@ -32,6 +33,7 @@ public int Number { get; } public string? NameOverride { get; } public ContainerResources Resources { get; } + public SchedulingAffinity SchedulingAffinity { get; } public string Image { get; } public Port[] ExposedPorts { get; } public Port[] InternalPorts { get; } @@ -53,6 +55,7 @@ $"internalPorts: {string.Join(",", InternalPorts.Select(p => p.Number))}, " + $"envVars: {string.Join(",", EnvVars.Select(v => v.ToString()))}, " + $"limits: {Resources}, " + + $"affinity: {SchedulingAffinity}, " + $"volumes: {string.Join(",", Volumes.Select(v => $"'{v.MountPath}'"))}"; } } diff --git a/Framework/KubernetesWorkflow/Recipe/ContainerRecipeFactory.cs b/Framework/KubernetesWorkflow/Recipe/ContainerRecipeFactory.cs index 0a988de..7c734e8 100644 --- a/Framework/KubernetesWorkflow/Recipe/ContainerRecipeFactory.cs +++ b/Framework/KubernetesWorkflow/Recipe/ContainerRecipeFactory.cs @@ -13,6 +13,7 @@ namespace KubernetesWorkflow.Recipe private readonly List additionals = new List(); private RecipeComponentFactory factory = null!; private ContainerResources resources = new ContainerResources(); + private SchedulingAffinity schedulingAffinity = new SchedulingAffinity(); public ContainerRecipe CreateRecipe(int index, int containerNumber, RecipeComponentFactory factory, StartupConfig config) { @@ -22,7 +23,7 @@ namespace KubernetesWorkflow.Recipe Initialize(config); - var recipe = new ContainerRecipe(containerNumber, config.NameOverride, Image, resources, + var recipe = new 
ContainerRecipe(containerNumber, config.NameOverride, Image, resources, schedulingAffinity, exposedPorts.ToArray(), internalPorts.ToArray(), envVars.ToArray(), @@ -40,6 +41,7 @@ namespace KubernetesWorkflow.Recipe additionals.Clear(); this.factory = null!; resources = new ContainerResources(); + schedulingAffinity = new SchedulingAffinity(); return recipe; } @@ -121,6 +123,11 @@ namespace KubernetesWorkflow.Recipe SetResourcesRequest(new ContainerResourceSet(milliCPUs, memory)); } + protected void SetSchedulingAffinity(string notIn) + { + schedulingAffinity = new SchedulingAffinity(notIn); + } + // Disabled following a possible bug in the k8s cluster that will throttle containers much more than is // called for if they have resource limits defined. //protected void SetResourceLimits(int milliCPUs, ByteSize memory) diff --git a/Framework/KubernetesWorkflow/Recipe/SchedulingAffinity.cs b/Framework/KubernetesWorkflow/Recipe/SchedulingAffinity.cs new file mode 100644 index 0000000..8bc4c84 --- /dev/null +++ b/Framework/KubernetesWorkflow/Recipe/SchedulingAffinity.cs @@ -0,0 +1,18 @@ +namespace KubernetesWorkflow.Recipe +{ + public class SchedulingAffinity + { + public SchedulingAffinity(string? notIn = null) + { + NotIn = notIn; + } + + public string? 
NotIn { get; } + + public override string ToString() + { + if (string.IsNullOrEmpty(NotIn)) return "none"; + return "notIn:" + NotIn; + } + } +} diff --git a/ProjectPlugins/CodexContractsPlugin/CodexContractsContainerRecipe.cs b/ProjectPlugins/CodexContractsPlugin/CodexContractsContainerRecipe.cs index d6dd92f..3e5faa1 100644 --- a/ProjectPlugins/CodexContractsPlugin/CodexContractsContainerRecipe.cs +++ b/ProjectPlugins/CodexContractsPlugin/CodexContractsContainerRecipe.cs @@ -21,6 +21,8 @@ namespace CodexContractsPlugin var address = config.GethNode.StartResult.Container.GetAddress(new NullLog(), GethContainerRecipe.HttpPortTag); + SetSchedulingAffinity(notIn: "tests-runners"); + AddEnvVar("DISTTEST_NETWORK_URL", address.ToString()); AddEnvVar("HARDHAT_NETWORK", "codexdisttestnetwork"); AddEnvVar("KEEP_ALIVE", "1"); diff --git a/ProjectPlugins/CodexDiscordBotPlugin/DiscordBotContainerRecipe.cs b/ProjectPlugins/CodexDiscordBotPlugin/DiscordBotContainerRecipe.cs index 3e14a91..c193e02 100644 --- a/ProjectPlugins/CodexDiscordBotPlugin/DiscordBotContainerRecipe.cs +++ b/ProjectPlugins/CodexDiscordBotPlugin/DiscordBotContainerRecipe.cs @@ -13,6 +13,8 @@ namespace CodexDiscordBotPlugin { var config = startupConfig.Get(); + SetSchedulingAffinity(notIn: "tests-runners"); + AddEnvVar("TOKEN", config.Token); AddEnvVar("SERVERNAME", config.ServerName); AddEnvVar("ADMINROLE", config.AdminRoleName); diff --git a/ProjectPlugins/CodexPlugin/CodexContainerRecipe.cs b/ProjectPlugins/CodexPlugin/CodexContainerRecipe.cs index bfdf52b..744ae37 100644 --- a/ProjectPlugins/CodexPlugin/CodexContainerRecipe.cs +++ b/ProjectPlugins/CodexPlugin/CodexContainerRecipe.cs @@ -29,6 +29,8 @@ namespace CodexPlugin SetResourcesRequest(milliCPUs: 100, memory: 100.MB()); //SetResourceLimits(milliCPUs: 4000, memory: 12.GB()); + SetSchedulingAffinity(notIn: "tests-runners"); + var config = startupConfig.Get(); var apiPort = CreateApiPort(config, ApiPortTag); diff --git 
a/ProjectPlugins/GethPlugin/GethContainerRecipe.cs b/ProjectPlugins/GethPlugin/GethContainerRecipe.cs index f9eaf79..35b994c 100644 --- a/ProjectPlugins/GethPlugin/GethContainerRecipe.cs +++ b/ProjectPlugins/GethPlugin/GethContainerRecipe.cs @@ -24,6 +24,8 @@ namespace GethPlugin var args = CreateArgs(config); + SetSchedulingAffinity(notIn: "tests-runners"); + AddEnvVar("GETH_ARGS", args); } diff --git a/ProjectPlugins/MetricsPlugin/PrometheusContainerRecipe.cs b/ProjectPlugins/MetricsPlugin/PrometheusContainerRecipe.cs index a4dfdb4..e132a8f 100644 --- a/ProjectPlugins/MetricsPlugin/PrometheusContainerRecipe.cs +++ b/ProjectPlugins/MetricsPlugin/PrometheusContainerRecipe.cs @@ -14,6 +14,8 @@ namespace MetricsPlugin { var config = startupConfig.Get(); + SetSchedulingAffinity(notIn: "tests-runners"); + AddExposedPortAndVar("PROM_PORT", PortTag); AddEnvVar("PROM_CONFIG", config.PrometheusConfigBase64); } diff --git a/Tests/CodexContinuousTests/reports/CodexTestNetReport-October2023.md b/Tests/CodexContinuousTests/reports/CodexTestNetReport-October2023.md new file mode 100644 index 0000000..ad796c4 --- /dev/null +++ b/Tests/CodexContinuousTests/reports/CodexTestNetReport-October2023.md @@ -0,0 +1,52 @@ +# Codex Continuous Test-net Report +Date: 13-11-2023 + +Report for: 10-2023 + + +## Test-net Status +- Start of month: Offline - stopped +- End of month: Offline - stopped + +(Stopped: The number of tests that can successfully run on the test-net is not high enough to justify the cost of leaving it running.) 
+ +## Deployment Configuration +Continuous Test-net is deployed to the kubernetes cluster with the following configuration: + +5x Codex Nodes: +- Log-level: Trace +- Storage quota: 2048 MB +- Storage sell: 1024 MB +- Min price: 1024 +- Max collateral: 1024 +- Max duration: 3600000 seconds +- Block-TTL*: 180 seconds +- Block-MI*: 120 seconds +- Block-MN*: 10000 blocks + +3 of these 5 nodes have: +- Validator: true + +Kubernetes namespace: 'codex-continuous-tests' +* Some tests have been performed with alternative (disabled) maintenance parameters: +- Block-TTL: 99999999 seconds +- Block-MI: 99999999 seconds +- Block-MN: 100 blocks + +## Test Overview +| Changes | Test | Description | Status | Results | +|---------------------|------------------|--------------------------------|------------|---------------------------------------------------------------| +| No change | Two-client test | See report for July 2023. | Faulted | Test reliably fails. Both upload and download failures occur. | +| No change | Two-client test* | See report for September 2023. | Faulted | Test reliably fails. Both upload and download failures occur. | +| Possible regression | HoldMyBeer test | See report for August 2023. | Unreliable | Successful runs of 48h have not been observed in October. | +| Possible regression | Peers test | See report for August 2023. | Unreliable | Successful runs of 48h have not been observed in October. | + +## Resulting changes +As a result of the testing efforts in 10-2023, these changes were made: +1. Consolidation of test logs and metrics using grafana and elastic-search. +1. Investment made in profiling instrumentation in Codex codebase. +1. Some testing effort has been diverted to preparing the necessary infrastructure for the creation of a public testnet by 1-December-2023. 
+ +## Action Points +- Debugging efforts continue +- Some effort remains allocated to deploying and supporting the public testnet diff --git a/docker/continuous-tests-job.yaml b/docker/continuous-tests-job.yaml index 79c914b..c669ec0 100644 --- a/docker/continuous-tests-job.yaml +++ b/docker/continuous-tests-job.yaml @@ -18,14 +18,14 @@ spec: spec: priorityClassName: system-node-critical nodeSelector: - doks.digitalocean.com/node-pool: "fixed-s-4vcpu-16gb-amd" + workload-type: "tests-runners" containers: - name: ${NAMEPREFIX}-runner image: codexstorage/cs-codex-dist-tests:latest imagePullPolicy: Always resources: requests: - memory: "2Gi" + memory: "1Gi" env: - name: KUBECONFIG value: "/opt/kubeconfig.yaml" diff --git a/docker/dist-tests-job.yaml b/docker/dist-tests-job.yaml index 7c92375..2eaed22 100644 --- a/docker/dist-tests-job.yaml +++ b/docker/dist-tests-job.yaml @@ -16,10 +16,16 @@ spec: name: ${NAMEPREFIX}-${RUNID} run-id: ${RUNID} spec: + priorityClassName: system-node-critical + nodeSelector: + workload-type: "tests-runners" containers: - name: ${NAMEPREFIX}-runner image: codexstorage/cs-codex-dist-tests:latest imagePullPolicy: Always + resources: + requests: + memory: "1Gi" env: - name: KUBECONFIG value: "/opt/kubeconfig.yaml"