rigged up a pod crash detector maybe

This commit is contained in:
benbierens 2023-08-14 15:10:36 +02:00
parent 9f0600d6c1
commit 6e5e30afd4
No known key found for this signature in database
GPG Key ID: FE44815D96D0A1AA
4 changed files with 77 additions and 20 deletions

View File

@ -377,7 +377,8 @@
"value": 80 "value": 80
} }
] ]
} },
"unit": "decbytes"
}, },
"overrides": [] "overrides": []
}, },
@ -471,7 +472,8 @@
"value": "<CODEX_STORAGEQUOTA>" "value": "<CODEX_STORAGEQUOTA>"
} }
] ]
} },
"unit": "decbytes"
}, },
"overrides": [] "overrides": []
}, },

View File

@ -604,6 +604,30 @@ namespace KubernetesWorkflow
#endregion #endregion
/// <summary>
/// Starts a background task that polls the pod of <paramref name="container"/> every three seconds.
/// Each time the pod's total container restart count has grown since the last report, the log of the
/// previous container instance is read and handed to <paramref name="logHandler"/>.
/// </summary>
/// <param name="container">Running container whose pod is polled for restarts.</param>
/// <param name="token">Cancelling this token ends the polling loop.</param>
/// <param name="logHandler">Receives the log stream of the crashed container instance.</param>
/// <returns>The polling task; it finishes once <paramref name="token"/> has been cancelled.</returns>
public Task WatchForCrashLogs(RunningContainer container, CancellationToken token, ILogHandler logHandler)
{
    return Task.Run(() =>
    {
        // Dedicated client for this long-running task; released when the loop ends.
        using var myOwnClient = new Kubernetes(cluster.GetK8sClientConfig());

        // Restart count already reported, so one crash is not logged again on every poll.
        var reportedRestarts = 0;

        // WaitOne returns true as soon as the token is cancelled, so no poll happens after cancellation.
        while (!token.WaitHandle.WaitOne(TimeSpan.FromSeconds(3)))
        {
            var pod = container.Pod;
            var recipe = container.Recipe;
            var podName = pod.PodInfo.Name;

            var podInfo = myOwnClient.ReadNamespacedPod(podName, K8sTestNamespace);

            // Container statuses may be absent (e.g. pod not yet reporting them); treat that as zero restarts.
            var currentRestarts = podInfo.Status?.ContainerStatuses?.Sum(c => c.RestartCount) ?? 0;
            if (currentRestarts <= reportedRestarts) continue;

            log.Log("Pod crash detected for " + container.Name);

            // NOTE(review): the log is read through the shared `client` while the pod is read through the
            // dedicated client above - confirm this split is intentional.
            using var stream = client.Run(c => c.ReadNamespacedPodLog(podName, K8sTestNamespace, recipe.Name, previous: true));
            logHandler.Log(stream);

            reportedRestarts = currentRestarts;
        }
    });
}
private PodInfo FetchNewPod() private PodInfo FetchNewPod()
{ {
var pods = client.Run(c => c.ListNamespacedPod(K8sTestNamespace)).Items; var pods = client.Run(c => c.ListNamespacedPod(K8sTestNamespace)).Items;

View File

@ -37,6 +37,11 @@ namespace KubernetesWorkflow
}, pl); }, pl);
} }
/// <summary>
/// Forwards to <c>WatchForCrashLogs</c> on the controller provided by <c>K8s</c> and returns the resulting task.
/// </summary>
/// <param name="container">Container to watch.</param>
/// <param name="token">Token passed through to the controller's watcher.</param>
/// <param name="logHandler">Handler passed through to the controller's watcher.</param>
public Task WatchForCrashLogs(RunningContainer container, CancellationToken token, ILogHandler logHandler)
    => K8s(k8sController => k8sController.WatchForCrashLogs(container, token, logHandler));
public void Stop(RunningContainers runningContainers) public void Stop(RunningContainers runningContainers)
{ {
K8s(controller => K8s(controller =>

View File

@ -1,11 +1,12 @@
using DistTestCore; using DistTestCore;
using KubernetesWorkflow;
using NUnit.Framework; using NUnit.Framework;
using Utils; using Utils;
namespace Tests.BasicTests namespace Tests.BasicTests
{ {
[TestFixture] [TestFixture]
public class ContinuousSubstitute : AutoBootstrapDistTest public class ContinuousSubstitute : AutoBootstrapDistTest, ILogHandler
{ {
[Test] [Test]
[UseLongTimeouts] [UseLongTimeouts]
@ -70,6 +71,12 @@ namespace Tests.BasicTests
var nodes = group.Cast<OnlineCodexNode>().ToArray(); var nodes = group.Cast<OnlineCodexNode>().ToArray();
var flow = Get().WorkflowCreator.CreateWorkflow();
var cst = new CancellationTokenSource();
var tasks = nodes.Select(n => flow.WatchForCrashLogs(n.CodexAccess.Container, cst.Token, this)).ToArray();
try
{
foreach (var node in nodes) foreach (var node in nodes)
{ {
node.Marketplace.MakeStorageAvailable( node.Marketplace.MakeStorageAvailable(
@ -94,5 +101,24 @@ namespace Tests.BasicTests
Thread.Sleep(TimeSpan.FromMinutes(2)); Thread.Sleep(TimeSpan.FromMinutes(2));
} }
} }
finally
{
cst.Cancel();
foreach (var t in tasks) t.Wait();
}
}
/// <summary>
/// Writes a crash notice and then every line read from <paramref name="log"/> via the string <c>Log</c> overload.
/// </summary>
/// <param name="log">Stream holding the crashed container's log text; it is read to the end.</param>
public void Log(Stream log)
{
    Log("Well damn, container crashed. Here's the log:");

    using var crashLogReader = new StreamReader(log);
    for (var entry = crashLogReader.ReadLine(); entry != null; entry = crashLogReader.ReadLine())
    {
        Log(entry);
    }
}
} }
} }