Rigged up an experimental pod crash detector

This commit is contained in:
benbierens 2023-08-14 15:10:36 +02:00
parent 9f0600d6c1
commit 6e5e30afd4
No known key found for this signature in database
GPG Key ID: FE44815D96D0A1AA
4 changed files with 77 additions and 20 deletions

View File

@ -377,7 +377,8 @@
"value": 80
}
]
}
},
"unit": "decbytes"
},
"overrides": []
},
@ -471,7 +472,8 @@
"value": "<CODEX_STORAGEQUOTA>"
}
]
}
},
"unit": "decbytes"
},
"overrides": []
},

View File

@ -604,6 +604,30 @@ namespace KubernetesWorkflow
#endregion
/// <summary>
/// Starts a background task that polls the container's pod every 3 seconds and,
/// if any container in the pod has restarted (i.e. crashed), downloads the log
/// of the previous (crashed) instance and hands it to <paramref name="logHandler"/>.
/// The task completes when the token is cancelled or after one crash is reported.
/// </summary>
/// <param name="container">The running container whose pod is watched.</param>
/// <param name="token">Cancels the watch; checked between polls.</param>
/// <param name="logHandler">Receives the crashed container's log stream.</param>
public Task WatchForCrashLogs(RunningContainer container, CancellationToken token, ILogHandler logHandler)
{
    return Task.Run(() =>
    {
        // Dedicated client for this background thread so polling never contends
        // with the controller's shared 'client'. It is IDisposable: 'using'
        // ensures it is released when the watch ends (the original leaked it).
        using var watchClient = new Kubernetes(cluster.GetK8sClientConfig());
        while (!token.IsCancellationRequested)
        {
            // Poll interval; WaitOne returns early if the token is signalled.
            token.WaitHandle.WaitOne(TimeSpan.FromSeconds(3));
            if (token.IsCancellationRequested) return;

            var pod = container.Pod;
            var recipe = container.Recipe;
            var podInfo = watchClient.ReadNamespacedPod(pod.PodInfo.Name, K8sTestNamespace);
            // RestartCount > 0 means kubelet restarted the container after a crash/OOM.
            if (podInfo.Status.ContainerStatuses.Any(c => c.RestartCount > 0))
            {
                log.Log("Pod crash detected for " + container.Name);
                // 'previous: true' fetches the log of the terminated (pre-restart) instance.
                // Use the dedicated client here too, for the same thread-isolation reason.
                using var stream = watchClient.ReadNamespacedPodLog(pod.PodInfo.Name, K8sTestNamespace, recipe.Name, previous: true);
                logHandler.Log(stream);
                // Report the crash once and stop; continuing the loop would re-download
                // and re-log the identical previous-instance log every 3 seconds.
                return;
            }
        }
    });
}
private PodInfo FetchNewPod()
{
var pods = client.Run(c => c.ListNamespacedPod(K8sTestNamespace)).Items;

View File

@ -37,6 +37,11 @@ namespace KubernetesWorkflow
}, pl);
}
/// <summary>
/// Forwards crash-log watching to the underlying k8s controller.
/// NOTE(review): presumably the returned task runs until the token is
/// cancelled — confirm against the controller implementation.
/// </summary>
public Task WatchForCrashLogs(RunningContainer container, CancellationToken token, ILogHandler logHandler)
{
    var watchTask = K8s(controller => controller.WatchForCrashLogs(container, token, logHandler));
    return watchTask;
}
public void Stop(RunningContainers runningContainers)
{
K8s(controller =>

View File

@ -1,11 +1,12 @@
using DistTestCore;
using KubernetesWorkflow;
using NUnit.Framework;
using Utils;
namespace Tests.BasicTests
{
[TestFixture]
public class ContinuousSubstitute : AutoBootstrapDistTest
public class ContinuousSubstitute : AutoBootstrapDistTest, ILogHandler
{
[Test]
[UseLongTimeouts]
@ -57,7 +58,7 @@ namespace Tests.BasicTests
testFile.AssertIsEqual(downloadedFile);
});
}
[Test]
[UseLongTimeouts]
public void HoldMyBeerTest()
@ -70,28 +71,53 @@ namespace Tests.BasicTests
var nodes = group.Cast<OnlineCodexNode>().ToArray();
foreach (var node in nodes)
{
node.Marketplace.MakeStorageAvailable(
size: 1.GB(),
minPricePerBytePerSecond: 1.TestTokens(),
maxCollateral: 1024.TestTokens(),
maxDuration: TimeSpan.FromMinutes(5));
}
var flow = Get().WorkflowCreator.CreateWorkflow();
var cst = new CancellationTokenSource();
var tasks = nodes.Select(n => flow.WatchForCrashLogs(n.CodexAccess.Container, cst.Token, this)).ToArray();
var endTime = DateTime.UtcNow + TimeSpan.FromHours(2);
while (DateTime.UtcNow < endTime)
try
{
foreach (var node in nodes)
{
var file = GenerateTestFile(80.MB());
var cid = node.UploadFile(file);
var dl = node.DownloadContent(cid);
file.AssertIsEqual(dl);
node.Marketplace.MakeStorageAvailable(
size: 1.GB(),
minPricePerBytePerSecond: 1.TestTokens(),
maxCollateral: 1024.TestTokens(),
maxDuration: TimeSpan.FromMinutes(5));
}
Thread.Sleep(TimeSpan.FromMinutes(2));
var endTime = DateTime.UtcNow + TimeSpan.FromHours(2);
while (DateTime.UtcNow < endTime)
{
foreach (var node in nodes)
{
var file = GenerateTestFile(80.MB());
var cid = node.UploadFile(file);
var dl = node.DownloadContent(cid);
file.AssertIsEqual(dl);
}
Thread.Sleep(TimeSpan.FromMinutes(2));
}
}
finally
{
cst.Cancel();
foreach (var t in tasks) t.Wait();
}
}
/// <summary>
/// ILogHandler callback: announces the crash, then relays the crashed
/// container's log to the test log line by line.
/// </summary>
/// <param name="log">Stream containing the crashed container's log output; consumed and disposed here.</param>
public void Log(Stream log)
{
    Log("Well damn, container crashed. Here's the log:");
    // StreamReader takes ownership of the stream; 'using' disposes both.
    using var reader = new StreamReader(log);
    string? logLine;
    while ((logLine = reader.ReadLine()) != null)
    {
        Log(logLine);
    }
}
}