wires crashWatcher into CodexStarter
This commit is contained in:
parent
72fa368357
commit
08ee09af1e
@ -4,10 +4,11 @@ using Utils;
|
||||
|
||||
namespace DistTestCore.Codex
|
||||
{
|
||||
public class CodexAccess
|
||||
public class CodexAccess : ILogHandler
|
||||
{
|
||||
private readonly BaseLog log;
|
||||
private readonly ITimeSet timeSet;
|
||||
private bool hasContainerCrashed;
|
||||
|
||||
public CodexAccess(BaseLog log, RunningContainer container, ITimeSet timeSet, Address address)
|
||||
{
|
||||
@ -15,6 +16,9 @@ namespace DistTestCore.Codex
|
||||
Container = container;
|
||||
this.timeSet = timeSet;
|
||||
Address = address;
|
||||
hasContainerCrashed = false;
|
||||
|
||||
if (container.CrashWatcher != null) container.CrashWatcher.Start(this);
|
||||
}
|
||||
|
||||
public RunningContainer Container { get; }
|
||||
@ -86,7 +90,30 @@ namespace DistTestCore.Codex
|
||||
|
||||
/// <summary>
/// Creates an <c>Http</c> client bound to this node's address and the
/// "/api/codex/v1" base URL.
/// </summary>
/// <remarks>
/// <c>CheckContainerCrashed</c> runs first, so no client is handed out once the
/// container has been flagged as crashed.
/// </remarks>
private Http Http()
{
    CheckContainerCrashed();

    var codexApi = new Http(log, timeSet, Address, baseUrl: "/api/codex/v1", Container.Name);
    return codexApi;
}
|
||||
|
||||
/// <summary>
/// Guard that fails when this container has been flagged as crashed.
/// </summary>
/// <exception cref="Exception">Thrown when <c>hasContainerCrashed</c> is set.</exception>
private void CheckContainerCrashed()
{
    if (!hasContainerCrashed)
    {
        return;
    }

    throw new Exception("Container has crashed.");
}
|
||||
|
||||
/// <summary>
/// Receives the log stream of this container after a crash: flags the container
/// as crashed and copies the stream, line by line, into a new sub-file of the log.
/// </summary>
/// <param name="crashLog">Stream with the crashed container's log output.</param>
public void Log(Stream crashLog)
{
    // Record the crash before touching the stream. If creating the sub-file or
    // reading the stream throws, the container must still count as crashed so
    // that later Http() calls are rejected by CheckContainerCrashed().
    hasContainerCrashed = true;

    var file = log.CreateSubfile();
    log.Log($"Container {Container.Name} has crashed. Downloading crash log to '{file.FullFilename}'...");

    using var reader = new StreamReader(crashLog);
    for (var line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
        file.Write(line);
    }

    log.Log("Crash log successfully downloaded.");
}
|
||||
}
|
||||
}
|
||||
|
@ -47,7 +47,11 @@ namespace DistTestCore
|
||||
{
|
||||
LogStart($"Stopping {group.Describe()}...");
|
||||
var workflow = CreateWorkflow();
|
||||
foreach (var c in group.Containers) workflow.Stop(c);
|
||||
foreach (var c in group.Containers)
|
||||
{
|
||||
StopCrashWatcher(c);
|
||||
workflow.Stop(c);
|
||||
}
|
||||
RunningGroups.Remove(group);
|
||||
LogEnd("Stopped.");
|
||||
}
|
||||
@ -96,7 +100,9 @@ namespace DistTestCore
|
||||
for (var i = 0; i < numberOfNodes; i++)
|
||||
{
|
||||
var workflow = CreateWorkflow();
|
||||
result.Add(workflow.Start(1, location, recipe, startupConfig));
|
||||
var rc = workflow.Start(1, location, recipe, startupConfig);
|
||||
CreateCrashWatcher(workflow, rc);
|
||||
result.Add(rc);
|
||||
}
|
||||
return result.ToArray();
|
||||
}
|
||||
@ -134,5 +140,19 @@ namespace DistTestCore
|
||||
{
|
||||
Log("----------------------------------------------------------------------------");
|
||||
}
|
||||
|
||||
/// <summary>
/// Asks <paramref name="workflow"/> for a crash watcher for the one container in
/// <paramref name="rc"/> and stores it on that container's <c>CrashWatcher</c> property.
/// </summary>
/// <remarks>
/// Uses <c>Single()</c>, so <paramref name="rc"/> must hold exactly one container.
/// </remarks>
private void CreateCrashWatcher(StartupWorkflow workflow, RunningContainers rc)
{
    var container = rc.Containers.Single();
    var watcher = workflow.CreateCrashWatcher(container);
    container.CrashWatcher = watcher;
}
|
||||
|
||||
/// <summary>
/// Stops the crash watcher of every container in <paramref name="containers"/>.
/// Containers without a watcher are skipped.
/// </summary>
private void StopCrashWatcher(RunningContainers containers)
{
    foreach (var container in containers.Containers)
    {
        var watcher = container.CrashWatcher;
        if (watcher == null)
        {
            continue;
        }

        watcher.Stop();
    }
}
|
||||
}
|
||||
}
|
||||
|
85
KubernetesWorkflow/CrashWatcher.cs
Normal file
85
KubernetesWorkflow/CrashWatcher.cs
Normal file
@ -0,0 +1,85 @@
|
||||
using k8s;
|
||||
using Logging;
|
||||
|
||||
namespace KubernetesWorkflow
|
||||
{
|
||||
/// <summary>
/// Polls the pod of a single container on a background task. As soon as any
/// container status in the pod reports a restart, the log of the previous
/// container instance is handed to the registered <see cref="ILogHandler"/>
/// and the watcher finishes.
/// </summary>
public class CrashWatcher
{
    // Delay between two consecutive pod status polls.
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(10);

    private readonly BaseLog log;
    private readonly KubernetesClientConfiguration config;
    private readonly string k8sNamespace;
    private readonly RunningContainer container;
    private ILogHandler? logHandler;
    private CancellationTokenSource cts;
    private Task? worker;
    private Exception? workerException;

    /// <summary>Creates a watcher for <paramref name="container"/>; nothing runs until <see cref="Start"/>.</summary>
    /// <param name="log">Log that receives the crash-detected message.</param>
    /// <param name="config">Configuration used to create the watcher's own Kubernetes client.</param>
    /// <param name="k8sNamespace">Namespace in which the container's pod is looked up.</param>
    /// <param name="container">Container whose pod is monitored.</param>
    public CrashWatcher(BaseLog log, KubernetesClientConfiguration config, string k8sNamespace, RunningContainer container)
    {
        this.log = log;
        this.config = config;
        this.k8sNamespace = k8sNamespace;
        this.container = container;
        cts = new CancellationTokenSource();
    }

    /// <summary>Starts the background monitoring task.</summary>
    /// <param name="logHandler">Receives the crash log stream when a restart is detected.</param>
    /// <exception cref="InvalidOperationException">The watcher is already running.</exception>
    public void Start(ILogHandler logHandler)
    {
        if (worker != null) throw new InvalidOperationException();

        this.logHandler = logHandler;

        // Do not let a failure from a previous run resurface on the next Stop().
        workerException = null;

        // Release the source being replaced (from the constructor or a previous run).
        cts.Dispose();
        cts = new CancellationTokenSource();

        // Capture the token here so the worker never reads the mutable field.
        var token = cts.Token;
        worker = Task.Run(() => Worker(token));
    }

    /// <summary>Cancels the monitoring task and blocks until it has finished.</summary>
    /// <exception cref="InvalidOperationException">The watcher is not running.</exception>
    /// <exception cref="Exception">The worker failed; the original failure is the inner exception.</exception>
    public void Stop()
    {
        if (worker == null) throw new InvalidOperationException();

        cts.Cancel();
        worker.Wait();
        worker = null;

        // The worker has exited, so nothing uses the token's wait handle any more.
        cts.Dispose();

        if (workerException != null) throw new Exception("Exception occurred in CrashWatcher worker thread.", workerException);
    }

    // Task body: runs the monitor loop and stores any failure so Stop() can report it.
    private void Worker(CancellationToken token)
    {
        try
        {
            MonitorContainer(token);
        }
        catch (Exception ex)
        {
            workerException = ex;
        }
    }

    // Polls the pod every PollInterval until cancelled or until a restart is seen.
    private void MonitorContainer(CancellationToken token)
    {
        using var client = new Kubernetes(config);

        // WaitOne returns true when the token is cancelled during the wait. Leaving
        // the loop right there avoids one more API round-trip after Stop().
        while (!token.WaitHandle.WaitOne(PollInterval))
        {
            var pod = container.Pod;
            var recipe = container.Recipe;
            var podName = pod.PodInfo.Name;
            var podInfo = client.ReadNamespacedPod(podName, k8sNamespace);

            // NOTE(review): the status list is assumed to be absent (null) while the
            // pod has not reported container statuses yet; treat that as "no restarts".
            var statuses = podInfo.Status?.ContainerStatuses;
            if (statuses != null && statuses.Any(c => c.RestartCount > 0))
            {
                DownloadCrashedContainerLogs(client, podName, recipe);
                return;
            }
        }
    }

    // Fetches the log of the previous container instance and passes it to the handler.
    private void DownloadCrashedContainerLogs(Kubernetes client, string podName, ContainerRecipe recipe)
    {
        log.Log("Pod crash detected for " + container.Name);
        using var stream = client.ReadNamespacedPodLog(podName, k8sNamespace, recipe.Name, previous: true);
        logHandler!.Log(stream);
    }
}
|
||||
}
|
@ -604,29 +604,9 @@ namespace KubernetesWorkflow
|
||||
|
||||
#endregion
|
||||
|
||||
public Task WatchForCrashLogs(RunningContainer container, CancellationToken token, ILogHandler logHandler)
|
||||
public CrashWatcher CreateCrashWatcher(RunningContainer container)
|
||||
{
|
||||
return Task.Run(() =>
|
||||
{
|
||||
var myOwnClient = new Kubernetes(cluster.GetK8sClientConfig());
|
||||
while (!token.IsCancellationRequested)
|
||||
{
|
||||
token.WaitHandle.WaitOne(TimeSpan.FromSeconds(3));
|
||||
|
||||
var pod = container.Pod;
|
||||
var recipe = container.Recipe;
|
||||
var podName = pod.PodInfo.Name;
|
||||
var podInfo = myOwnClient.ReadNamespacedPod(podName, K8sTestNamespace);
|
||||
|
||||
if (podInfo.Status.ContainerStatuses.Any(c => c.RestartCount > 0))
|
||||
{
|
||||
log.Log("Pod crash detected for " + container.Name);
|
||||
using var stream = myOwnClient.ReadNamespacedPodLog(podName, K8sTestNamespace, recipe.Name, previous: true);
|
||||
logHandler.Log(stream);
|
||||
return;
|
||||
}
|
||||
}
|
||||
});
|
||||
return new CrashWatcher(log, cluster.GetK8sClientConfig(), K8sTestNamespace, container);
|
||||
}
|
||||
|
||||
private PodInfo FetchNewPod()
|
||||
|
@ -1,4 +1,5 @@
|
||||
using Utils;
|
||||
using Newtonsoft.Json;
|
||||
using Utils;
|
||||
|
||||
namespace KubernetesWorkflow
|
||||
{
|
||||
@ -39,6 +40,9 @@ namespace KubernetesWorkflow
|
||||
public Port[] ServicePorts { get; }
|
||||
public Address ClusterExternalAddress { get; }
|
||||
public Address ClusterInternalAddress { get; }
|
||||
|
||||
[JsonIgnore]
|
||||
public CrashWatcher? CrashWatcher { get; set; }
|
||||
}
|
||||
|
||||
public static class RunningContainersExtensions
|
||||
|
@ -37,9 +37,9 @@ namespace KubernetesWorkflow
|
||||
}, pl);
|
||||
}
|
||||
|
||||
public Task WatchForCrashLogs(RunningContainer container, CancellationToken token, ILogHandler logHandler)
|
||||
public CrashWatcher CreateCrashWatcher(RunningContainer container)
|
||||
{
|
||||
return K8s(controller => controller.WatchForCrashLogs(container, token, logHandler));
|
||||
return K8s(controller => controller.CreateCrashWatcher(container));
|
||||
}
|
||||
|
||||
public void Stop(RunningContainers runningContainers)
|
||||
|
@ -1,7 +1,6 @@
|
||||
using DistTestCore;
|
||||
using KubernetesWorkflow;
|
||||
using NUnit.Framework;
|
||||
using System.ComponentModel;
|
||||
using Utils;
|
||||
|
||||
namespace Tests.BasicTests
|
||||
@ -73,53 +72,41 @@ namespace Tests.BasicTests
|
||||
|
||||
var nodes = group.Cast<OnlineCodexNode>().ToArray();
|
||||
|
||||
var flow = Get().WorkflowCreator.CreateWorkflow();
|
||||
var cst = new CancellationTokenSource();
|
||||
var tasks = nodes.Select(n => flow.WatchForCrashLogs(n.CodexAccess.Container, cst.Token, this)).ToArray();
|
||||
//foreach (var node in nodes)
|
||||
//{
|
||||
// node.Marketplace.MakeStorageAvailable(
|
||||
// size: 1.GB(),
|
||||
// minPricePerBytePerSecond: 1.TestTokens(),
|
||||
// maxCollateral: 1024.TestTokens(),
|
||||
// maxDuration: TimeSpan.FromMinutes(5));
|
||||
//}
|
||||
|
||||
try
|
||||
{
|
||||
//foreach (var node in nodes)
|
||||
//{
|
||||
// node.Marketplace.MakeStorageAvailable(
|
||||
// size: 1.GB(),
|
||||
// minPricePerBytePerSecond: 1.TestTokens(),
|
||||
// maxCollateral: 1024.TestTokens(),
|
||||
// maxDuration: TimeSpan.FromMinutes(5));
|
||||
//}
|
||||
Thread.Sleep(2000);
|
||||
|
||||
Thread.Sleep(2000);
|
||||
Log("calling crash...");
|
||||
var http = new Http(Get().Log, Get().TimeSet, nodes.First().CodexAccess.Address, baseUrl: "/api/codex/v1", nodes.First().CodexAccess.Container.Name);
|
||||
var str = http.HttpGetString("debug/crash");
|
||||
|
||||
Log("calling crash...");
|
||||
var http = new Http(Get().Log, Get().TimeSet, nodes.First().CodexAccess.Address, baseUrl: "/api/codex/v1", nodes.First().CodexAccess.Container.Name);
|
||||
var str = http.HttpGetString("debug/crash");
|
||||
Log("crash called.");
|
||||
|
||||
Log("crash called.");
|
||||
Thread.Sleep(TimeSpan.FromSeconds(60));
|
||||
|
||||
Thread.Sleep(TimeSpan.FromSeconds(60));
|
||||
Log("test done.");
|
||||
|
||||
Log("test done.");
|
||||
//var endTime = DateTime.UtcNow + TimeSpan.FromHours(2);
|
||||
//while (DateTime.UtcNow < endTime)
|
||||
//{
|
||||
// foreach (var node in nodes)
|
||||
// {
|
||||
// var file = GenerateTestFile(80.MB());
|
||||
// var cid = node.UploadFile(file);
|
||||
|
||||
//var endTime = DateTime.UtcNow + TimeSpan.FromHours(2);
|
||||
//while (DateTime.UtcNow < endTime)
|
||||
//{
|
||||
// foreach (var node in nodes)
|
||||
// {
|
||||
// var file = GenerateTestFile(80.MB());
|
||||
// var cid = node.UploadFile(file);
|
||||
// var dl = node.DownloadContent(cid);
|
||||
// file.AssertIsEqual(dl);
|
||||
// }
|
||||
|
||||
// var dl = node.DownloadContent(cid);
|
||||
// file.AssertIsEqual(dl);
|
||||
// }
|
||||
|
||||
// Thread.Sleep(TimeSpan.FromSeconds(30));
|
||||
//}
|
||||
}
|
||||
finally
|
||||
{
|
||||
cst.Cancel();
|
||||
foreach (var t in tasks) t.Wait();
|
||||
}
|
||||
// Thread.Sleep(TimeSpan.FromSeconds(30));
|
||||
//}
|
||||
}
|
||||
|
||||
public void Log(Stream log)
|
||||
|
Loading…
x
Reference in New Issue
Block a user