From 0c8e28fa468c194cc95b0ee63fab7604e617c61e Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Tue, 21 Oct 2025 12:41:13 +0300 Subject: [PATCH] feat(k8s): add Vector logging infrastructure for benchmarks Add Vector agent/aggregator deployment for collecting logs from Codex benchmark experiments in K8s. Includes PVC for log storage, S3 secret template and RBAC. Vector collects logs from benchmark pods and writes JSONL files for post-processing by the log-parsing workflow. Signed-off-by: Chrysostomos Nanakos --- k8s/clusters/benchmarks/README.md | 96 +++++++++++++++ k8s/clusters/benchmarks/s3-secret.yaml | 21 ++++ .../vector/vector-agent-configmap.yaml | 41 +++++++ .../vector/vector-aggregator-configmap.yaml | 32 +++++ .../vector/vector-aggregator-deployment.yaml | 93 +++++++++++++++ .../vector/vector-aggregator-rbac.yaml | 27 +++++ .../benchmarks/vector/vector-configmap.yaml | 36 ++++++ .../benchmarks/vector/vector-deployment.yaml | 112 ++++++++++++++++++ .../benchmarks/vector/vector-pvc.yaml | 15 +++ 9 files changed, 473 insertions(+) create mode 100644 k8s/clusters/benchmarks/README.md create mode 100644 k8s/clusters/benchmarks/s3-secret.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-configmap.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-deployment.yaml create mode 100644 k8s/clusters/benchmarks/vector/vector-pvc.yaml diff --git a/k8s/clusters/benchmarks/README.md b/k8s/clusters/benchmarks/README.md new file mode 100644 index 0000000..0f8e4e2 --- /dev/null +++ b/k8s/clusters/benchmarks/README.md @@ -0,0 +1,96 @@ +# Benchmark Cluster Infrastructure + +This directory contains Kubernetes manifests for the 
benchmark cluster's logging infrastructure. + +## Overview + +### Vector Log Collection + +Vector is deployed as a distributed logging system to collect logs from Codex benchmark experiments: + +**Vector Agent (DaemonSet):** +- Collects logs from benchmark pods (filtered by label: `app.kubernetes.io/name=codex-benchmarks`) +- Forwards compressed logs to Vector Aggregator via port 6000 + +**Vector Aggregator:** +- Receives logs from all Vector agents +- Writes consolidated logs to PVC as JSONL files: `/vector-logs/benchmarks-YYYY-MM-DD.jsonl` +- Used by log-parsing Argo workflow for post-experiment processing + +**Persistent Volume:** +- Stores collected JSONL logs +- Mounted by Vector Aggregator and log-parsing workflow +- Allows logs to persist between workflow runs + +**S3 Secret:** +- Credentials for uploading processed logs to S3-compatible storage +- Used by log-parsing workflow's tar-and-upload step + +**RBAC:** +- Service account and cluster role for Vector to access Kubernetes API +- Required for reading pod logs cluster-wide + +## Installation Order + +### 1. Create namespace (if not exists) + +```bash +kubectl create namespace argo +``` + +### 2. Apply Vector components + +```bash +kubectl apply -f vector/vector-pvc.yaml -n argo +kubectl apply -f vector/vector-agent-configmap.yaml -n argo +kubectl apply -f vector/vector-configmap.yaml -n argo +kubectl apply -f vector/vector-aggregator-configmap.yaml -n argo +kubectl apply -f vector/vector-deployment.yaml -n argo +kubectl apply -f vector/vector-aggregator-deployment.yaml -n argo +``` + +### 3. Configure S3 access + +```bash +# Edit s3-secret.yaml with your credentials first +kubectl apply -f s3-secret.yaml -n argo +``` + +### 4. 
Configure Vector RBAC + +```bash +kubectl apply -f vector/vector-aggregator-rbac.yaml -n argo +``` + +## Verification + +### Check Vector Agent status +```bash +kubectl get daemonset -n argo | grep vector +kubectl get pods -n argo -l app.kubernetes.io/name=vector +``` + +### Check Vector Aggregator status +```bash +kubectl get deployment -n argo | grep vector-aggregator +``` + +### Check PVC status +```bash +kubectl get pvc -n argo vector-logs-pvc +``` + +## Troubleshooting + +### Vector Agent not collecting logs +- Verify pod labels: `kubectl get pods -n codex-benchmarks --show-labels` +- Check agent logs: `kubectl logs -n argo -l app.kubernetes.io/name=vector` +- Ensure RBAC is applied: `kubectl get clusterrole vector-agent` + +### Logs not appearing in PVC +- Check aggregator connection: `kubectl logs -n argo deployment/vector-aggregator | grep error` +- Verify PVC is mounted: `kubectl describe pod -n argo <vector-aggregator-pod-name>` + +### S3 upload failures +- Verify secret exists: `kubectl get secret -n argo s3-codex-benchmarks` +- Check credentials are correct (not placeholders) diff --git a/k8s/clusters/benchmarks/s3-secret.yaml b/k8s/clusters/benchmarks/s3-secret.yaml new file mode 100644 index 0000000..2ba4b38 --- /dev/null +++ b/k8s/clusters/benchmarks/s3-secret.yaml @@ -0,0 +1,21 @@ +# S3 credentials secret for uploading benchmark logs +# Replace the placeholder values with your actual S3 credentials before applying +# +# For Digital Ocean Spaces: +# AWS_ENDPOINT_URL: https://<region>.digitaloceanspaces.com (e.g., https://fra1.digitaloceanspaces.com) +# AWS_ACCESS_KEY_ID: Your Spaces access key +# AWS_SECRET_ACCESS_KEY: Your Spaces secret key +# +# Apply with: +# kubectl apply -f s3-secret.yaml -n argo + +apiVersion: v1 +kind: Secret +metadata: + name: s3-codex-benchmarks + namespace: argo +type: Opaque +stringData: + AWS_ENDPOINT_URL: "https://.digitaloceanspaces.com" + AWS_ACCESS_KEY_ID: "ACCESS_KEY_ID" + AWS_SECRET_ACCESS_KEY: "SECRET_ACCESS_KEY" diff --git
a/k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml b/k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml new file mode 100644 index 0000000..53ac805 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml @@ -0,0 +1,41 @@ +# Vector Agent configuration for DaemonSet +# Collects logs from pods on each node and forwards to Vector Aggregator + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-agent-config + namespace: argo +data: + vector.yaml: | + data_dir: /vector-data + + sources: + benchmark-experiments-source: + type: kubernetes_logs + extra_label_selector: "app.kubernetes.io/name=codex-benchmarks" + node_annotation_fields: + node_labels: "" + pod_annotation_fields: + container_id: "" + container_image: "" + container_image_id: "" + pod_node_name: "" + pod_owner: "" + pod_uid: "" + pod_ip: "" + pod_ips: "" + glob_minimum_cooldown_ms: 5000 + + sinks: + vector-aggregator: + type: vector + inputs: [benchmark-experiments-source] + address: vector-aggregator.argo.svc.cluster.local:6000 + compression: true + acknowledgements: + enabled: true + buffer: + type: disk + max_size: 301989888 # 288 MiB (~302 MB) per agent + when_full: block # Block new logs if buffer full (prevents drops) diff --git a/k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml b/k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml new file mode 100644 index 0000000..d4235f7 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml @@ -0,0 +1,32 @@ +# Vector Aggregator configuration +# Receives logs from Vector Agents and writes to file + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-config + namespace: argo +data: + vector.yaml: | + data_dir: /vector-data + + sources: + vector-agents: + type: vector + address: 0.0.0.0:6000 + acknowledgements: + enabled: true + + sinks: + file-output: + type: file + inputs: [vector-agents] + path: "/vector-logs/benchmarks-%Y-%m-%d.jsonl" + encoding: + codec:
json + acknowledgements: + enabled: true + buffer: + type: disk + max_size: 301989888 # 288 MiB (~302 MB) for aggregator + when_full: block diff --git a/k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml b/k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml new file mode 100644 index 0000000..6714545 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml @@ -0,0 +1,93 @@ +# Vector Aggregator Deployment +# Receives logs from Vector Agents (DaemonSet) and writes to PVC + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-aggregator + namespace: argo + labels: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator + template: + metadata: + labels: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator + spec: + serviceAccountName: vector + # Schedule aggregator on infra node to avoid competing with benchmark workloads + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - infra + containers: + - name: vector + image: timberio/vector:0.34.0-distroless-libc + args: + - --config + - /etc/vector/vector.yaml + env: + - name: VECTOR_LOG + value: "info" + ports: + - containerPort: 6000 + name: vector + protocol: TCP + volumeMounts: + - name: config + mountPath: /etc/vector + readOnly: true + - name: data + mountPath: /vector-data + - name: vector-logs + mountPath: /vector-logs + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 2000m + memory: 2Gi + volumes: + - name: config + configMap: + name: vector-aggregator-config + - name: data + emptyDir: {} + - name: vector-logs +
persistentVolumeClaim: + claimName: vector-logs-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-aggregator + namespace: argo + labels: + app.kubernetes.io/name: vector-aggregator +spec: + type: ClusterIP + ports: + - port: 6000 + targetPort: 6000 + protocol: TCP + name: vector + selector: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator diff --git a/k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml b/k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml new file mode 100644 index 0000000..9ab165a --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml @@ -0,0 +1,27 @@ +# RBAC permissions for log parsing workflow to scale Vector Aggregator +# Allows the workflow to scale down/up the aggregator to access the RWO PVC + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vector-aggregator-scaler + namespace: argo +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "patch", "update"] + resourceNames: ["vector-aggregator"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: codex-workflows-vector-aggregator-scaler + namespace: argo +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: vector-aggregator-scaler +subjects: + - kind: ServiceAccount + name: codex-benchmarks-workflows + namespace: argo diff --git a/k8s/clusters/benchmarks/vector/vector-configmap.yaml b/k8s/clusters/benchmarks/vector/vector-configmap.yaml new file mode 100644 index 0000000..157350c --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-configmap.yaml @@ -0,0 +1,36 @@ +# Vector configuration for collecting benchmark pod logs +# This ConfigMap contains the Vector pipeline configuration + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-config + namespace: argo +data: + vector.yaml: | + data_dir: /vector-data + + sources: + benchmark-experiments-source: + type: 
kubernetes_logs + extra_label_selector: "app.kubernetes.io/name=codex-benchmarks" + node_annotation_fields: + node_labels: "" + pod_annotation_fields: + container_id: "" + container_image: "" + container_image_id: "" + pod_node_name: "" + pod_owner: "" + pod_uid: "" + pod_ip: "" + pod_ips: "" + glob_minimum_cooldown_ms: 5000 + + sinks: + output: + type: file + inputs: [benchmark-experiments-source] + path: "/vector-logs/benchmarks-%Y-%m-%d.jsonl" + encoding: + codec: json diff --git a/k8s/clusters/benchmarks/vector/vector-deployment.yaml b/k8s/clusters/benchmarks/vector/vector-deployment.yaml new file mode 100644 index 0000000..0cb0104 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-deployment.yaml @@ -0,0 +1,112 @@ +# Vector Agent DaemonSet for collecting benchmark logs +# Runs on each node to collect logs from local pods and forward to Vector Aggregator + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vector + namespace: argo +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vector-agent +rules: + - apiGroups: + - "" + resources: + - namespaces + - nodes + - pods + verbs: + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vector-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: vector-agent +subjects: + - kind: ServiceAccount + name: vector + namespace: argo +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: vector + namespace: argo + labels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent +spec: + selector: + matchLabels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + template: + metadata: + labels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + spec: + serviceAccountName: vector + containers: + - name: vector + image: 
timberio/vector:0.34.0-distroless-libc + args: + - --config + - /etc/vector/vector.yaml + env: + - name: VECTOR_SELF_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: VECTOR_SELF_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: VECTOR_SELF_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: VECTOR_LOG + value: "info" + volumeMounts: + - name: config + mountPath: /etc/vector + readOnly: true + - name: data + mountPath: /vector-data + - name: var-log + mountPath: /var/log + readOnly: true + - name: var-lib + mountPath: /var/lib + readOnly: true + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 1000m + memory: 1Gi + volumes: + - name: config + configMap: + name: vector-agent-config + - name: data + emptyDir: {} + - name: var-log + hostPath: + path: /var/log + - name: var-lib + hostPath: + path: /var/lib diff --git a/k8s/clusters/benchmarks/vector/vector-pvc.yaml b/k8s/clusters/benchmarks/vector/vector-pvc.yaml new file mode 100644 index 0000000..2aa4a73 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-pvc.yaml @@ -0,0 +1,15 @@ +# Persistent Volume Claim for Vector to collect benchmark logs. +# This PVC uses block storage which will be mounted by Vector Aggregator to write logs. + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vector-logs-pvc + namespace: argo +spec: + accessModes: + - ReadWriteOnce + storageClassName: do-block-storage + resources: + requests: + storage: 50Gi