diff --git a/k8s/clusters/benchmarks/README.md b/k8s/clusters/benchmarks/README.md new file mode 100644 index 0000000..0f8e4e2 --- /dev/null +++ b/k8s/clusters/benchmarks/README.md @@ -0,0 +1,96 @@ +# Benchmark Cluster Infrastructure + +This directory contains Kubernetes manifests for the benchmark cluster's logging infrastructure. + +## Overview + +### Vector Log Collection + +Vector is deployed as a distributed logging system to collect logs from Codex benchmark experiments: + +**Vector Agent (DaemonSet):** +- Collects logs from benchmark pods (filtered by label: `app.kubernetes.io/name=codex-benchmarks`) +- Forwards compressed logs to Vector Aggregator via port 6000 + +**Vector Aggregator:** +- Receives logs from all Vector agents +- Writes consolidated logs to PVC as JSONL files: `/vector-logs/benchmarks-YYYY-MM-DD.jsonl` +- Used by log-parsing Argo workflow for post-experiment processing + +**Persistent Volume:** +- Stores collected JSONL logs +- Mounted by Vector Aggregator and log-parsing workflow +- Allows logs to persist between workflow runs + +**S3 Secret:** +- Credentials for uploading processed logs to S3-compatible storage +- Used by log-parsing workflow's tar-and-upload step + +**RBAC:** +- Service account and cluster role for Vector to access Kubernetes API +- Required for reading pod logs cluster-wide + +## Installation Order + +### 1. Create namespace (if not exists) + +```bash +kubectl create namespace argo +``` + +### 2. Apply Vector components + +```bash +kubectl apply -f vector/vector-pvc.yaml -n argo +kubectl apply -f vector/vector-agent-configmap.yaml -n argo +kubectl apply -f vector/vector-configmap.yaml -n argo +kubectl apply -f vector/vector-aggregator-configmap.yaml -n argo +kubectl apply -f vector/vector-deployment.yaml -n argo +kubectl apply -f vector/vector-aggregator-deployment.yaml -n argo +``` + +### 3. 
Configure S3 access + +```bash +# Edit s3-secret.yaml with your credentials first +kubectl apply -f s3-secret.yaml -n argo +``` + +### 4. Configure Vector RBAC + +```bash +kubectl apply -f vector/vector-aggregator-rbac.yaml -n argo +``` + +## Verification + +### Check Vector Agent status +```bash +kubectl get daemonset -n argo | grep vector +kubectl get pods -n argo -l app.kubernetes.io/name=vector +``` + +### Check Vector Aggregator status +```bash +kubectl get deployment -n argo | grep vector-aggregator +``` + +### Check PVC status +```bash +kubectl get pvc -n argo vector-logs-pvc +``` + +## Troubleshooting + +### Vector Agent not collecting logs +- Verify pod labels: `kubectl get pods -n codex-benchmarks --show-labels` +- Check agent logs: `kubectl logs -n argo -l app.kubernetes.io/name=vector` +- Ensure RBAC is applied: `kubectl get clusterrole vector-agent` + +### Logs not appearing in PVC +- Check aggregator connection: `kubectl logs -n argo deployment/vector-aggregator | grep error` +- Verify PVC is mounted: `kubectl describe pod -n argo <vector-aggregator-pod>` + +### S3 upload failures +- Verify secret exists: `kubectl get secret -n argo s3-codex-benchmarks` +- Check credentials are correct (not placeholders) diff --git a/k8s/clusters/benchmarks/s3-secret.yaml b/k8s/clusters/benchmarks/s3-secret.yaml new file mode 100644 index 0000000..2ba4b38 --- /dev/null +++ b/k8s/clusters/benchmarks/s3-secret.yaml @@ -0,0 +1,21 @@ +# S3 credentials secret for uploading benchmark logs +# Replace the placeholder values with your actual S3 credentials before applying +# +# For Digital Ocean Spaces: +# AWS_ENDPOINT_URL: https://<region>.digitaloceanspaces.com (e.g., https://fra1.digitaloceanspaces.com) +# AWS_ACCESS_KEY_ID: Your Spaces access key +# AWS_SECRET_ACCESS_KEY: Your Spaces secret key +# +# Apply with: +# kubectl apply -f s3-secret.yaml -n argo + +apiVersion: v1 +kind: Secret +metadata: + name: s3-codex-benchmarks + namespace: argo +type: Opaque +stringData: + AWS_ENDPOINT_URL: 
"https://.digitaloceanspaces.com" + AWS_ACCESS_KEY_ID: "ACCESS_KEY_ID" + AWS_SECRET_ACCESS_KEY: "SECRET_ACCESS_KEY" diff --git a/k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml b/k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml new file mode 100644 index 0000000..53ac805 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-agent-configmap.yaml @@ -0,0 +1,41 @@ +# Vector Agent configuration for DaemonSet +# Collects logs from pods on each node and forwards to Vector Aggregator + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-agent-config + namespace: argo +data: + vector.yaml: | + data_dir: /vector-data + + sources: + benchmark-experiments-source: + type: kubernetes_logs + extra_label_selector: "app.kubernetes.io/name=codex-benchmarks" + node_annotation_fields: + node_labels: "" + pod_annotation_fields: + container_id: "" + container_image: "" + container_image_id: "" + pod_node_name: "" + pod_owner: "" + pod_uid: "" + pod_ip: "" + pod_ips: "" + glob_minimum_cooldown_ms: 5000 + + sinks: + vector-aggregator: + type: vector + inputs: [benchmark-experiments-source] + address: vector-aggregator.argo.svc.cluster.local:6000 + compression: true + acknowledgements: + enabled: true + buffer: + type: disk + max_size: 301989888 # 288 MiB (~302 MB) per agent + when_full: block # Block new logs if buffer full (prevents drops) diff --git a/k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml b/k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml new file mode 100644 index 0000000..d4235f7 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-aggregator-configmap.yaml @@ -0,0 +1,32 @@ +# Vector Aggregator configuration +# Receives logs from Vector Agents and writes to file + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-config + namespace: argo +data: + vector.yaml: | + data_dir: /vector-data + + sources: + vector-agents: + type: vector + address: 0.0.0.0:6000 + acknowledgements: + enabled: true + + sinks: 
+ file-output: + type: file + inputs: [vector-agents] + path: "/vector-logs/benchmarks-%Y-%m-%d.jsonl" + encoding: + codec: json + acknowledgements: + enabled: true + buffer: + type: disk + max_size: 301989888 # 288 MiB (~302 MB) for aggregator + when_full: block diff --git a/k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml b/k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml new file mode 100644 index 0000000..6714545 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-aggregator-deployment.yaml @@ -0,0 +1,93 @@ +# Vector Aggregator Deployment +# Receives logs from Vector Agents (DaemonSet) and writes to PVC + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-aggregator + namespace: argo + labels: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator + template: + metadata: + labels: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator + spec: + serviceAccountName: vector + # Schedule aggregator on infra node to avoid competing with benchmark workloads + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - infra + containers: + - name: vector + image: timberio/vector:0.34.0-distroless-libc + args: + - --config + - /etc/vector/vector.yaml + env: + - name: VECTOR_LOG + value: "info" + ports: + - containerPort: 6000 + name: vector + protocol: TCP + volumeMounts: + - name: config + mountPath: /etc/vector + readOnly: true + - name: data + mountPath: /vector-data + - name: vector-logs + mountPath: /vector-logs + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 2000m + memory: 2Gi + volumes: 
- name: config + configMap: + name: vector-aggregator-config + - name: data + emptyDir: {} + - name: vector-logs + persistentVolumeClaim: + claimName: vector-logs-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-aggregator + namespace: argo + labels: + app.kubernetes.io/name: vector-aggregator +spec: + type: ClusterIP + ports: + - port: 6000 + targetPort: 6000 + protocol: TCP + name: vector + selector: + app.kubernetes.io/name: vector-aggregator + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Aggregator diff --git a/k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml b/k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml new file mode 100644 index 0000000..9ab165a --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-aggregator-rbac.yaml @@ -0,0 +1,27 @@ +# RBAC permissions for log parsing workflow to scale Vector Aggregator +# Allows the workflow to scale down/up the aggregator to access the RWO PVC + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vector-aggregator-scaler + namespace: argo +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "patch", "update"] + resourceNames: ["vector-aggregator"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: codex-workflows-vector-aggregator-scaler + namespace: argo +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: vector-aggregator-scaler +subjects: + - kind: ServiceAccount + name: codex-benchmarks-workflows + namespace: argo diff --git a/k8s/clusters/benchmarks/vector/vector-configmap.yaml b/k8s/clusters/benchmarks/vector/vector-configmap.yaml new file mode 100644 index 0000000..157350c --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-configmap.yaml @@ -0,0 +1,36 @@ +# Vector configuration for collecting benchmark pod logs +# This ConfigMap contains the Vector pipeline configuration + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-config + 
namespace: argo +data: + vector.yaml: | + data_dir: /vector-data + + sources: + benchmark-experiments-source: + type: kubernetes_logs + extra_label_selector: "app.kubernetes.io/name=codex-benchmarks" + node_annotation_fields: + node_labels: "" + pod_annotation_fields: + container_id: "" + container_image: "" + container_image_id: "" + pod_node_name: "" + pod_owner: "" + pod_uid: "" + pod_ip: "" + pod_ips: "" + glob_minimum_cooldown_ms: 5000 + + sinks: + output: + type: file + inputs: [benchmark-experiments-source] + path: "/vector-logs/benchmarks-%Y-%m-%d.jsonl" + encoding: + codec: json diff --git a/k8s/clusters/benchmarks/vector/vector-deployment.yaml b/k8s/clusters/benchmarks/vector/vector-deployment.yaml new file mode 100644 index 0000000..0cb0104 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-deployment.yaml @@ -0,0 +1,112 @@ +# Vector Agent DaemonSet for collecting benchmark logs +# Runs on each node to collect logs from local pods and forward to Vector Aggregator + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vector + namespace: argo +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vector-agent +rules: + - apiGroups: + - "" + resources: + - namespaces + - nodes + - pods + verbs: + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vector-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: vector-agent +subjects: + - kind: ServiceAccount + name: vector + namespace: argo +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: vector + namespace: argo + labels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent +spec: + selector: + matchLabels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + template: + metadata: + labels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + 
app.kubernetes.io/component: Agent + spec: + serviceAccountName: vector + containers: + - name: vector + image: timberio/vector:0.34.0-distroless-libc + args: + - --config + - /etc/vector/vector.yaml + env: + - name: VECTOR_SELF_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: VECTOR_SELF_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: VECTOR_SELF_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: VECTOR_LOG + value: "info" + volumeMounts: + - name: config + mountPath: /etc/vector + readOnly: true + - name: data + mountPath: /vector-data + - name: var-log + mountPath: /var/log + readOnly: true + - name: var-lib + mountPath: /var/lib + readOnly: true + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 1000m + memory: 1Gi + volumes: + - name: config + configMap: + name: vector-agent-config + - name: data + emptyDir: {} + - name: var-log + hostPath: + path: /var/log + - name: var-lib + hostPath: + path: /var/lib diff --git a/k8s/clusters/benchmarks/vector/vector-pvc.yaml b/k8s/clusters/benchmarks/vector/vector-pvc.yaml new file mode 100644 index 0000000..2aa4a73 --- /dev/null +++ b/k8s/clusters/benchmarks/vector/vector-pvc.yaml @@ -0,0 +1,15 @@ +# Persistent Volume Claim for Vector to collect benchmark logs. +# This PVC uses block storage which will be mounted by Vector Aggregator to write logs. + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vector-logs-pvc + namespace: argo +spec: + accessModes: + - ReadWriteOnce + storageClassName: do-block-storage + resources: + requests: + storage: 50Gi