adding betelgeusebytes.io devops part

Commit dfdd36db3f by salahangal, 2026-01-25 21:15:43 +01:00
95 changed files with 8869 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

177
CLAUDE.md Normal file
View File

@ -0,0 +1,177 @@
# CLAUDE.md - BetelgeuseBytes Full Stack
## Project Overview
Kubernetes cluster deployment for BetelgeuseBytes using Ansible for infrastructure automation and kubectl for application deployment. This is a complete data science/ML platform with integrated observability, databases, and ML tools.
**Infrastructure:**
- 2-node Kubernetes cluster on Hetzner Cloud
- Control plane + worker: hetzner-1 (95.217.89.53)
- Worker node: hetzner-2 (138.201.254.97)
- Kubernetes v1.30.3 with Cilium CNI
## Directory Structure
```
.
├── ansible/ # Infrastructure-as-Code for cluster setup
│ ├── inventories/prod/ # Hetzner nodes inventory & group vars
│ │ ├── hosts.ini # Node definitions
│ │ └── group_vars/all.yml # Global K8s config (versions, CIDRs)
│ ├── playbooks/
│ │ ├── site.yml # Main cluster bootstrap playbook
│ │ └── add-control-planes.yml # HA control plane expansion
│ └── roles/ # 16 reusable Ansible roles
│ ├── common/ # Swap disable, kernel modules, sysctl
│ ├── containerd/ # Container runtime
│ ├── kubernetes/ # kubeadm, kubelet, kubectl
│ ├── kubeadm_init/ # Primary control plane init
│ ├── kubeadm_join/ # Worker node join
│ ├── cilium/ # CNI plugin
│ ├── ingress/ # NGINX Ingress Controller
│ ├── cert_manager/ # Let's Encrypt integration
│ ├── labels/ # Node labeling
│ └── storage_local_path/ # Local storage provisioning
└── k8s/ # Kubernetes manifests
├── 00-namespaces.yaml # 8 namespaces
├── 01-secrets/ # Basic auth secrets
├── storage/ # StorageClass, PersistentVolumes
├── postgres/ # PostgreSQL 16 with extensions
├── redis/ # Redis 7 cache
├── elastic/ # Elasticsearch 8.14 + Kibana
├── gitea/ # Git repository service
├── jupyter/ # JupyterLab notebook
├── kafka/ # Apache Kafka broker
├── neo4j/ # Neo4j graph database
├── prometheus/ # Prometheus monitoring
├── grafana/ # Grafana dashboards
├── minio/ # S3-compatible object storage
├── mlflow/ # ML lifecycle tracking
├── vllm/ # LLM inference (Ollama)
├── label_studio/ # Data annotation platform
├── argoflow/ # Argo Workflows
├── otlp/ # OpenTelemetry collector
└── observability/ # Fluent-Bit log aggregation
```
## Build & Deployment Commands
### Phase 1: Cluster Infrastructure
```bash
# Validate connectivity
ansible -i ansible/inventories/prod/hosts.ini all -m ping
# Bootstrap Kubernetes cluster
ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml
```
### Phase 2: Kubernetes Applications (order matters)
```bash
# 1. Namespaces & storage
kubectl apply -f k8s/00-namespaces.yaml
kubectl apply -f k8s/storage/storageclass.yaml
# 2. Secrets & auth
kubectl apply -f k8s/01-secrets/
# 3. Infrastructure (databases, cache, search)
kubectl apply -f k8s/postgres/
kubectl apply -f k8s/redis/
kubectl apply -f k8s/elastic/elasticsearch.yaml
kubectl apply -f k8s/elastic/kibana.yaml
# 4. Application layer
kubectl apply -f k8s/gitea/
kubectl apply -f k8s/jupyter/
kubectl apply -f k8s/kafka/kafka.yaml
kubectl apply -f k8s/kafka/kafka-ui.yaml
kubectl apply -f k8s/neo4j/
# 5. Observability & telemetry
kubectl apply -f k8s/otlp/
kubectl apply -f k8s/observability/fluent-bit.yaml
kubectl apply -f k8s/prometheus/
kubectl apply -f k8s/grafana/
```
## Namespace Organization
| Namespace | Purpose | Services |
|-----------|---------|----------|
| `db` | Databases & cache | PostgreSQL, Redis |
| `scm` | Source control | Gitea |
| `ml` | Machine Learning | JupyterLab, MLflow, Argo, Label Studio, Ollama |
| `elastic` | Search & logging | Elasticsearch, Kibana |
| `broker` | Message brokers | Kafka |
| `graph` | Graph databases | Neo4j |
| `monitoring` | Observability | Prometheus, Grafana |
| `observability` | Telemetry | OpenTelemetry, Fluent-Bit |
| `storage` | Object storage | MinIO |
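To confirm that workloads land in the namespaces listed above, a quick check (assuming a working kubectl context against this cluster):
```bash
# List the main workloads per namespace; the namespaces match the table above.
for ns in db scm ml elastic broker graph monitoring observability storage; do
  echo "== $ns =="
  kubectl get deployments,statefulsets,daemonsets -n "$ns"
done
```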
## Key Configuration
**Kubernetes:**
- Pod CIDR: 10.244.0.0/16
- Service CIDR: 10.96.0.0/12
- CNI: Cilium v1.15.7
**Storage:**
- StorageClass: `local-ssd-hetzner` (local volumes)
- All stateful workloads pinned to hetzner-2
- Local path: `/mnt/local-ssd/{service-name}`
**Networking:**
- Internal DNS: `service.namespace.svc.cluster.local`
- External: `{service}.betelgeusebytes.io` via NGINX Ingress
- TLS: Let's Encrypt via cert-manager
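A quick way to sanity-check the networking layout above, assuming kubectl access and that DNS and TLS are already in place: resolve an internal service name from inside the cluster, then hit an external hostname through the ingress.
```bash
# Internal DNS: resolve a service from a throwaway pod (PostgreSQL lives in the db namespace).
kubectl run dns-test --rm -it --restart=Never --image=busybox:1.36 -- \
  nslookup postgres.db.svc.cluster.local

# External path: NGINX Ingress + Let's Encrypt should answer with a valid certificate.
curl -I https://grafana.betelgeusebytes.io
```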
## DNS Records
A records point to both nodes:
- `apps.betelgeusebytes.io` → 95.217.89.53, 138.201.254.97
CNAMEs to `apps.betelgeusebytes.io`:
- gitea, kibana, grafana, prometheus, notebook, broker, neo4j, otlp, label, llm, mlflow, minio
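To verify the records, e.g. with dig (expected answers mirror DNS_RECORDS.txt):
```bash
dig +short apps.betelgeusebytes.io A          # expect 95.217.89.53 and 138.201.254.97
dig +short grafana.betelgeusebytes.io CNAME   # expect apps.betelgeusebytes.io.
```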
## Secrets Location
- `k8s/01-secrets/basic-auth.yaml` - HTTP basic auth for protected services
- Service-specific secrets inline in respective manifests (e.g., postgres-auth, redis-auth)
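To rotate one of the basic-auth secrets, one option is to regenerate the htpasswd pair and re-apply the Secret in place. A sketch, using the same `htpasswd -nbBC 10` form as the comment in `k8s/01-secrets/basic-auth.yaml`; the password here is a placeholder and `basic-auth-grafana`/`monitoring` are just one of the existing secret/namespace pairs:
```bash
# Generate a bcrypt htpasswd entry (requires apache2-utils / httpd-tools)
AUTH="$(htpasswd -nbBC 10 admin 'ChangeMe-Str0ngP@ss')"

# Recreate the secret idempotently and apply it
kubectl -n monitoring create secret generic basic-auth-grafana \
  --from-literal=auth="$AUTH" \
  --dry-run=client -o yaml | kubectl apply -f -
```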
## Manifest Conventions
1. Compact YAML style: `metadata: { name: xyz, namespace: ns }`
2. StatefulSets for persistent services (databases, brokers)
3. Deployments for stateless services (web UIs, workers)
4. DaemonSets for node-level agents (Fluent-Bit)
5. Service port=80 for ingress routing, backend maps to container port
6. Ingress with TLS + basic auth annotations where needed
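A minimal sketch of conventions 1-6 applied to a hypothetical stateless service (`example-ui` and its host are illustrative and not part of this repo; the basic-auth secret name reuses `basic-auth-notebook`, which already exists in the `ml` namespace):
```bash
kubectl apply -f - <<'EOF'
apiVersion: apps/v1
kind: Deployment
metadata: { name: example-ui, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: example-ui } }
  template:
    metadata: { labels: { app: example-ui } }
    spec:
      containers:
        - name: ui
          image: nginx:1.27
          ports: [{ containerPort: 80 }]
---
apiVersion: v1
kind: Service
metadata: { name: example-ui, namespace: ml }
spec: { selector: { app: example-ui }, ports: [{ port: 80, targetPort: 80 }] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: example-ui
  namespace: ml
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/auth-type: basic
    nginx.ingress.kubernetes.io/auth-secret: basic-auth-notebook
    nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["example.betelgeusebytes.io"], secretName: example-ui-tls }]
  rules:
    - host: example.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: example-ui, port: { number: 80 } } }
EOF
```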
## Common Operations
```bash
# Check cluster status
kubectl get nodes
kubectl get pods -A
# View logs for a service
kubectl logs -n <namespace> -l app=<service-name>
# Scale a deployment
kubectl scale -n <namespace> deployment/<name> --replicas=N
# Apply changes to a specific service
kubectl apply -f k8s/<service>/
# Delete and recreate a service
kubectl delete -f k8s/<service>/ && kubectl apply -f k8s/<service>/
```
## Notes
- This is a development/test setup; passwords are hardcoded in manifests
- Elasticsearch security is disabled for development
- GPU support for vLLM is commented out (requires nvidia.com/gpu resources)
- Neo4j Bolt protocol (7687) requires a manual ingress-nginx TCP patch (see the sketch below)
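The Bolt patch boils down to roughly the commands commented at the bottom of `k8s/neo4j/neo4j.yaml` (the tcp-services kustomization in this commit achieves the same thing declaratively):
```bash
# Expose Bolt (7687) through ingress-nginx via the tcp-services ConfigMap
kubectl -n ingress-nginx create configmap tcp-services \
  --from-literal="7687=graph/neo4j:7687" \
  -o yaml --dry-run=client | kubectl apply -f -

# Point the controller at the ConfigMap and open the port on the controller pod
kubectl -n ingress-nginx patch deployment ingress-nginx-controller --type='json' \
  -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]'
kubectl -n ingress-nginx patch deployment ingress-nginx-controller --type='json' \
  -p='[{"op":"add","path":"/spec/template/spec/containers/0/ports/-","value":{"name":"tcp-7687","containerPort":7687,"hostPort":7687,"protocol":"TCP"}}]'
```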

10
DNS_RECORDS.txt Normal file
View File

@ -0,0 +1,10 @@
apps.betelgeusebytes.io. 300 IN A 95.217.89.53
apps.betelgeusebytes.io. 300 IN A 138.201.254.97
gitea.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
kibana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
grafana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
prometheus.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
notebook.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
broker.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
neo4j.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
otlp.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.

43
README.md Normal file
View File

@ -0,0 +1,43 @@
# BetelgeuseBytes K8s — Full Stack (kubectl-only)
**Nodes**
- Control-plane + worker: hetzner-1 (95.217.89.53)
- Worker: hetzner-2 (138.201.254.97)
## Bring up the cluster
```bash
ansible -i ansible/inventories/prod/hosts.ini all -m ping
ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml
```
## Apply apps (edit secrets first)
```bash
kubectl apply -f k8s/00-namespaces.yaml
kubectl apply -f k8s/01-secrets/
kubectl apply -f k8s/storage/storageclass.yaml
kubectl apply -f k8s/postgres/
kubectl apply -f k8s/redis/
kubectl apply -f k8s/elastic/elasticsearch.yaml
kubectl apply -f k8s/elastic/kibana.yaml
kubectl apply -f k8s/gitea/
kubectl apply -f k8s/jupyter/
kubectl apply -f k8s/kafka/kafka.yaml
kubectl apply -f k8s/kafka/kafka-ui.yaml
kubectl apply -f k8s/neo4j/
kubectl apply -f k8s/otlp/
kubectl apply -f k8s/observability/fluent-bit.yaml
kubectl apply -f k8s/prometheus/
kubectl apply -f k8s/grafana/
```
## DNS
A records:
- apps.betelgeusebytes.io → 95.217.89.53, 138.201.254.97
CNAMEs → apps.betelgeusebytes.io:
- gitea., kibana., grafana., prometheus., notebook., broker., neo4j., otlp.
(HA later) Point cp.k8s.betelgeusebytes.io → <VPS_IP>, 95.217.89.53, 138.201.254.97; then set control_plane_endpoint accordingly (see the sketch below).
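A rough sketch of that later step, assuming the extra VPS is already listed in the `new_control_planes` inventory group (switching the endpoint on a live cluster also means updating the kubeadm config and API-server certificates, which this sketch does not cover):
```bash
# The HA endpoint should resolve to every control-plane IP
dig +short cp.k8s.betelgeusebytes.io A

# In ansible/inventories/prod/group_vars/all.yml:
#   control_plane_endpoint: "cp.k8s.betelgeusebytes.io:6443"

ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/add-control-planes.yml
```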

View File

@ -0,0 +1,13 @@
cluster_name: prod
k8s_version: "v1.30.3"
control_plane_endpoint: "95.217.89.53:6443" # switch later to cp.k8s.betelgeusebytes.io:6443
pod_cidr: "10.244.0.0/16"
service_cidr: "10.96.0.0/12"
cilium_version: "1.15.7"
local_path_dir: "/srv/k8s"
local_sc_name: "local-ssd-hetzner"
stateful_node_label_key: "node"
stateful_node_label_val: "hetzner-2"

View File

@ -0,0 +1,19 @@
[k8s_control_plane]
hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11
[k8s_workers]
hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11
hetzner-2 ansible_host=138.201.254.97 public_ip=138.201.254.97 wg_address=10.66.0.12
[k8s_nodes:children]
k8s_control_plane
k8s_workers
# add tiny VPS control-planes here when ready
[new_control_planes]
# cp-a ansible_host=<VPS1_IP> public_ip=<VPS1_IP> wg_address=10.66.0.10
[all:vars]
ansible_user=root
ansible_password=3Lcd0504
ansible_become=true

View File

@ -0,0 +1,19 @@
- hosts: k8s_control_plane[0]
become: yes
roles:
- kubeadm_cp_discovery
- hosts: new_control_planes
become: yes
roles:
- common
- wireguard
- containerd
- kubernetes
- hosts: new_control_planes
become: yes
roles:
- kubeadm_join_cp
vars:
kubeadm_cp_join_cmd: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_cp_join_cmd | default(kubeadm_cp_join_cmd) }}"

View File

@ -0,0 +1,31 @@
- hosts: k8s_nodes
become: yes
# serial: 1
roles:
# - ../roles/common
#- ../roles/wireguard
#- ../roles/containerd
#- ../roles/kubernetes
- hosts: k8s_control_plane
become: yes
roles:
- ../roles/kubeadm_init
# - hosts: k8s_workers
# become: yes
# roles:
# - ../roles/kubeadm_join
- hosts: k8s_control_plane
become: yes
roles:
# - ../roles/cilium
# - ../roles/ingress
#- ../roles/cert_manager
- hosts: k8s_nodes
become: yes
roles:
#- ../roles/storage_local_path
- ../roles/labels

View File

@ -0,0 +1,66 @@
- name: Install cert-manager
shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
- name: Wait for cert-manager pods to be ready
shell: kubectl wait --for=condition=ready --timeout=300s pod -l app.kubernetes.io/instance=cert-manager -n cert-manager
- name: Wait for webhook endpoint to be ready
shell: |
for i in {1..30}; do
if kubectl get endpoints cert-manager-webhook -n cert-manager -o jsonpath='{.subsets[*].addresses[*].ip}' | grep -q .; then
echo "Webhook endpoint is ready"
exit 0
fi
echo "Waiting for webhook endpoint... attempt $i/30"
sleep 2
done
exit 1
- name: Test webhook connectivity
shell: kubectl run test-webhook --image=curlimages/curl:latest --rm -i --restart=Never -- curl -k https://cert-manager-webhook.cert-manager.svc:443/healthz
register: webhook_test
ignore_errors: yes
- name: Display webhook test result
debug:
var: webhook_test
- name: ClusterIssuer
copy:
dest: /root/cluster-issuer-prod.yaml
content: |
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: admin@betelgeusebytes.io
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-key
solvers:
- http01:
ingress:
class: nginx
- name: Temporarily disable cert-manager webhook
shell: |
kubectl delete validatingwebhookconfiguration cert-manager-webhook || true
ignore_errors: yes
- name: Apply ClusterIssuer
command: kubectl apply -f /root/cluster-issuer-prod.yaml
- name: Reinstall cert-manager to restore webhook
shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml

View File

@ -0,0 +1,9 @@
- name: Install cilium CLI
shell: |
curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz
tar xzf cilium-linux-amd64.tar.gz -C /usr/local/bin
args: { creates: /usr/local/bin/cilium }
- name: Deploy cilium
shell: |
cilium install --version {{ cilium_version }} --set kubeProxyReplacement=strict --set bpf.masquerade=true

View File

@ -0,0 +1,31 @@
- name: Disable swap
command: swapoff -a
when: ansible_swaptotal_mb|int > 0
- name: Ensure swap disabled on boot
replace:
path: /etc/fstab
regexp: '^([^#].*\sswap\s)'
replace: '# \1'
- name: Kernel modules
copy:
dest: /etc/modules-load.d/containerd.conf
content: |
overlay
br_netfilter
- name: Load modules
command: modprobe {{ item }}
loop: [overlay, br_netfilter]
- name: Sysctl for k8s
copy:
dest: /etc/sysctl.d/99-kubernetes.conf
content: |
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
vm.max_map_count = 262144
- name: Apply sysctl
command: sysctl --system

View File

@ -0,0 +1,27 @@
- name: Install containerd
apt:
name: containerd
state: present
update_cache: yes
- name: Ensure containerd config directory
file:
path: /etc/containerd
state: directory
mode: '0755'
- name: Generate default config
shell: containerd config default > /etc/containerd/config.toml
args: { creates: /etc/containerd/config.toml }
- name: Ensure SystemdCgroup=true
replace:
path: /etc/containerd/config.toml
regexp: 'SystemdCgroup = false'
replace: 'SystemdCgroup = true'
- name: Restart containerd
service:
name: containerd
state: restarted
enabled: yes

View File

@ -0,0 +1,2 @@
- name: Deploy ingress-nginx (baremetal)
shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml

View File

@ -0,0 +1,24 @@
- name: Upload certs and get certificate key
shell: kubeadm init phase upload-certs --upload-certs | tail -n 1
register: cert_key
- name: Compute CA cert hash
shell: |
openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}'
register: ca_hash
- name: Create short-lived token
shell: kubeadm token create --ttl 30m
register: join_token
- name: Determine control-plane endpoint
set_fact:
cp_endpoint: "{{ hostvars[inventory_hostname].control_plane_endpoint | default(ansible_host ~ ':6443') }}"
- set_fact:
kubeadm_cp_join_cmd: >-
kubeadm join {{ cp_endpoint }}
--token {{ join_token.stdout }}
--discovery-token-ca-cert-hash sha256:{{ ca_hash.stdout }}
--control-plane
--certificate-key {{ cert_key.stdout }}

View File

@ -0,0 +1,24 @@
# - name: Write kubeadm config
# template:
# src: kubeadm-config.yaml.j2
# dest: /etc/kubernetes/kubeadm-config.yaml
# - name: Pre-pull images
# command: kubeadm config images pull
# - name: Init control-plane
# command: kubeadm init --config=/etc/kubernetes/kubeadm-config.yaml
# args: { creates: /etc/kubernetes/admin.conf }
# - name: Setup kubeconfig
# shell: |
# mkdir -p $HOME/.kube
# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
# chown $(id -u):$(id -g) $HOME/.kube/config
- name: Save join command
shell: kubeadm token create --print-join-command
register: join_cmd
- set_fact:
kubeadm_join_command_all: "{{ join_cmd.stdout }}"

View File

@ -0,0 +1,14 @@
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: {{ k8s_version }}
clusterName: {{ cluster_name }}
controlPlaneEndpoint: "{{ control_plane_endpoint }}"
networking:
podSubnet: "{{ pod_cidr }}"
serviceSubnet: "{{ service_cidr }}"
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
node-ip: "{{ hostvars[inventory_hostname].wg_address | default(hostvars[inventory_hostname].public_ip) }}"

View File

@ -0,0 +1,2 @@
- name: Join node to cluster
command: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_join_command_all }} --ignore-preflight-errors=FileAvailable--etc-kubernetes-kubelet.conf,FileAvailable--etc-kubernetes-pki-ca.crt,Port-10250"

View File

@ -0,0 +1,9 @@
- name: Ensure join command provided
fail:
msg: "Set kubeadm_cp_join_cmd variable (string)"
when: kubeadm_cp_join_cmd is not defined
- name: Join node as control-plane
command: "{{ kubeadm_cp_join_cmd }}"
args:
creates: /etc/kubernetes/kubelet.conf

View File

@ -0,0 +1,17 @@
- name: Install Kubernetes apt key
shell: curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
args: { creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg }
- name: Add Kubernetes repo
apt_repository:
repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /"
state: present
- name: Install kubeadm, kubelet, kubectl
apt:
name: [kubeadm, kubelet, kubectl]
state: present
update_cache: yes
- name: Hold kube packages
command: apt-mark hold kubeadm kubelet kubectl

View File

@ -0,0 +1,4 @@
- name: Label hetzner-2 for stateful
command: kubectl label node hetzner-2 {{ stateful_node_label_key }}={{ stateful_node_label_val }} --overwrite
delegate_to: "{{ groups['k8s_control_plane'][0] }}"
run_once: true

View File

@ -0,0 +1,55 @@
- name: Ensure local path dir
file:
path: "{{ local_path_dir }}"
state: directory
mode: '0777'
- name: StorageClass local-ssd-hetzner
copy:
dest: /root/local-sc.yaml
content: |
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: {{ local_sc_name }}
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer
when: inventory_hostname in groups['k8s_control_plane']
- name: Apply SC
command: kubectl apply -f /root/local-sc.yaml
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
when: inventory_hostname in groups['k8s_control_plane']
- name: Create local-path directory
file:
path: /mnt/local-ssd
state: directory
mode: '0755'
- name: Create subdirectories for each PV
file:
path: "/mnt/local-ssd/{{ item }}"
state: directory
mode: '0755'
loop:
- postgres
- prometheus
- elasticsearch
- grafana
- name: Copy PV manifest
template:
src: local-ssd-pv.yaml
dest: /tmp/local-ssd-pv.yaml
- name: Apply PV
command: kubectl apply -f /tmp/local-ssd-pv.yaml
run_once: true
delegate_to: "{{ groups['k8s_control_plane'][0] }}"
- name: Apply SC
command: kubectl apply -f /tmp/local-ssd-sc.yaml
run_once: true
delegate_to: "{{ groups['k8s_control_plane'][0] }}"

View File

@ -0,0 +1,65 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: local-ssd-postgres
spec:
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/postgres
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: local-ssd-prometheus
spec:
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/prometheus
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: local-ssd-elasticsearch
spec:
capacity:
storage: 300Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/elasticsearch
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

View File

@ -0,0 +1,62 @@
- name: Install wireguard
apt:
name: [wireguard, qrencode]
state: present
update_cache: yes
- name: Ensure key dir
file: { path: /etc/wireguard/keys, state: directory, mode: '0700' }
- name: Generate private key if missing
shell: "[ -f /etc/wireguard/keys/privatekey ] || (umask 077 && wg genkey > /etc/wireguard/keys/privatekey)"
args: { creates: /etc/wireguard/keys/privatekey }
- name: Generate public key
shell: "wg pubkey < /etc/wireguard/keys/privatekey > /etc/wireguard/keys/publickey"
args: { creates: /etc/wireguard/keys/publickey }
- name: Read pubkey
slurp: { src: /etc/wireguard/keys/publickey }
register: pubkey_raw
- name: Read private key
slurp: { src: /etc/wireguard/keys/privatekey }
register: privkey_raw
- set_fact:
wg_public_key: "{{ pubkey_raw.content | b64decode | trim }}"
wg_private_key: "{{ privkey_raw.content | b64decode | trim }}"
- name: Gather facts from all hosts
setup:
delegate_to: "{{ item }}"
delegate_facts: true
loop: "{{ groups['k8s_nodes'] }}"
run_once: true
- name: Pretty print hostvars
debug:
msg: "{{ hostvars['hetzner-1']['wg_public_key'] }}"
- name: Render config
template:
src: wg0.conf.j2
dest: /etc/wireguard/wg0.conf
mode: '0600'
- name: Enable IP forward
sysctl:
name: net.ipv4.ip_forward
value: "1"
sysctl_set: yes
state: present
reload: yes
- name: Enable wg-quick
service:
name: wg-quick@wg0
enabled: yes
state: started
- name: Show WireGuard status
  command: wg show
  register: wg_show
  changed_when: false
- name: Print WireGuard status
  debug:
    var: wg_show.stdout

View File

@ -0,0 +1,12 @@
[Interface]
Address = {{ wg_nodes[inventory_hostname].address }}/24
ListenPort = {{ wg_port }}
PrivateKey = {{ wg_private_key }}
{% for h in groups['k8s_nodes'] if h != inventory_hostname %}
[Peer]
PublicKey = {{ hostvars[h].wg_public_key }}
AllowedIPs = {{ wg_nodes[h].address }}/32
Endpoint = {{ wg_nodes[h].public_ip }}:{{ wg_port }}
PersistentKeepalive = 25
{% endfor %}

View File

@ -0,0 +1,6 @@
wg_interface: wg0
wg_port: 51820
wg_cidr: 10.66.0.0/24
wg_nodes:
hetzner-1: { address: 10.66.0.11, public_ip: "95.217.89.53" }
hetzner-2: { address: 10.66.0.12, public_ip: "138.201.254.97" }

BIN
k8s/.DS_Store vendored Normal file

Binary file not shown.

31
k8s/00-namespaces.yaml Normal file
View File

@ -0,0 +1,31 @@
apiVersion: v1
kind: Namespace
metadata: { name: db }
---
apiVersion: v1
kind: Namespace
metadata: { name: scm }
---
apiVersion: v1
kind: Namespace
metadata: { name: ml }
---
apiVersion: v1
kind: Namespace
metadata: { name: monitoring }
---
apiVersion: v1
kind: Namespace
metadata: { name: elastic }
---
apiVersion: v1
kind: Namespace
metadata: { name: broker }
---
apiVersion: v1
kind: Namespace
metadata: { name: graph }
---
apiVersion: v1
kind: Namespace
metadata: { name: observability }

View File

@ -0,0 +1,38 @@
# Replace each 'auth' line with a real htpasswd pair:
# htpasswd -nbBC 10 admin 'Str0ngP@ss' (copy 'admin:...' to value below)
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-kibana, namespace: elastic }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-grafana, namespace: monitoring }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-prometheus, namespace: monitoring }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-notebook, namespace: ml }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-broker, namespace: broker }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-neo4j, namespace: graph }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }

146
k8s/argoflow/argo.yaml Normal file
View File

@ -0,0 +1,146 @@
apiVersion: v1
kind: Secret
metadata:
name: argo-artifacts
namespace: ml
type: Opaque
stringData:
accesskey: "minioadmin" # <-- change
secretkey: "minioadmin" # <-- change
---
apiVersion: v1
kind: ConfigMap
metadata:
name: workflow-controller-configmap
namespace: ml
data:
config: |
artifactRepository:
s3:
bucket: argo-artifacts
endpoint: minio.betelgeusebytes.io # no scheme here
insecure: false # https via Ingress
accessKeySecret:
name: argo-artifacts
key: accesskey
secretKeySecret:
name: argo-artifacts
key: secretkey
keyFormat: "{{workflow.namespace}}/{{workflow.name}}/{{pod.name}}"
---
# k8s/argo/workflows/ns-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: argo-server
namespace: ml
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: argo-namespaced
namespace: ml
rules:
- apiGroups: [""]
resources: ["pods","pods/log","secrets","configmaps","events","persistentvolumeclaims","serviceaccounts"]
verbs: ["get","list","watch","create","delete","patch","update"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get","list","watch","create","delete","patch","update"]
- apiGroups: ["argoproj.io"]
resources: ["workflows","workflowtemplates","cronworkflows","workfloweventbindings","sensors","eventsources","workflowtasksets","workflowartifactgctasks","workflowtaskresults"]
verbs: ["get","list","watch","create","delete","patch","update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: argo-namespaced-binding
namespace: ml
subjects:
- kind: ServiceAccount
name: argo-server
namespace: ml
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: argo-namespaced
---
# k8s/argo/workflows/controller.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: workflow-controller, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: workflow-controller } }
template:
metadata: { labels: { app: workflow-controller } }
spec:
serviceAccountName: argo-server
containers:
- name: controller
image: quay.io/argoproj/workflow-controller:latest
args: ["--namespaced"]
env:
- name: LEADER_ELECTION_IDENTITY
valueFrom:
fieldRef:
fieldPath: metadata.name
ports: [{ containerPort: 9090 }]
readinessProbe:
httpGet: { path: /metrics, port: 9090, scheme: HTTPS }
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet: { path: /metrics, port: 9090, scheme: HTTPS }
initialDelaySeconds: 20
periodSeconds: 20
---
# k8s/argo/workflows/server.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: argo-server, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: argo-server } }
template:
metadata: { labels: { app: argo-server } }
spec:
serviceAccountName: argo-server
containers:
- name: server
image: quay.io/argoproj/argocli:latest
args: ["server","--auth-mode","server","--namespaced","--secure=false"]
ports: [{ containerPort: 2746 }]
readinessProbe:
httpGet: { path: /, port: 2746, scheme: HTTP }
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet: { path: /, port: 2746, scheme: HTTP }
initialDelaySeconds: 20
periodSeconds: 20
---
apiVersion: v1
kind: Service
metadata: { name: argo-server, namespace: ml }
spec: { selector: { app: argo-server }, ports: [ { port: 80, targetPort: 2746 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: argo
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["argo.betelgeusebytes.io"], secretName: argo-tls }]
rules:
- host: argo.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: argo-server, port: { number: 80 } } }

217
k8s/automation/n8n.yaml Normal file
View File

@ -0,0 +1,217 @@
apiVersion: v1
kind: Namespace
metadata:
name: automation
labels:
name: automation
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: n8n-pv
labels:
app: n8n
spec:
capacity:
storage: 20Gi
volumeMode: Filesystem
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd
local:
path: /mnt/local-ssd/n8n
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: n8n-data
namespace: automation
labels:
app: n8n
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-ssd
resources:
requests:
storage: 20Gi
selector:
matchLabels:
app: n8n
---
apiVersion: v1
kind: Secret
metadata:
name: n8n-secrets
namespace: automation
type: Opaque
stringData:
# Generate a strong encryption key with: openssl rand -base64 32
N8N_ENCRYPTION_KEY: "G/US0ePajEpWwRUjlchyOs6+6I/AT+0bisXmE2fugSU="
# Optional: Database connection if using PostgreSQL
DB_TYPE: "postgresdb"
DB_POSTGRESDB_HOST: "pg.betelgeusebytes.io"
DB_POSTGRESDB_PORT: "5432"
DB_POSTGRESDB_DATABASE: "n8n"
DB_POSTGRESDB_USER: "app"
DB_POSTGRESDB_PASSWORD: "pa$$word"
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: n8n
namespace: automation
spec:
serviceName: n8n
replicas: 1
selector:
matchLabels:
app: n8n
template:
metadata:
labels:
app: n8n
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
containers:
- name: n8n
image: n8nio/n8n:latest
ports:
- containerPort: 5678
name: http
env:
- name: N8N_HOST
value: "n8n.betelgeusebytes.io"
- name: N8N_PORT
value: "5678"
- name: N8N_PROTOCOL
value: "https"
- name: WEBHOOK_URL
value: "https://n8n.betelgeusebytes.io/"
- name: GENERIC_TIMEZONE
value: "UTC"
- name: N8N_ENCRYPTION_KEY
valueFrom:
secretKeyRef:
name: n8n-secrets
key: N8N_ENCRYPTION_KEY
# Uncomment if using PostgreSQL
- name: DB_TYPE
valueFrom:
secretKeyRef:
name: n8n-secrets
key: DB_TYPE
- name: DB_POSTGRESDB_HOST
valueFrom:
secretKeyRef:
name: n8n-secrets
key: DB_POSTGRESDB_HOST
- name: DB_POSTGRESDB_PORT
valueFrom:
secretKeyRef:
name: n8n-secrets
key: DB_POSTGRESDB_PORT
- name: DB_POSTGRESDB_DATABASE
valueFrom:
secretKeyRef:
name: n8n-secrets
key: DB_POSTGRESDB_DATABASE
- name: DB_POSTGRESDB_USER
valueFrom:
secretKeyRef:
name: n8n-secrets
key: DB_POSTGRESDB_USER
- name: DB_POSTGRESDB_PASSWORD
valueFrom:
secretKeyRef:
name: n8n-secrets
key: DB_POSTGRESDB_PASSWORD
volumeMounts:
- name: n8n-data
mountPath: /home/node/.n8n
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "2Gi"
cpu: "1000m"
livenessProbe:
httpGet:
path: /healthz
port: 5678
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 5
readinessProbe:
httpGet:
path: /healthz
port: 5678
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
volumes:
- name: n8n-data
persistentVolumeClaim:
claimName: n8n-data
---
apiVersion: v1
kind: Service
metadata:
name: n8n
namespace: automation
labels:
app: n8n
spec:
type: ClusterIP
ports:
- port: 5678
targetPort: 5678
protocol: TCP
name: http
selector:
app: n8n
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: n8n
namespace: automation
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
# nginx.ingress.kubernetes.io/proxy-body-size: "50m"
# nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
# nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
# Uncomment below if you want basic auth protection in addition to n8n's auth
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: n8n-basic-auth
# nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
spec:
ingressClassName: nginx
tls:
- hosts:
- n8n.betelgeusebytes.io
secretName: wildcard-betelgeusebytes-tls
rules:
- host: n8n.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: n8n
port:
number: 5678

View File

@ -0,0 +1,10 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata: { name: letsencrypt-prod }
spec:
acme:
email: angal.salah@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef: { name: letsencrypt-prod-key }
solvers:
- http01: { ingress: { class: nginx } }

View File

@ -0,0 +1,21 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-elasticsearch
spec:
capacity:
storage: 80Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/elasticsearch
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

View File

@ -0,0 +1,38 @@
apiVersion: v1
kind: Service
metadata: { name: elasticsearch, namespace: elastic }
spec:
ports:
- { name: http, port: 9200, targetPort: 9200 }
- { name: transport, port: 9300, targetPort: 9300 }
selector: { app: elasticsearch }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: elasticsearch, namespace: elastic }
spec:
serviceName: elasticsearch
replicas: 1
selector: { matchLabels: { app: elasticsearch } }
template:
metadata: { labels: { app: elasticsearch } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: es
image: docker.elastic.co/elasticsearch/elasticsearch:8.14.0
env:
- { name: discovery.type, value: single-node }
- { name: xpack.security.enabled, value: "false" }
- { name: ES_JAVA_OPTS, value: "-Xms2g -Xmx2g" }
ports:
- { containerPort: 9200 }
- { containerPort: 9300 }
volumeMounts:
- { name: data, mountPath: /usr/share/elasticsearch/data }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 80Gi } }

44
k8s/elastic/kibana.yaml Normal file
View File

@ -0,0 +1,44 @@
apiVersion: v1
kind: Service
metadata: { name: kibana, namespace: elastic }
spec:
ports: [{ port: 5601, targetPort: 5601 }]
selector: { app: kibana }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: kibana, namespace: elastic }
spec:
replicas: 1
selector: { matchLabels: { app: kibana } }
template:
metadata: { labels: { app: kibana } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: kibana
image: docker.elastic.co/kibana/kibana:8.14.0
env:
- { name: ELASTICSEARCH_HOSTS, value: "http://elasticsearch.elastic.svc.cluster.local:9200" }
ports: [{ containerPort: 5601 }]
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: kibana
namespace: elastic
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: basic-auth-kibana
# nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["kibana.betelgeusebytes.io"], secretName: kibana-tls }]
rules:
- host: kibana.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: kibana, port: { number: 5601 } } }

21
k8s/gitea/gitea-pv.yaml Normal file
View File

@ -0,0 +1,21 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-gitea
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/gitea
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

54
k8s/gitea/gitea.yaml Normal file
View File

@ -0,0 +1,54 @@
apiVersion: v1
kind: Service
metadata: { name: gitea, namespace: scm }
spec:
ports: [{ port: 80, targetPort: 3000 }]
selector: { app: gitea }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: gitea, namespace: scm }
spec:
serviceName: gitea
replicas: 1
selector: { matchLabels: { app: gitea } }
template:
metadata: { labels: { app: gitea } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: gitea
image: gitea/gitea:1.21.11
env:
- { name: GITEA__server__ROOT_URL, value: "https://gitea.betelgeusebytes.io" }
- { name: GITEA__database__DB_TYPE, value: "postgres" }
- { name: GITEA__database__HOST, value: "postgres.db.svc.cluster.local:5432" }
- { name: GITEA__database__NAME, value: "gitea" }
- { name: GITEA__database__USER, value: "app" }
- { name: GITEA__database__PASSWD, value: "pa$$word" }
ports: [{ containerPort: 3000 }]
volumeMounts:
- { name: data, mountPath: /data }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 50Gi } }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: gitea
namespace: scm
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["gitea.betelgeusebytes.io"], secretName: gitea-tls }]
rules:
- host: gitea.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: gitea, port: { number: 80 } } }

45
k8s/grafana/grafana.yaml Normal file
View File

@ -0,0 +1,45 @@
apiVersion: v1
kind: Service
metadata: { name: grafana, namespace: monitoring }
spec:
ports: [{ port: 80, targetPort: 3000 }]
selector: { app: grafana }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: grafana, namespace: monitoring }
spec:
replicas: 1
selector: { matchLabels: { app: grafana } }
template:
metadata: { labels: { app: grafana } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: grafana
image: grafana/grafana:10.4.3
env:
- { name: GF_SECURITY_ADMIN_USER, value: admin }
- { name: GF_SECURITY_ADMIN_PASSWORD, value: "ADMINclaude-GRAFANA" }
ports: [{ containerPort: 3000 }]
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana
namespace: monitoring
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
nginx.ingress.kubernetes.io/auth-type: basic
nginx.ingress.kubernetes.io/auth-secret: basic-auth-grafana
nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["grafana.betelgeusebytes.io"], secretName: grafana-tls }]
rules:
- host: grafana.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: grafana, port: { number: 80 } } }

View File

@ -0,0 +1,49 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ingress-nginx
# Create the tcp-services ConfigMap from *quoted* literals
configMapGenerator:
- name: tcp-services
literals:
- "5432=db/postgres:5432"
- "7687=graph/neo4j:7687"
generatorOptions:
disableNameSuffixHash: true
# Inline JSON6902 patches
patches:
# 1) Add controller arg for tcp-services
- target:
group: apps
version: v1
kind: Deployment
name: ingress-nginx-controller
namespace: ingress-nginx
patch: |-
- op: add
path: /spec/template/spec/containers/0/args/-
value: --tcp-services-configmap=$(POD_NAMESPACE)/tcp-services
# 2) Expose Service ports 5432 and 7687 (keeps 80/443)
- target:
version: v1
kind: Service
name: ingress-nginx-controller
namespace: ingress-nginx
patch: |-
- op: add
path: /spec/ports/-
value:
name: tcp-5432
port: 5432
protocol: TCP
targetPort: 5432
- op: add
path: /spec/ports/-
value:
name: tcp-7687
port: 7687
protocol: TCP
targetPort: 7687

68
k8s/jupyter/jupyter.yaml Normal file
View File

@ -0,0 +1,68 @@
apiVersion: v1
kind: Service
metadata: { name: notebook, namespace: ml }
spec:
selector: { app: jupyterlab }
ports: [{ port: 80, targetPort: 8888 }]
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: jupyterlab, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: jupyterlab } }
template:
metadata: { labels: { app: jupyterlab } }
spec:
securityContext:
runAsUser: 1000
fsGroup: 100
nodeSelector: { node: hetzner-2 }
containers:
- name: jupyter
image: jupyter/base-notebook:latest
args: ["start-notebook.sh", "--NotebookApp.token=$(PASSWORD)"]
env:
- name: PASSWORD
valueFrom: { secretKeyRef: { name: jupyter-auth, key: PASSWORD } }
ports: [{ containerPort: 8888 }]
volumeMounts:
- { name: work, mountPath: /home/jovyan/work }
volumes:
- name: work
persistentVolumeClaim: { claimName: jupyter-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: jupyter-pvc, namespace: ml }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 20Gi } }
---
apiVersion: v1
kind: Secret
metadata: { name: jupyter-auth, namespace: ml }
type: Opaque
stringData: { PASSWORD: "notebook" }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: notebook
namespace: ml
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: basic-auth-notebook
# nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["notebook.betelgeusebytes.io"], secretName: notebook-tls }]
rules:
- host: notebook.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: notebook, port: { number: 80 } } }

65
k8s/kafka/kafka-pv.yaml Normal file
View File

@ -0,0 +1,65 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-kafka
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/kafka
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-zookeeper-data
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/zookeeper-data
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-zookeeper-log
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/zookeeper-log
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

44
k8s/kafka/kafka-ui.yaml Normal file
View File

@ -0,0 +1,44 @@
apiVersion: v1
kind: Service
metadata: { name: kafka-ui, namespace: broker }
spec:
ports: [{ port: 80, targetPort: 8080 }]
selector: { app: kafka-ui }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: kafka-ui, namespace: broker }
spec:
replicas: 1
selector: { matchLabels: { app: kafka-ui } }
template:
metadata: { labels: { app: kafka-ui } }
spec:
containers:
- name: ui
image: provectuslabs/kafka-ui:latest
env:
- { name: KAFKA_CLUSTERS_0_NAME, value: "local" }
- { name: KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS, value: "kafka.broker.svc.cluster.local:9092" }
ports: [{ containerPort: 8080 }]
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: kafka-ui
namespace: broker
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: basic-auth-broker
# nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["broker.betelgeusebytes.io"], secretName: broker-tls }]
rules:
- host: broker.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: kafka-ui, port: { number: 80 } } }

45
k8s/kafka/kafka.yaml Normal file
View File

@ -0,0 +1,45 @@
apiVersion: v1
kind: Service
metadata: { name: kafka, namespace: broker }
spec:
ports: [{ name: kafka, port: 9092, targetPort: 9092 }]
selector: { app: kafka }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: kafka, namespace: broker }
spec:
serviceName: kafka
replicas: 1
selector: { matchLabels: { app: kafka } }
template:
metadata: { labels: { app: kafka } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: kafka
image: apache/kafka:latest
env:
- { name: KAFKA_NODE_ID, value: "1" }
- { name: KAFKA_PROCESS_ROLES, value: "broker,controller" }
- { name: KAFKA_LISTENERS, value: "PLAINTEXT://:9092,CONTROLLER://:9093" }
- { name: KAFKA_ADVERTISED_LISTENERS, value: "PLAINTEXT://kafka.broker.svc.cluster.local:9092" }
- { name: KAFKA_CONTROLLER_LISTENER_NAMES, value: "CONTROLLER" }
- { name: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP, value: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" }
- { name: KAFKA_CONTROLLER_QUORUM_VOTERS, value: "1@localhost:9093" }
- { name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR, value: "1" }
- { name: KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR, value: "1" }
- { name: KAFKA_TRANSACTION_STATE_LOG_MIN_ISR, value: "1" }
- { name: KAFKA_LOG_DIRS, value: "/var/lib/kafka/data" }
- { name: CLUSTER_ID, value: "MkU3OEVBNTcwNTJENDM2Qk" }
ports:
- { containerPort: 9092 }
- { containerPort: 9093 }
volumeMounts:
- { name: data, mountPath: /var/lib/kafka/data }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 50Gi } }

View File

@ -0,0 +1,74 @@
# k8s/ai/label-studio/secret-pg.yaml
apiVersion: v1
kind: Secret
metadata: { name: labelstudio-pg, namespace: ml }
type: Opaque
stringData: { POSTGRES_PASSWORD: "admin" }
---
# k8s/ai/label-studio/secret-minio.yaml
apiVersion: v1
kind: Secret
metadata: { name: minio-label, namespace: ml }
type: Opaque
stringData:
accesskey: "minioadmin"
secretkey: "minioadmin"
---
# k8s/ai/label-studio/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: label-studio, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: label-studio } }
template:
metadata: { labels: { app: label-studio } }
spec:
containers:
- name: app
image: heartexlabs/label-studio:latest
env:
- { name: POSTGRE_NAME, value: "labelstudio" }
- { name: POSTGRE_USER, value: "admin" }
- name: POSTGRE_PASSWORD
valueFrom: { secretKeyRef: { name: labelstudio-pg, key: POSTGRES_PASSWORD } }
- { name: POSTGRE_HOST, value: "postgres.db.svc.cluster.local" }
- { name: POSTGRE_PORT, value: "5432" }
- { name: S3_ENDPOINT, value: "https://minio.betelgeusebytes.io" }
- name: AWS_ACCESS_KEY_ID
valueFrom: { secretKeyRef: { name: minio-label, key: accesskey } }
- name: AWS_SECRET_ACCESS_KEY
valueFrom: { secretKeyRef: { name: minio-label, key: secretkey } }
- name: ALLOWED_HOSTS
value: "label.betelgeusebytes.io"
- name: CSRF_TRUSTED_ORIGINS
value: "https://label.betelgeusebytes.io"
- name: CSRF_COOKIE_SECURE
value: "1"
- name: SESSION_COOKIE_SECURE
value: "1"
ports: [{ containerPort: 8080 }]
---
apiVersion: v1
kind: Service
metadata: { name: label-studio, namespace: ml }
spec: { selector: { app: label-studio }, ports: [ { port: 80, targetPort: 8080 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: label-studio
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["label.betelgeusebytes.io"], secretName: label-tls }]
rules:
- host: label.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: label-studio, port: { number: 80 } } }

96
k8s/minio/minio.yaml Normal file
View File

@ -0,0 +1,96 @@
apiVersion: v1
kind: Namespace
metadata: { name: storage }
---
# k8s/storage/minio/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: minio-root, namespace: storage }
type: Opaque
stringData:
MINIO_ROOT_USER: "minioadmin"
MINIO_ROOT_PASSWORD: "minioadmin"
---
# k8s/storage/minio/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: minio-data, namespace: storage }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 20Gi } }
---
# k8s/storage/minio/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: minio, namespace: storage }
spec:
replicas: 1
selector: { matchLabels: { app: minio } }
template:
metadata: { labels: { app: minio } }
spec:
containers:
- name: minio
image: minio/minio:latest
args: ["server","/data","--console-address",":9001"]
envFrom: [{ secretRef: { name: minio-root } }]
ports:
- { containerPort: 9000 } # S3
- { containerPort: 9001 } # Console
volumeMounts:
- { name: data, mountPath: /data }
volumes:
- name: data
persistentVolumeClaim: { claimName: minio-data }
---
apiVersion: v1
kind: Service
metadata: { name: minio, namespace: storage }
spec:
selector: { app: minio }
ports:
- { name: s3, port: 9000, targetPort: 9000 }
- { name: console, port: 9001, targetPort: 9001 }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: minio
namespace: storage
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["minio.betelgeusebytes.io"], secretName: minio-tls }]
rules:
- host: minio.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: minio, port: { number: 9001 } } }
---
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-minio
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/minio
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

64
k8s/mlflow/mlflow.yaml Normal file
View File

@ -0,0 +1,64 @@
# k8s/mlops/mlflow/secret-pg.yaml
apiVersion: v1
kind: Secret
metadata: { name: mlflow-pg, namespace: ml }
type: Opaque
stringData: { POSTGRES_PASSWORD: "pa$$word" }
---
# k8s/mlops/mlflow/secret-minio.yaml
apiVersion: v1
kind: Secret
metadata: { name: mlflow-minio, namespace: ml }
type: Opaque
stringData:
accesskey: "minioadmin"
secretkey: "minioadmin"
---
# k8s/mlops/mlflow/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: mlflow, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: mlflow } }
template:
metadata: { labels: { app: mlflow } }
spec:
containers:
- name: mlflow
# image: ghcr.io/mlflow/mlflow:v3.6.0
image: axxs/mlflow-pg
env:
- { name: MLFLOW_BACKEND_STORE_URI,
value: "postgresql://admin:admin@postgres.db.svc.cluster.local:5432/mlflow" }
- { name: POSTGRES_PASSWORD, valueFrom: { secretKeyRef: { name: mlflow-pg, key: POSTGRES_PASSWORD } } }
- { name: MLFLOW_S3_ENDPOINT_URL, value: "https://minio.betelgeusebytes.io" }
- { name: AWS_ACCESS_KEY_ID, valueFrom: { secretKeyRef: { name: mlflow-minio, key: accesskey } } }
- { name: AWS_SECRET_ACCESS_KEY, valueFrom: { secretKeyRef: { name: mlflow-minio, key: secretkey } } }
args: ["mlflow","server","--host","0.0.0.0","--port","5000","--artifacts-destination","s3://mlflow", "--allowed-hosts", "*.betelgeusebytes.io"]
ports: [{ containerPort: 5000 }]
---
apiVersion: v1
kind: Service
metadata: { name: mlflow, namespace: ml }
spec: { selector: { app: mlflow }, ports: [ { port: 80, targetPort: 5000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: mlflow
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["mlflow.betelgeusebytes.io"], secretName: mlflow-tls }]
rules:
- host: mlflow.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: mlflow, port: { number: 80 } } }

21
k8s/neo4j/neo4j-pv.yaml Normal file
View File

@ -0,0 +1,21 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-neo4j
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/neo4j
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

107
k8s/neo4j/neo4j.yaml Normal file
View File

@ -0,0 +1,107 @@
apiVersion: v1
kind: Service
metadata: { name: neo4j, namespace: graph }
spec:
selector: { app: neo4j }
ports:
- { name: http, port: 7474, targetPort: 7474 }
- { name: bolt, port: 7687, targetPort: 7687 }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: neo4j, namespace: graph }
spec:
serviceName: neo4j
replicas: 1
selector: { matchLabels: { app: neo4j } }
template:
metadata: { labels: { app: neo4j } }
spec:
enableServiceLinks: false
nodeSelector: { node: hetzner-2 }
containers:
- name: neo4j
image: neo4j:5.20
env:
- name: NEO4J_AUTH
valueFrom: { secretKeyRef: { name: neo4j-auth, key: NEO4J_AUTH } }
- name: NEO4J_dbms_ssl_policy_bolt_enabled
value: "true"
- name: NEO4J_dbms_ssl_policy_bolt_base__directory
value: "/certs/bolt"
- name: NEO4J_dbms_ssl_policy_bolt_private__key
value: "tls.key"
- name: NEO4J_dbms_ssl_policy_bolt_public__certificate
value: "tls.crt"
- name: NEO4J_dbms_connector_bolt_tls__level
value: "REQUIRED"
# Advertise public hostname so the Browser uses the external FQDN for Bolt
- name: NEO4J_dbms_connector_bolt_advertised__address
value: "neo4j.betelgeusebytes.io:7687"
# also set a default advertised address (recommended)
- name: NEO4J_dbms_default__advertised__address
value: "neo4j.betelgeusebytes.io"
ports:
- { containerPort: 7474 }
- { containerPort: 7687 }
volumeMounts:
- { name: data, mountPath: /data }
- { name: bolt-certs, mountPath: /certs/bolt }
volumes:
- name: bolt-certs
secret:
secretName: neo4j-tls
items:
- key: tls.crt
path: tls.crt
- key: tls.key
path: tls.key
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 20Gi } }
---
apiVersion: v1
kind: Secret
metadata: { name: neo4j-auth, namespace: graph }
type: Opaque
stringData: { NEO4J_AUTH: "neo4j/NEO4J-PASS" }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: neo4j-http
namespace: graph
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: basic-auth-neo4j
# nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["neo4j.betelgeusebytes.io"], secretName: neo4j-tls }]
rules:
- host: neo4j.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: neo4j, port: { number: 7474 } } }
# create or update the tcp-services configmap
# kubectl -n ingress-nginx create configmap tcp-services \
# --from-literal="7687=graph/neo4j:7687" \
# -o yaml --dry-run=client | kubectl apply -f -
# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \
# --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]'
# kubectl -n ingress-nginx patch deployment ingress-nginx-controller \
# --type='json' -p='[
# {"op":"add","path":"/spec/template/spec/containers/0/ports/-","value":{"name":"tcp-7687","containerPort":7687,"hostPort":7687,"protocol":"TCP"}}
# ]'

View File

@ -0,0 +1,7 @@
apiVersion: v1
kind: Namespace
metadata:
name: observability
labels:
name: observability
monitoring: "true"

View File

@ -0,0 +1,95 @@
---
# Prometheus PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: prometheus-data-pv
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-storage
local:
path: /mnt/local-ssd/prometheus
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# Loki PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: loki-data-pv
spec:
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-storage
local:
path: /mnt/local-ssd/loki
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# Tempo PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: tempo-data-pv
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-storage
local:
path: /mnt/local-ssd/tempo
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# Grafana PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: grafana-data-pv
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-storage
local:
path: /mnt/local-ssd/grafana
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

View File

@ -0,0 +1,55 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-data
namespace: observability
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-storage
resources:
requests:
storage: 50Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: loki-data
namespace: observability
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-storage
resources:
requests:
storage: 100Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: tempo-data
namespace: observability
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-storage
resources:
requests:
storage: 50Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-data
namespace: observability
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-storage
resources:
requests:
storage: 10Gi

View File

@ -0,0 +1,169 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: observability
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'betelgeuse-k8s'
environment: 'production'
# Alerting configuration (optional - can add alertmanager later)
alerting:
alertmanagers:
- static_configs:
- targets: []
# Rule files
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
# Scrape Prometheus itself
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# Kubernetes API server
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Kubernetes nodes
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
# Kubernetes nodes cadvisor
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
# Kubernetes service endpoints
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
# Kubernetes pods
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
# kube-state-metrics
- job_name: 'kube-state-metrics'
static_configs:
- targets: ['kube-state-metrics.observability.svc.cluster.local:8080']
# node-exporter
- job_name: 'node-exporter'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app]
action: keep
regex: node-exporter
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: instance
# Grafana Loki
- job_name: 'loki'
static_configs:
- targets: ['loki.observability.svc.cluster.local:3100']
# Grafana Tempo
- job_name: 'tempo'
static_configs:
- targets: ['tempo.observability.svc.cluster.local:3200']
# Grafana
- job_name: 'grafana'
static_configs:
- targets: ['grafana.observability.svc.cluster.local:3000']

View File

@ -0,0 +1,94 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: loki-config
namespace: observability
data:
loki.yaml: |
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
log_level: info
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: 2024-01-01
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
storage_config:
tsdb_shipper:
active_index_directory: /loki/tsdb-index
cache_location: /loki/tsdb-cache
filesystem:
directory: /loki/chunks
compactor:
working_directory: /loki/compactor
compaction_interval: 10m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
limits_config:
      # enforce_metric_name was removed in Loki 3.x, so it is intentionally not set here
reject_old_samples: true
reject_old_samples_max_age: 168h # 7 days
retention_period: 168h # 7 days
max_query_length: 721h # 30 days for queries
max_query_parallelism: 32
max_streams_per_user: 0
max_global_streams_per_user: 0
ingestion_rate_mb: 50
ingestion_burst_size_mb: 100
per_stream_rate_limit: 10MB
per_stream_rate_limit_burst: 20MB
split_queries_by_interval: 15m
query_range:
align_queries_with_step: true
cache_results: true
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 500
frontend:
log_queries_longer_than: 5s
compress_responses: true
query_scheduler:
max_outstanding_requests_per_tenant: 2048
ingester:
chunk_idle_period: 30m
chunk_block_size: 262144
chunk_encoding: snappy
chunk_retain_period: 1m
max_chunk_age: 2h
wal:
enabled: true
dir: /loki/wal
flush_on_shutdown: true
replay_memory_ceiling: 1GB
analytics:
reporting_enabled: false

View File

@ -0,0 +1,72 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: tempo-config
namespace: observability
data:
tempo.yaml: |
server:
http_listen_port: 3200
log_level: info
distributor:
receivers:
jaeger:
protocols:
thrift_http:
endpoint: 0.0.0.0:14268
grpc:
endpoint: 0.0.0.0:14250
zipkin:
endpoint: 0.0.0.0:9411
otlp:
protocols:
http:
endpoint: 0.0.0.0:4318
grpc:
endpoint: 0.0.0.0:4317
ingester:
max_block_duration: 5m
compactor:
compaction:
block_retention: 168h # 7 days
metrics_generator:
registry:
external_labels:
source: tempo
cluster: betelgeuse-k8s
storage:
path: /tmp/tempo/generator/wal
remote_write:
- url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write
send_exemplars: true
storage:
trace:
backend: local
wal:
path: /tmp/tempo/wal
local:
path: /tmp/tempo/blocks
pool:
max_workers: 100
queue_depth: 10000
querier:
frontend_worker:
frontend_address: tempo.observability.svc.cluster.local:9095
query_frontend:
search:
duration_slo: 5s
throughput_bytes_slo: 1.073741824e+09
trace_by_id:
duration_slo: 5s
overrides:
defaults:
metrics_generator:
processors: [service-graphs, span-metrics]

View File

@ -0,0 +1,159 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: alloy-config
namespace: observability
data:
config.alloy: |
// Logging configuration
logging {
level = "info"
format = "logfmt"
}
// Discover Kubernetes pods for log collection
discovery.kubernetes "pods" {
role = "pod"
}
// Discover Kubernetes nodes
discovery.kubernetes "nodes" {
role = "node"
}
// Relabel pods for log collection
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pods.targets
// Only scrape pods with logs
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
action = "keep"
regex = ".+"
}
// Set the log path
rule {
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
target_label = "__path__"
separator = "/"
replacement = "/var/log/pods/*$1/*.log"
}
// Set namespace label
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
// Set pod name label
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
// Set container name label
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
target_label = "container"
}
// Set node name label
rule {
source_labels = ["__meta_kubernetes_pod_node_name"]
target_label = "node"
}
// Copy all pod labels
rule {
action = "labelmap"
regex = "__meta_kubernetes_pod_label_(.+)"
}
}
// Read logs from discovered pods
loki.source.kubernetes "pod_logs" {
targets = discovery.relabel.pod_logs.output
forward_to = [loki.process.pod_logs.receiver]
}
// Process and enrich logs
loki.process "pod_logs" {
forward_to = [loki.write.local.receiver]
// Parse JSON logs
stage.json {
expressions = {
level = "level",
message = "message",
timestamp = "timestamp",
}
}
// Extract log level
stage.labels {
values = {
level = "",
}
}
// Add cluster label
stage.static_labels {
values = {
cluster = "betelgeuse-k8s",
}
}
}
// Write logs to Loki
loki.write "local" {
endpoint {
url = "http://loki.observability.svc.cluster.local:3100/loki/api/v1/push"
}
}
// OpenTelemetry receiver for traces
otelcol.receiver.otlp "default" {
grpc {
endpoint = "0.0.0.0:4317"
}
http {
endpoint = "0.0.0.0:4318"
}
output {
traces = [otelcol.exporter.otlp.tempo.input]
metrics = [otelcol.exporter.prometheus.metrics.input]
}
}
// Export traces to Tempo
otelcol.exporter.otlp "tempo" {
client {
endpoint = "tempo.observability.svc.cluster.local:4317"
tls {
insecure = true
}
}
}
// Export OTLP metrics to Prometheus
otelcol.exporter.prometheus "metrics" {
forward_to = [prometheus.remote_write.local.receiver]
}
// Remote write to Prometheus
prometheus.remote_write "local" {
endpoint {
url = "http://prometheus.observability.svc.cluster.local:9090/api/v1/write"
}
}
// Scrape local metrics (Alloy's own metrics)
prometheus.scrape "alloy" {
targets = [{
__address__ = "localhost:12345",
}]
forward_to = [prometheus.remote_write.local.receiver]
}

View File

@ -0,0 +1,62 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: observability
data:
datasources.yaml: |
apiVersion: 1
datasources:
# Prometheus
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus.observability.svc.cluster.local:9090
isDefault: true
editable: true
jsonData:
timeInterval: 15s
queryTimeout: 60s
httpMethod: POST
# Loki
- name: Loki
type: loki
access: proxy
url: http://loki.observability.svc.cluster.local:3100
editable: true
jsonData:
maxLines: 1000
derivedFields:
- datasourceUid: tempo
matcherRegex: "traceID=(\\w+)"
name: TraceID
url: "$${__value.raw}"
# Tempo
- name: Tempo
type: tempo
access: proxy
url: http://tempo.observability.svc.cluster.local:3200
editable: true
uid: tempo
jsonData:
tracesToLogsV2:
datasourceUid: loki
spanStartTimeShift: -1h
spanEndTimeShift: 1h
filterByTraceID: true
filterBySpanID: false
customQuery: false
tracesToMetrics:
datasourceUid: prometheus
spanStartTimeShift: -1h
spanEndTimeShift: 1h
serviceMap:
datasourceUid: prometheus
nodeGraph:
enabled: true
search:
hide: false
lokiSearch:
datasourceUid: loki

View File

@ -0,0 +1,178 @@
---
# Prometheus ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: observability
---
# Prometheus ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
# Prometheus ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: observability
---
# Alloy ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: alloy
namespace: observability
---
# Alloy ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: alloy
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
---
# Alloy ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: alloy
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: alloy
subjects:
- kind: ServiceAccount
name: alloy
namespace: observability
---
# kube-state-metrics ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: observability
---
# kube-state-metrics ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources:
- configmaps
- secrets
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- cronjobs
- jobs
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
- apiGroups: ["policy"]
resources:
- poddisruptionbudgets
verbs: ["list", "watch"]
- apiGroups: ["certificates.k8s.io"]
resources:
- certificatesigningrequests
verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
resources:
- storageclasses
- volumeattachments
verbs: ["list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs: ["list", "watch"]
- apiGroups: ["networking.k8s.io"]
resources:
- networkpolicies
- ingresses
verbs: ["list", "watch"]
- apiGroups: ["coordination.k8s.io"]
resources:
- leases
verbs: ["list", "watch"]
---
# kube-state-metrics ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: observability

View File

@ -0,0 +1,90 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
namespace: observability
labels:
app: prometheus
spec:
serviceName: prometheus
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
spec:
serviceAccountName: prometheus
nodeSelector:
kubernetes.io/hostname: hetzner-2
containers:
- name: prometheus
image: prom/prometheus:v2.54.1
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=7d'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--web.enable-lifecycle'
        - '--web.enable-admin-api'
        - '--web.enable-remote-write-receiver'  # required so Alloy and Tempo can remote_write to /api/v1/write
ports:
- name: http
containerPort: 9090
protocol: TCP
livenessProbe:
httpGet:
path: /-/healthy
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /-/ready
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 500m
memory: 2Gi
limits:
cpu: 2000m
memory: 4Gi
volumeMounts:
- name: prometheus-config
mountPath: /etc/prometheus
- name: prometheus-data
mountPath: /prometheus
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-data
persistentVolumeClaim:
claimName: prometheus-data
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: observability
labels:
app: prometheus
spec:
type: ClusterIP
ports:
- port: 9090
targetPort: http
protocol: TCP
name: http
selector:
app: prometheus

View File

@ -0,0 +1,96 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: loki
namespace: observability
labels:
app: loki
spec:
serviceName: loki
replicas: 1
selector:
matchLabels:
app: loki
template:
metadata:
labels:
app: loki
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3100"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
fsGroup: 10001
runAsGroup: 10001
runAsNonRoot: true
runAsUser: 10001
containers:
- name: loki
image: grafana/loki:3.2.1
args:
- '-config.file=/etc/loki/loki.yaml'
- '-target=all'
ports:
- name: http
containerPort: 3100
protocol: TCP
- name: grpc
containerPort: 9096
protocol: TCP
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 45
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 45
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2000m
memory: 2Gi
volumeMounts:
- name: loki-config
mountPath: /etc/loki
- name: loki-data
mountPath: /loki
volumes:
- name: loki-config
configMap:
name: loki-config
- name: loki-data
persistentVolumeClaim:
claimName: loki-data
---
apiVersion: v1
kind: Service
metadata:
name: loki
namespace: observability
labels:
app: loki
spec:
type: ClusterIP
ports:
- port: 3100
targetPort: http
protocol: TCP
name: http
- port: 9096
targetPort: grpc
protocol: TCP
name: grpc
selector:
app: loki

View File

@ -0,0 +1,118 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: tempo
namespace: observability
labels:
app: tempo
spec:
serviceName: tempo
replicas: 1
selector:
matchLabels:
app: tempo
template:
metadata:
labels:
app: tempo
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3200"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
containers:
- name: tempo
image: grafana/tempo:2.6.1
args:
- '-config.file=/etc/tempo/tempo.yaml'
ports:
- name: http
containerPort: 3200
protocol: TCP
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
- name: jaeger-grpc
containerPort: 14250
protocol: TCP
- name: jaeger-http
containerPort: 14268
protocol: TCP
- name: zipkin
containerPort: 9411
protocol: TCP
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2000m
memory: 2Gi
volumeMounts:
- name: tempo-config
mountPath: /etc/tempo
- name: tempo-data
mountPath: /tmp/tempo
volumes:
- name: tempo-config
configMap:
name: tempo-config
- name: tempo-data
persistentVolumeClaim:
claimName: tempo-data
---
apiVersion: v1
kind: Service
metadata:
name: tempo
namespace: observability
labels:
app: tempo
spec:
type: ClusterIP
ports:
- port: 3200
targetPort: http
protocol: TCP
name: http
- port: 4317
targetPort: otlp-grpc
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: otlp-http
protocol: TCP
name: otlp-http
- port: 14250
targetPort: jaeger-grpc
protocol: TCP
name: jaeger-grpc
- port: 14268
targetPort: jaeger-http
protocol: TCP
name: jaeger-http
- port: 9411
targetPort: zipkin
protocol: TCP
name: zipkin
selector:
app: tempo

View File

@ -0,0 +1,97 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: grafana
namespace: observability
labels:
app: grafana
spec:
serviceName: grafana
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
fsGroup: 472
runAsGroup: 472
runAsUser: 472
containers:
- name: grafana
image: grafana/grafana:11.4.0
ports:
- name: http
containerPort: 3000
protocol: TCP
env:
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: admin # Change this in production!
- name: GF_INSTALL_PLUGINS
value: ""
- name: GF_FEATURE_TOGGLES_ENABLE
value: "traceqlEditor,correlations"
- name: GF_AUTH_ANONYMOUS_ENABLED
value: "false"
- name: GF_ANALYTICS_REPORTING_ENABLED
value: "false"
- name: GF_ANALYTICS_CHECK_FOR_UPDATES
value: "false"
livenessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
volumeMounts:
- name: grafana-data
mountPath: /var/lib/grafana
- name: grafana-datasources
mountPath: /etc/grafana/provisioning/datasources
volumes:
- name: grafana-data
persistentVolumeClaim:
claimName: grafana-data
- name: grafana-datasources
configMap:
name: grafana-datasources
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: observability
labels:
app: grafana
spec:
type: ClusterIP
ports:
- port: 3000
targetPort: http
protocol: TCP
name: http
selector:
app: grafana

View File

@ -0,0 +1,107 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: alloy
namespace: observability
labels:
app: alloy
spec:
selector:
matchLabels:
app: alloy
template:
metadata:
labels:
app: alloy
spec:
serviceAccountName: alloy
hostNetwork: true
hostPID: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: alloy
image: grafana/alloy:v1.5.1
args:
- run
- /etc/alloy/config.alloy
- --storage.path=/var/lib/alloy
- --server.http.listen-addr=0.0.0.0:12345
ports:
- name: http-metrics
containerPort: 12345
protocol: TCP
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
env:
- name: HOSTNAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
privileged: true
runAsUser: 0
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumeMounts:
- name: config
mountPath: /etc/alloy
- name: varlog
mountPath: /var/log
readOnly: true
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
- name: etcmachineid
mountPath: /etc/machine-id
readOnly: true
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: config
configMap:
name: alloy-config
- name: varlog
hostPath:
path: /var/log
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
- name: etcmachineid
hostPath:
path: /etc/machine-id
---
apiVersion: v1
kind: Service
metadata:
name: alloy
namespace: observability
labels:
app: alloy
spec:
type: ClusterIP
ports:
- port: 12345
targetPort: http-metrics
protocol: TCP
name: http-metrics
- port: 4317
targetPort: otlp-grpc
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: otlp-http
protocol: TCP
name: otlp-http
selector:
app: alloy

View File

@ -0,0 +1,71 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: observability
labels:
app: kube-state-metrics
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
ports:
- name: http-metrics
containerPort: 8080
- name: telemetry
containerPort: 8081
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
---
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
namespace: observability
labels:
app: kube-state-metrics
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
spec:
type: ClusterIP
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry
selector:
app: kube-state-metrics

View File

@ -0,0 +1,85 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: observability
labels:
app: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
spec:
hostNetwork: true
hostPID: true
containers:
- name: node-exporter
image: prom/node-exporter:v1.8.2
args:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
ports:
- name: metrics
containerPort: 9100
protocol: TCP
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumeMounts:
- name: proc
mountPath: /host/proc
readOnly: true
- name: sys
mountPath: /host/sys
readOnly: true
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
---
apiVersion: v1
kind: Service
metadata:
name: node-exporter
namespace: observability
labels:
app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
spec:
type: ClusterIP
clusterIP: None
ports:
- name: metrics
port: 9100
targetPort: metrics
selector:
app: node-exporter

View File

@ -0,0 +1,26 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- grafana.betelgeusebytes.io
secretName: grafana-tls
rules:
- host: grafana.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: grafana
port:
number: 3000

View File

@ -0,0 +1,90 @@
---
# Optional: Prometheus Ingress (for direct access to Prometheus UI)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
# Optional: Add basic auth for security
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth
# nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
spec:
ingressClassName: nginx
tls:
- hosts:
- prometheus.betelgeusebytes.io
secretName: prometheus-tls
rules:
- host: prometheus.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: prometheus
port:
number: 9090
---
# Optional: Loki Ingress (for direct API access)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: loki-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- loki.betelgeusebytes.io
secretName: loki-tls
rules:
- host: loki.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: loki
port:
number: 3100
---
# Optional: Tempo Ingress (for direct API access)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: tempo-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- tempo.betelgeusebytes.io
secretName: tempo-tls
rules:
- host: tempo.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: tempo
port:
number: 3200

View File

@ -0,0 +1,359 @@
# Observability Stack Deployment Checklist
Use this checklist to ensure a smooth deployment of the observability stack.
## Pre-Deployment
### Check for Existing Monitoring Stack
- [ ] Check if you have existing monitoring components:
```bash
# Check for monitoring namespaces
kubectl get namespaces | grep -E "(monitoring|prometheus|grafana|loki|tempo)"
# Check for monitoring pods in common namespaces
kubectl get pods -n monitoring 2>/dev/null || true
kubectl get pods -n prometheus 2>/dev/null || true
kubectl get pods -n grafana 2>/dev/null || true
kubectl get pods -A | grep -E "(prometheus|grafana|loki|tempo|fluent-bit|vector)"
# Check for Helm releases
helm list -A | grep -E "(prometheus|grafana|loki|tempo)"
```
- [ ] If existing monitoring is found, remove it first:
```bash
./remove-old-monitoring.sh
```
**OR** run the deployment script which will prompt you:
```bash
./deploy.sh # Will ask if you want to clean up first
```
### Prerequisites
- [ ] Kubernetes cluster is running
- [ ] NGINX Ingress Controller is installed
- [ ] cert-manager is installed with Let's Encrypt ClusterIssuer
- [ ] DNS record `grafana.betelgeusebytes.io` points to cluster IP
- [ ] Node is labeled `kubernetes.io/hostname=hetzner-2`
- [ ] kubectl is configured and working
### Verify Prerequisites
```bash
# Check cluster
kubectl cluster-info
# Check NGINX Ingress
kubectl get pods -n ingress-nginx
# Check cert-manager
kubectl get pods -n cert-manager
# Check node label
kubectl get nodes --show-labels | grep hetzner-2
# Check DNS (from external machine)
dig grafana.betelgeusebytes.io
```
## Deployment Steps
### Step 1: Prepare Storage
- [ ] SSH into hetzner-2 node
- [ ] Create directories:
```bash
sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana}
```
- [ ] Set correct permissions:
```bash
sudo chown -R 65534:65534 /mnt/local-ssd/prometheus
sudo chown -R 10001:10001 /mnt/local-ssd/loki
sudo chown -R root:root /mnt/local-ssd/tempo
sudo chown -R 472:472 /mnt/local-ssd/grafana
```
- [ ] Verify permissions:
```bash
ls -la /mnt/local-ssd/
```
### Step 2: Review Configuration
- [ ] Review `03-prometheus-config.yaml` - verify scrape targets
- [ ] Review `04-loki-config.yaml` - verify retention (7 days)
- [ ] Review `05-tempo-config.yaml` - verify retention (7 days)
- [ ] Review `06-alloy-config.yaml` - verify endpoints
- [ ] Review `20-grafana-ingress.yaml` - verify domain name
### Step 3: Deploy the Stack
- [ ] Navigate to observability-stack directory
```bash
cd /path/to/observability-stack
```
- [ ] Make scripts executable (already done):
```bash
chmod +x *.sh
```
- [ ] Run deployment script:
```bash
./deploy.sh
```
OR deploy manually:
```bash
kubectl apply -f 00-namespace.yaml
kubectl apply -f 01-persistent-volumes.yaml
kubectl apply -f 02-persistent-volume-claims.yaml
kubectl apply -f 03-prometheus-config.yaml
kubectl apply -f 04-loki-config.yaml
kubectl apply -f 05-tempo-config.yaml
kubectl apply -f 06-alloy-config.yaml
kubectl apply -f 07-grafana-datasources.yaml
kubectl apply -f 08-rbac.yaml
kubectl apply -f 10-prometheus.yaml
kubectl apply -f 11-loki.yaml
kubectl apply -f 12-tempo.yaml
kubectl apply -f 13-grafana.yaml
kubectl apply -f 14-alloy.yaml
kubectl apply -f 15-kube-state-metrics.yaml
kubectl apply -f 16-node-exporter.yaml
kubectl apply -f 20-grafana-ingress.yaml
```
### Step 4: Verify Deployment
- [ ] Run status check:
```bash
./status.sh
```
- [ ] Check all PersistentVolumes are Bound:
```bash
kubectl get pv
```
- [ ] Check all PersistentVolumeClaims are Bound:
```bash
kubectl get pvc -n observability
```
- [ ] Check all pods are Running:
```bash
kubectl get pods -n observability
```
Expected pods:
- [x] prometheus-0
- [x] loki-0
- [x] tempo-0
- [x] grafana-0
- [x] alloy-xxxxx (one per node)
- [x] kube-state-metrics-xxxxx
- [x] node-exporter-xxxxx (one per node)
- [ ] Check services are created:
```bash
kubectl get svc -n observability
```
- [ ] Check ingress is created:
```bash
kubectl get ingress -n observability
```
- [ ] Verify TLS certificate is issued:
```bash
kubectl get certificate -n observability
kubectl describe certificate grafana-tls -n observability
```
### Step 5: Test Connectivity
- [ ] Test Prometheus endpoint:
```bash
kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \
curl http://prometheus.observability.svc.cluster.local:9090/-/healthy
```
- [ ] Test Loki endpoint:
```bash
kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \
curl http://loki.observability.svc.cluster.local:3100/ready
```
- [ ] Test Tempo endpoint:
```bash
kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \
curl http://tempo.observability.svc.cluster.local:3200/ready
```
- [ ] Test Grafana endpoint:
```bash
kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \
curl http://grafana.observability.svc.cluster.local:3000/api/health
```
## Post-Deployment Configuration
### Step 6: Access Grafana
- [ ] Open browser to: https://grafana.betelgeusebytes.io
- [ ] Login with default credentials:
- Username: `admin`
- Password: `admin`
- [ ] **CRITICAL**: Change the admin password immediately (a CLI sketch follows this list)
- [ ] Verify datasources are configured:
- Go to Configuration → Data Sources
- Should see: Prometheus (default), Loki, Tempo
- Click "Test" on each datasource
### Step 7: Verify Data Collection
- [ ] Check Prometheus has targets:
- In Grafana, Explore → Prometheus
- Query: `up`
- Should see multiple targets with value=1
- [ ] Check Loki is receiving logs:
- In Grafana, Explore → Loki
- Query: `{namespace="observability"}`
- Should see logs from observability stack
- [ ] Check kube-state-metrics:
- In Grafana, Explore → Prometheus
- Query: `kube_pod_status_phase`
- Should see pod status metrics
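The same checks can be run from the terminal against the HTTP APIs; a minimal sketch (assumes `jq` is installed on your workstation and uses the in-cluster service names from this stack):
```bash
# Prometheus: every scraped target should report up == 1
kubectl port-forward -n observability svc/prometheus 9090:9090 &
PF_PID=$!
sleep 2
curl -s 'http://localhost:9090/api/v1/query?query=up' \
  | jq '.data.result[] | {job: .metric.job, value: .value[1]}'
kill $PF_PID

# Loki: a non-empty label list confirms logs are being ingested
kubectl port-forward -n observability svc/loki 3100:3100 &
PF_PID=$!
sleep 2
curl -s 'http://localhost:3100/loki/api/v1/labels' | jq .
kill $PF_PID
```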
### Step 8: Import Dashboards (Optional)
- [ ] Import Kubernetes cluster dashboard:
- Dashboards → Import → ID: 315
- [ ] Import Node Exporter dashboard:
- Dashboards → Import → ID: 1860
- [ ] Import Loki dashboard:
- Dashboards → Import → ID: 13639
### Step 9: Test with Demo App (Optional)
- [ ] Deploy demo application:
```bash
kubectl apply -f demo-app.yaml
```
- [ ] Wait for pod to be ready:
```bash
kubectl wait --for=condition=ready pod -l app=demo-app -n observability --timeout=300s
```
- [ ] Test the endpoints:
```bash
kubectl port-forward -n observability svc/demo-app 8080:8080
# In another terminal:
curl http://localhost:8080/
curl http://localhost:8080/items
curl http://localhost:8080/slow
curl http://localhost:8080/error
```
- [ ] Verify in Grafana:
- Logs: `{app="demo-app"}`
- Metrics: `flask_http_request_total`
- Traces: Search for "demo-app" service in Tempo
## Monitoring and Maintenance
### Daily Checks
- [ ] Check pod status: `kubectl get pods -n observability`
- [ ] Check resource usage: `kubectl top pods -n observability`
- [ ] Check disk usage on hetzner-2: `df -h /mnt/local-ssd/`
### Weekly Checks
- [ ] Review Grafana for any alerts or anomalies
- [ ] Verify TLS certificate is valid
- [ ] Check logs for any errors:
```bash
kubectl logs -n observability -l app=prometheus --tail=100
kubectl logs -n observability -l app=loki --tail=100
kubectl logs -n observability -l app=tempo --tail=100
kubectl logs -n observability -l app=grafana --tail=100
```
### Monthly Checks
- [ ] Review retention policies (7 days is appropriate)
- [ ] Check storage growth trends
- [ ] Review and update dashboards
- [ ] Backup Grafana dashboards and configs
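A rough backup sketch using Grafana's HTTP API (assumes `jq` is available and that you substitute real admin credentials or an API token):
```bash
# Export every dashboard as JSON, plus the provisioning ConfigMap.
GRAFANA_URL="https://grafana.betelgeusebytes.io"
AUTH="admin:REPLACE_WITH_ADMIN_PASSWORD"
BACKUP_DIR="grafana-backup-$(date +%Y%m%d)"
mkdir -p "$BACKUP_DIR"

for uid in $(curl -su "$AUTH" "$GRAFANA_URL/api/search?type=dash-db" | jq -r '.[].uid'); do
  curl -su "$AUTH" "$GRAFANA_URL/api/dashboards/uid/$uid" > "$BACKUP_DIR/$uid.json"
done

kubectl get configmap grafana-datasources -n observability -o yaml > "$BACKUP_DIR/grafana-datasources.yaml"
```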
## Troubleshooting Guide
### Pod Won't Start
1. Check events: `kubectl describe pod <pod-name> -n observability`
2. Check logs: `kubectl logs <pod-name> -n observability`
3. Check storage: `kubectl get pv` and `kubectl get pvc -n observability`
4. Verify node has space: SSH to hetzner-2 and run `df -h`
### No Logs Appearing
1. Check Alloy pods: `kubectl get pods -n observability -l app=alloy`
2. Check Alloy logs: `kubectl logs -n observability -l app=alloy`
3. Check Loki is running: `kubectl get pods -n observability -l app=loki`
4. Test Loki endpoint from Alloy pod
### No Metrics Appearing
1. Check Prometheus targets: Port-forward and visit http://localhost:9090/targets
2. Check service discovery: Look for "kubernetes-*" targets
3. Verify RBAC: `kubectl get clusterrolebinding prometheus`
4. Check kube-state-metrics: `kubectl get pods -n observability -l app=kube-state-metrics`
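A quick way to spot unhealthy targets from the terminal (assumes `jq`):
```bash
# List any Prometheus target that is not healthy, with its last scrape error.
kubectl port-forward -n observability svc/prometheus 9090:9090 &
PF_PID=$!
sleep 2
curl -s http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | select(.health != "up") | {job: .labels.job, instance: .labels.instance, lastError: .lastError}'
kill $PF_PID
```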
### Grafana Can't Connect to Datasources
1. Test from Grafana pod:
```bash
kubectl exec -it grafana-0 -n observability -- wget -O- http://prometheus.observability.svc.cluster.local:9090/-/healthy
```
2. Check datasource configuration in Grafana UI
3. Verify services exist: `kubectl get svc -n observability`
### High Resource Usage
1. Check actual usage: `kubectl top pods -n observability`
2. Check node capacity: `kubectl top nodes`
3. Consider reducing retention periods
4. Review and adjust resource limits
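As a quick fix, limits can be bumped in place with `kubectl set resources` (the values below are only illustrative); for anything permanent, update the manifest and re-apply so Git stays the source of truth:
```bash
# Example: raise Prometheus CPU/memory limits; this triggers a rolling restart of the StatefulSet.
kubectl -n observability set resources statefulset/prometheus -c prometheus \
  --requests=cpu=500m,memory=3Gi --limits=cpu=2,memory=6Gi
```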
## Rollback Procedure
If something goes wrong:
1. Remove the deployment:
```bash
./cleanup.sh
```
2. Fix the issue in configuration files
3. Redeploy:
```bash
./deploy.sh
```
## Success Criteria
All checked items below indicate successful deployment:
- [x] All pods are in Running state
- [x] All PVCs are Bound
- [x] Grafana is accessible at https://grafana.betelgeusebytes.io
- [x] All three datasources (Prometheus, Loki, Tempo) test successfully
- [x] Prometheus shows targets as "up"
- [x] Loki shows logs from observability namespace
- [x] TLS certificate is valid and auto-renewing
- [x] Admin password has been changed
- [x] Resource usage is within acceptable limits
## Documentation References
- **README.md**: Comprehensive documentation
- **QUICKREF.md**: Quick reference for common operations
- **demo-app.yaml**: Example instrumented application
- **deploy.sh**: Automated deployment script
- **cleanup.sh**: Removal script
- **status.sh**: Status checking script
## Next Steps After Deployment
1. Import useful dashboards from Grafana.com
2. Configure alerts (requires Alertmanager - not included)
3. Instrument your applications to send logs/metrics/traces
4. Create custom dashboards for your specific needs
5. Set up backup procedures for Grafana dashboards
6. Document your team's observability practices
## Notes
- Default retention: 7 days for all components
- Default resources are optimized for single-node cluster
- Scale up resources if monitoring high-traffic applications
- Always backup before making configuration changes
- Test changes in a non-production environment first
---
**Deployment Date**: _______________
**Deployed By**: _______________
**Grafana Version**: 11.4.0
**Stack Version**: January 2025

View File

@ -0,0 +1,146 @@
# DNS Configuration Guide
## Required DNS Records
### Minimum Setup (Recommended)
Only **one** DNS record is required for basic operation:
```
grafana.betelgeusebytes.io A/CNAME <your-cluster-ip>
```
This gives you access to the complete observability stack through Grafana's unified interface.
## Optional DNS Records
If you want direct access to individual components, add these DNS records:
```
prometheus.betelgeusebytes.io A/CNAME <your-cluster-ip>
loki.betelgeusebytes.io A/CNAME <your-cluster-ip>
tempo.betelgeusebytes.io A/CNAME <your-cluster-ip>
```
Then deploy the optional ingresses:
```bash
kubectl apply -f 21-optional-ingresses.yaml
```
## DNS Record Types
**Option 1: A Record (Direct IP)**
```
Type: A
Name: grafana.betelgeusebytes.io
Value: 1.2.3.4 (your cluster's public IP)
TTL: 300
```
**Option 2: CNAME (Alias to another domain)**
```
Type: CNAME
Name: grafana.betelgeusebytes.io
Value: your-server.example.com
TTL: 300
```
## Access URLs Summary
### After DNS Setup
| Service | URL | Purpose | DNS Required? |
|---------|-----|---------|---------------|
| **Grafana** | https://grafana.betelgeusebytes.io | Main dashboard (logs/metrics/traces) | ✅ Yes |
| **Prometheus** | https://prometheus.betelgeusebytes.io | Metrics UI (optional) | ⚠️ Optional |
| **Loki** | https://loki.betelgeusebytes.io | Logs API (optional) | ⚠️ Optional |
| **Tempo** | https://tempo.betelgeusebytes.io | Traces API (optional) | ⚠️ Optional |
### Internal (No DNS Needed)
These services are accessible from within your cluster only:
```
# Metrics
http://prometheus.observability.svc.cluster.local:9090
# Logs
http://loki.observability.svc.cluster.local:3100
# Traces (OTLP endpoints for your apps)
http://tempo.observability.svc.cluster.local:4317 # gRPC
http://tempo.observability.svc.cluster.local:4318 # HTTP
# Grafana (internal)
http://grafana.observability.svc.cluster.local:3000
```
## Verification
After setting up DNS, verify it's working:
```bash
# Check DNS resolution
dig grafana.betelgeusebytes.io
nslookup grafana.betelgeusebytes.io
# Should return your cluster IP
# Test HTTPS access
curl -I https://grafana.betelgeusebytes.io
# Should return 200 OK or 302 redirect
```
## TLS Certificate
Let's Encrypt will automatically issue certificates for:
- grafana.betelgeusebytes.io (required)
- prometheus.betelgeusebytes.io (if optional ingress deployed)
- loki.betelgeusebytes.io (if optional ingress deployed)
- tempo.betelgeusebytes.io (if optional ingress deployed)
Check certificate status:
```bash
kubectl get certificate -n observability
kubectl describe certificate grafana-tls -n observability
```
## Recommendation
**For most users:** Just configure `grafana.betelgeusebytes.io`
Why?
- Single DNS record to manage
- Grafana provides unified access to all components
- Simpler certificate management
- All functionality available through one interface
**For advanced users:** Add optional DNS records if you need:
- Direct Prometheus UI access for debugging
- External log/trace ingestion
- API integrations
- Programmatic queries outside Grafana
## Troubleshooting
**DNS not resolving:**
- Check DNS propagation: https://dnschecker.org/
- Wait 5-15 minutes for DNS to propagate
- Verify your DNS provider settings
**Certificate not issued:**
```bash
# Check cert-manager
kubectl get pods -n cert-manager
# Check certificate request
kubectl describe certificate grafana-tls -n observability
# Check challenges
kubectl get challenges -n observability
```
**403/404 errors:**
- Verify ingress is created: `kubectl get ingress -n observability`
- Check NGINX ingress controller: `kubectl get pods -n ingress-nginx`
- Check ingress logs: `kubectl logs -n ingress-nginx <nginx-pod>`

View File

@ -0,0 +1,572 @@
# Access URLs & Monitoring New Applications Guide
## 🌐 Access URLs
### Required (Already Configured)
**Grafana - Main Dashboard**
- **URL**: https://grafana.betelgeusebytes.io
- **DNS Required**: Yes - `grafana.betelgeusebytes.io` → your cluster IP
- **Login**: admin / admin (change on first login!)
- **Purpose**: Unified interface for logs, metrics, and traces
- **Ingress**: Already included in deployment (20-grafana-ingress.yaml)
### Optional (Direct Component Access)
You can optionally expose these components directly:
**Prometheus - Metrics UI**
- **URL**: https://prometheus.betelgeusebytes.io
- **DNS Required**: Yes - `prometheus.betelgeusebytes.io` → your cluster IP
- **Purpose**: Direct access to Prometheus UI, query metrics, check targets
- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml`
- **Use Case**: Debugging metric collection, advanced PromQL queries
**Loki - Logs API**
- **URL**: https://loki.betelgeusebytes.io
- **DNS Required**: Yes - `loki.betelgeusebytes.io` → your cluster IP
- **Purpose**: Direct API access for log queries
- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml`
- **Use Case**: External log forwarding, API integration
**Tempo - Traces API**
- **URL**: https://tempo.betelgeusebytes.io
- **DNS Required**: Yes - `tempo.betelgeusebytes.io` → your cluster IP
- **Purpose**: Direct API access for trace queries
- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml`
- **Use Case**: External trace ingestion, API integration
### Internal Only (No DNS Required)
These are ClusterIP services accessible only from within the cluster:
```
http://prometheus.observability.svc.cluster.local:9090
http://loki.observability.svc.cluster.local:3100
http://tempo.observability.svc.cluster.local:3200
http://tempo.observability.svc.cluster.local:4317 # OTLP gRPC
http://tempo.observability.svc.cluster.local:4318 # OTLP HTTP
```
## 🎯 Recommendation
**For most users**: Just use Grafana (grafana.betelgeusebytes.io)
- Grafana provides unified access to all components
- No need to expose Prometheus, Loki, or Tempo directly
- Simpler DNS configuration (only one subdomain)
**For power users**: Add optional ingresses
- Direct Prometheus access is useful for debugging
- Helps verify targets and scrape configs
- Deploy with: `kubectl apply -f 21-optional-ingresses.yaml`
## 📊 Monitoring New Applications
### Automatic: Kubernetes Logs
**All pod logs are automatically collected!** No configuration needed.
Alloy runs as a DaemonSet and automatically:
1. Discovers all pods in the cluster
2. Reads logs from `/var/log/pods/`
3. Sends them to Loki with labels:
- `namespace`
- `pod`
- `container`
- `node`
- All pod labels
**View in Grafana:**
```logql
# All logs from your app
{namespace="your-namespace", pod=~"your-app.*"}
# Error logs only
{namespace="your-namespace"} |= "error"
# JSON logs parsed
{namespace="your-namespace"} | json | level="error"
```
**Best Practice for Logs:**
Emit structured JSON logs from your application:
```python
import json
import logging
# Python example
logging.basicConfig(
format='%(message)s',
level=logging.INFO
)
logger = logging.getLogger(__name__)
# Log as JSON
logger.info(json.dumps({
"level": "info",
"message": "User login successful",
"user_id": "123",
"ip": "1.2.3.4",
"duration_ms": 42
}))
```
### Manual: Application Metrics
#### Step 1: Expose Metrics Endpoint
Your application needs to expose metrics at `/metrics` in Prometheus format.
**Python (Flask) Example:**
```python
from flask import Flask
from prometheus_flask_exporter import PrometheusMetrics
app = Flask(__name__)
metrics = PrometheusMetrics(app)
# Now /metrics endpoint is available
# Automatic metrics: request count, duration, etc.
```
**Python (FastAPI) Example:**
```python
from fastapi import FastAPI
from prometheus_fastapi_instrumentator import Instrumentator
app = FastAPI()
Instrumentator().instrument(app).expose(app)
# /metrics endpoint is now available
```
**Go Example:**
```go
import (
"github.com/prometheus/client_golang/prometheus/promhttp"
"net/http"
)
http.Handle("/metrics", promhttp.Handler())
```
**Node.js Example:**
```javascript
const promClient = require('prom-client');
// Create default metrics
const register = new promClient.Registry();
promClient.collectDefaultMetrics({ register });
// Expose /metrics endpoint
app.get('/metrics', async (req, res) => {
res.set('Content-Type', register.contentType);
res.end(await register.metrics());
});
```
#### Step 2: Add Prometheus Annotations to Your Deployment
Add these annotations to your pod template:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: my-app
namespace: my-namespace
spec:
template:
metadata:
annotations:
prometheus.io/scrape: "true" # Enable scraping
prometheus.io/port: "8080" # Port where metrics are exposed
prometheus.io/path: "/metrics" # Path to metrics (optional, /metrics is default)
spec:
containers:
- name: my-app
image: my-app:latest
ports:
- name: http
containerPort: 8080
```
#### Step 3: Verify Metrics Collection
**Check in Prometheus:**
1. Access Prometheus UI (if exposed): https://prometheus.betelgeusebytes.io
2. Go to Status → Targets
3. Look for your pod under "kubernetes-pods"
4. Should show as "UP"
**Or via Grafana:**
1. Go to Explore → Prometheus
2. Query: `up{pod=~"my-app.*"}`
3. Should return value=1
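If the target is missing, it can help to first confirm the `/metrics` endpoint itself responds from inside the cluster. A quick sketch, assuming a ClusterIP Service named `my-app` in `my-namespace` fronting port 8080 (as in the complete example later in this guide):
```bash
# Hit the app's metrics endpoint from a throwaway curl pod.
kubectl run -it --rm metrics-test --image=curlimages/curl --restart=Never -- \
  curl -s http://my-app.my-namespace.svc.cluster.local:8080/metrics
```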
**Query your metrics:**
```promql
# Request rate
rate(http_requests_total{namespace="my-namespace"}[5m])
# Request duration 95th percentile
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Error rate
rate(http_requests_total{namespace="my-namespace", status=~"5.."}[5m])
```
### Manual: Application Traces
#### Step 1: Add OpenTelemetry to Your Application
**Python Example:**
```python
from flask import Flask
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure resource
resource = Resource.create({"service.name": "my-app"})
# Setup tracer
trace_provider = TracerProvider(resource=resource)
trace_provider.add_span_processor(
BatchSpanProcessor(
OTLPSpanExporter(
endpoint="http://tempo.observability.svc.cluster.local:4317",
insecure=True
)
)
)
trace.set_tracer_provider(trace_provider)
# Auto-instrument Flask
app = Flask(__name__)
FlaskInstrumentor().instrument_app(app)
# Manual spans
tracer = trace.get_tracer(__name__)
@app.route('/api/data')
def get_data():
with tracer.start_as_current_span("fetch_data") as span:
# Your code here
span.set_attribute("rows", 100)
return {"data": "..."}
```
**Install dependencies:**
```bash
pip install opentelemetry-api opentelemetry-sdk \
opentelemetry-instrumentation-flask \
opentelemetry-exporter-otlp-proto-grpc
```
**Go Example:**
```go
import (
    "context"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/sdk/trace"
)
exporter, _ := otlptracegrpc.New(
context.Background(),
otlptracegrpc.WithEndpoint("tempo.observability.svc.cluster.local:4317"),
otlptracegrpc.WithInsecure(),
)
tp := trace.NewTracerProvider(
trace.WithBatcher(exporter),
)
otel.SetTracerProvider(tp)
```
**Node.js Example:**
```javascript
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const provider = new NodeTracerProvider();
const exporter = new OTLPTraceExporter({
url: 'http://tempo.observability.svc.cluster.local:4317'
});
provider.addSpanProcessor(new BatchSpanProcessor(exporter));
provider.register();
```
#### Step 2: Add Trace IDs to Logs (Optional but Recommended)
This enables clicking from logs to traces in Grafana!
**Python Example:**
```python
import json
from opentelemetry import trace
def log_with_trace(message):
span = trace.get_current_span()
trace_id = format(span.get_span_context().trace_id, '032x')
log_entry = {
"message": message,
"trace_id": trace_id,
"level": "info"
}
print(json.dumps(log_entry))
```
#### Step 3: Verify Traces
**In Grafana:**
1. Go to Explore → Tempo
2. Search for service: "my-app"
3. Click on a trace to view details
4. Click "Logs for this span" to see correlated logs
## 📋 Complete Example: Monitoring a New App
Here's a complete deployment with all monitoring configured:
```yaml
---
apiVersion: v1
kind: ConfigMap
metadata:
name: my-app-config
namespace: my-namespace
data:
app.py: |
from flask import Flask
import logging
import json
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource
from prometheus_flask_exporter import PrometheusMetrics
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
# Setup tracing
resource = Resource.create({"service.name": "my-app"})
trace_provider = TracerProvider(resource=resource)
trace_provider.add_span_processor(
BatchSpanProcessor(
OTLPSpanExporter(
endpoint="http://tempo.observability.svc.cluster.local:4317",
insecure=True
)
)
)
trace.set_tracer_provider(trace_provider)
app = Flask(__name__)
# Setup metrics
metrics = PrometheusMetrics(app)
# Auto-instrument with traces
FlaskInstrumentor().instrument_app(app)
@app.route('/')
def index():
span = trace.get_current_span()
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info(json.dumps({
"level": "info",
"message": "Request received",
"trace_id": trace_id,
"endpoint": "/"
}))
return {"status": "ok", "trace_id": trace_id}
if __name__ == '__main__':
app.run(host='0.0.0.0', port=8080)
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: my-app
namespace: my-namespace
labels:
app: my-app
spec:
replicas: 2
selector:
matchLabels:
app: my-app
template:
metadata:
labels:
app: my-app
annotations:
# Enable Prometheus scraping
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
containers:
- name: my-app
image: python:3.11-slim
command:
- /bin/bash
- -c
- |
pip install flask opentelemetry-api opentelemetry-sdk \
opentelemetry-instrumentation-flask \
opentelemetry-exporter-otlp-proto-grpc \
prometheus-flask-exporter && \
python /app/app.py
ports:
- name: http
containerPort: 8080
volumeMounts:
- name: app-code
mountPath: /app
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: app-code
configMap:
name: my-app-config
---
apiVersion: v1
kind: Service
metadata:
name: my-app
namespace: my-namespace
labels:
app: my-app
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: http
protocol: TCP
name: http
selector:
app: my-app
```
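To try the example end to end, apply it (saved locally as, say, `my-app.yaml` — the filename is just an assumption) and generate some traffic:
```bash
# Create the namespace, deploy the example, and hit it a few times.
kubectl create namespace my-namespace --dry-run=client -o yaml | kubectl apply -f -
kubectl apply -f my-app.yaml
kubectl wait --for=condition=ready pod -l app=my-app -n my-namespace --timeout=300s

kubectl port-forward -n my-namespace svc/my-app 8080:8080 &
PF_PID=$!
sleep 2
for i in $(seq 1 20); do curl -s http://localhost:8080/ > /dev/null; done
curl -s http://localhost:8080/metrics | grep flask_http_request_total
kill $PF_PID
```
Within a minute or two the requests should show up as logs in Loki, `flask_http_request_total` samples in Prometheus, and traces for the `my-app` service in Tempo.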
## 🔍 Verification Checklist
After deploying a new app with monitoring:
### Logs ✓ (Automatic)
```bash
# Check logs appear in Grafana
# Explore → Loki → {namespace="my-namespace", pod=~"my-app.*"}
```
### Metrics ✓ (If configured)
```bash
# Check Prometheus is scraping
# Explore → Prometheus → up{pod=~"my-app.*"}
# Should return 1
# Check your custom metrics
# Explore → Prometheus → flask_http_request_total{namespace="my-namespace"}
```
### Traces ✓ (If configured)
```bash
# Check traces appear in Tempo
# Explore → Tempo → Search for service "my-app"
# Should see traces
# Verify log-trace correlation
# Click on a log line with trace_id → should jump to trace
```
## 🎓 Quick Start for Common Frameworks
### Python Flask/FastAPI
```bash
pip install opentelemetry-distro opentelemetry-exporter-otlp prometheus-flask-exporter
opentelemetry-bootstrap -a install
```
```bash
# Set environment variables in your deployment:
OTEL_SERVICE_NAME=my-app
OTEL_EXPORTER_OTLP_ENDPOINT=http://tempo.observability.svc.cluster.local:4317
OTEL_EXPORTER_OTLP_PROTOCOL=grpc
# Then run with auto-instrumentation:
opentelemetry-instrument python app.py
```
### Go
```bash
go get go.opentelemetry.io/otel
go get go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp
go get go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc
```
### Node.js
```bash
npm install @opentelemetry/sdk-node @opentelemetry/auto-instrumentations-node \
@opentelemetry/exporter-trace-otlp-grpc prom-client
```
## 📚 Summary
| Component | Automatic? | Configuration Needed |
|-----------|-----------|---------------------|
| **Logs** | ✅ Yes | None - just deploy your app |
| **Metrics** | ❌ No | Add /metrics endpoint + annotations |
| **Traces** | ❌ No | Add OpenTelemetry SDK + configure endpoint |
**Recommended Approach:**
1. **Start simple**: Deploy app, logs work automatically
2. **Add metrics**: Expose /metrics, add annotations
3. **Add traces**: Instrument with OpenTelemetry
4. **Correlate**: Add trace IDs to logs for full observability
## 🔗 Useful Links
- OpenTelemetry Python: https://opentelemetry.io/docs/instrumentation/python/
- OpenTelemetry Go: https://opentelemetry.io/docs/instrumentation/go/
- OpenTelemetry Node.js: https://opentelemetry.io/docs/instrumentation/js/
- Prometheus Client Libraries: https://prometheus.io/docs/instrumenting/clientlibs/
- Grafana Docs: https://grafana.com/docs/
## 🆘 Troubleshooting
**Logs not appearing:**
- Check Alloy is running: `kubectl get pods -n observability -l app=alloy`
- Check pod logs are being written to stdout/stderr
- View in real-time: `kubectl logs -f <pod-name> -n <namespace>`
**Metrics not being scraped:**
- Verify annotations are present: `kubectl get pod <pod> -o yaml | grep prometheus`
- Check /metrics endpoint: `kubectl port-forward pod/<pod> 8080:8080` then `curl localhost:8080/metrics`
- Check Prometheus targets: https://prometheus.betelgeusebytes.io/targets
**Traces not appearing:**
- Verify endpoint: `tempo.observability.svc.cluster.local:4317`
- Check Tempo logs: `kubectl logs -n observability tempo-0`
- Verify OTLP exporter is configured correctly in your app
- Check network policies allow traffic to observability namespace

View File

@ -0,0 +1,398 @@
# Observability Stack Quick Reference
## Before You Start
### Remove Old Monitoring Stack
If you have existing monitoring components, remove them first:
```bash
./remove-old-monitoring.sh
```
This will safely remove:
- Prometheus, Grafana, Loki, Tempo deployments
- Fluent Bit, Vector, or other log collectors
- Helm releases
- ConfigMaps, PVCs, RBAC resources
- Prometheus Operator CRDs
## Quick Access
- **Grafana UI**: https://grafana.betelgeusebytes.io
- **Default Login**: admin / admin (change immediately!)
## Essential Commands
### Check Status
```bash
# Quick status check
./status.sh
# View all pods
kubectl get pods -n observability -o wide
# Check specific component
kubectl get pods -n observability -l app=prometheus
kubectl get pods -n observability -l app=loki
kubectl get pods -n observability -l app=tempo
kubectl get pods -n observability -l app=grafana
# Check storage
kubectl get pv
kubectl get pvc -n observability
```
### View Logs
```bash
# Grafana
kubectl logs -n observability -l app=grafana -f
# Prometheus
kubectl logs -n observability -l app=prometheus -f
# Loki
kubectl logs -n observability -l app=loki -f
# Tempo
kubectl logs -n observability -l app=tempo -f
# Alloy (log collector)
kubectl logs -n observability -l app=alloy -f
```
### Restart Components
```bash
# Restart Prometheus
kubectl rollout restart statefulset/prometheus -n observability
# Restart Loki
kubectl rollout restart statefulset/loki -n observability
# Restart Tempo
kubectl rollout restart statefulset/tempo -n observability
# Restart Grafana
kubectl rollout restart statefulset/grafana -n observability
# Restart Alloy
kubectl rollout restart daemonset/alloy -n observability
```
### Update Configurations
```bash
# Edit Prometheus config
kubectl edit configmap prometheus-config -n observability
kubectl rollout restart statefulset/prometheus -n observability
# Edit Loki config
kubectl edit configmap loki-config -n observability
kubectl rollout restart statefulset/loki -n observability
# Edit Tempo config
kubectl edit configmap tempo-config -n observability
kubectl rollout restart statefulset/tempo -n observability
# Edit Alloy config
kubectl edit configmap alloy-config -n observability
kubectl rollout restart daemonset/alloy -n observability
# Edit Grafana datasources
kubectl edit configmap grafana-datasources -n observability
kubectl rollout restart statefulset/grafana -n observability
```
## Common LogQL Queries (Loki)
### Basic Queries
```logql
# All logs from observability namespace
{namespace="observability"}
# Logs from specific app
{namespace="observability", app="prometheus"}
# Filter by log level
{namespace="default"} |= "error"
{namespace="default"} | json | level="error"
# Exclude certain logs
{namespace="default"} != "health check"
# Multiple filters
{namespace="default"} |= "error" != "ignore"
```
### Advanced Queries
```logql
# Rate of errors
rate({namespace="default"} |= "error" [5m])
# Count logs by level
sum by (level) (count_over_time({namespace="default"} | json [5m]))
# Top 10 error messages
topk(10, sum by (message) (
  count_over_time({namespace="default"} | json | level="error" [5m])
))
```
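The `| json` queries above assume each log line is a single JSON object with a `level` field; a minimal Python producer that satisfies that contract (the field names are just a convention) looks like:
```python
# Emit one JSON object per line so Loki's `| json` parser can extract fields.
import json
import logging
import sys

class JsonLineFormatter(logging.Formatter):
    def format(self, record):
        return json.dumps({"level": record.levelname.lower(),
                           "message": record.getMessage()})

handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(JsonLineFormatter())
root = logging.getLogger()
root.addHandler(handler)
root.setLevel(logging.INFO)

root.error("payment failed")  # -> {"level": "error", "message": "payment failed"}
```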
## Common PromQL Queries (Prometheus)
### Cluster Health
```promql
# All targets up/down
up
# Pods by phase
kube_pod_status_phase{namespace="observability"}
# Node memory available
node_memory_MemAvailable_bytes
# Node CPU usage
rate(node_cpu_seconds_total{mode="user"}[5m])
```
### Container Metrics
```promql
# CPU usage by container
rate(container_cpu_usage_seconds_total[5m])
# Memory usage by container
container_memory_usage_bytes
# Network traffic
rate(container_network_transmit_bytes_total[5m])
rate(container_network_receive_bytes_total[5m])
```
### Application Metrics
```promql
# HTTP request rate
rate(http_requests_total[5m])
# Request duration
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Error rate
rate(http_requests_total{status=~"5.."}[5m])
```
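The application-metric queries above assume your service exports a counter named `http_requests_total` and a histogram named `http_request_duration_seconds`; a minimal sketch with the `prometheus_client` library (the metric and label names are illustrative) would be:
```python
# Minimal sketch: export the metrics the PromQL examples above query.
import random
import time
from prometheus_client import Counter, Histogram, start_http_server

REQUESTS = Counter("http_requests", "HTTP requests", ["status"])  # exposed as http_requests_total
LATENCY = Histogram("http_request_duration_seconds", "Request latency in seconds")

start_http_server(8080)  # serves /metrics on port 8080

while True:
    with LATENCY.time():
        time.sleep(random.uniform(0.01, 0.2))  # simulated request handling
    REQUESTS.labels(status="200").inc()
```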
## Trace Search (Tempo)
In Grafana Explore with Tempo datasource:
- **Search by service**: Select from dropdown
- **Search by duration**: "> 1s", "< 100ms"
- **Search by tag**: `http.status_code=500`
- **TraceQL**: `{span.http.method="POST" && span.http.status_code>=400}`
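Tag search only matches attributes your spans actually carry, so set them when creating spans; a short sketch (attribute names follow the OpenTelemetry HTTP semantic conventions):
```python
# Set span attributes so TraceQL queries like {span.http.status_code>=400} can match.
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("handle_request") as span:
    span.set_attribute("http.method", "POST")
    span.set_attribute("http.status_code", 500)
```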
## Correlations
### From Logs to Traces
1. View logs in Loki
2. Click on a log line with a trace ID
3. Click the "Tempo" link
4. Trace opens in Tempo
### From Traces to Logs
1. View trace in Tempo
2. Click on a span
3. Click "Logs for this span"
4. Related logs appear
### From Traces to Metrics
1. View trace in Tempo
2. Service graph shows metrics
3. Click service to see metrics
## Demo Application
Deploy the demo app to test the stack:
```bash
kubectl apply -f demo-app.yaml
# Wait for it to start
kubectl wait --for=condition=ready pod -l app=demo-app -n observability --timeout=300s
# Test it
kubectl port-forward -n observability svc/demo-app 8080:8080
# In another terminal
curl http://localhost:8080/
curl http://localhost:8080/items
curl http://localhost:8080/item/0
curl http://localhost:8080/slow
curl http://localhost:8080/error
```
Now view in Grafana:
- **Logs**: Search `{app="demo-app"}` in Loki
- **Traces**: Search "demo-app" service in Tempo
- **Metrics**: Query `flask_http_request_total` in Prometheus
## Storage Management
### Check Disk Usage
```bash
# On hetzner-2 node
df -h /mnt/local-ssd/
# Detailed usage
du -sh /mnt/local-ssd/*
```
### Cleanup Old Data
Data is automatically deleted after 7 days. To manually adjust retention:
**Prometheus** (in 03-prometheus-config.yaml):
```yaml
args:
- '--storage.tsdb.retention.time=7d'
```
**Loki** (in 04-loki-config.yaml):
```yaml
limits_config:
retention_period: 168h # 7 days
```
**Tempo** (in 05-tempo-config.yaml):
```yaml
compactor:
compaction:
block_retention: 168h # 7 days
```
## Troubleshooting
### No Logs Appearing
```bash
# Check Alloy is running
kubectl get pods -n observability -l app=alloy
# Check Alloy logs
kubectl logs -n observability -l app=alloy
# Check Loki
kubectl logs -n observability -l app=loki
# Test Loki endpoint
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \
curl http://loki.observability.svc.cluster.local:3100/ready
```
### No Traces Appearing
```bash
# Check Tempo is running
kubectl get pods -n observability -l app=tempo
# Check Tempo logs
kubectl logs -n observability -l app=tempo
# Test Tempo endpoint
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \
curl http://tempo.observability.svc.cluster.local:3200/ready
# Verify your app sends to correct endpoint
# Should be: tempo.observability.svc.cluster.local:4317 (gRPC)
# or: tempo.observability.svc.cluster.local:4318 (HTTP)
```
### Grafana Can't Connect to Datasources
```bash
# Check all services are running
kubectl get svc -n observability
# Test from Grafana pod
kubectl exec -it -n observability grafana-0 -- \
wget -O- http://prometheus.observability.svc.cluster.local:9090/-/healthy
kubectl exec -it -n observability grafana-0 -- \
wget -O- http://loki.observability.svc.cluster.local:3100/ready
kubectl exec -it -n observability grafana-0 -- \
wget -O- http://tempo.observability.svc.cluster.local:3200/ready
```
### High Resource Usage
```bash
# Check resource usage
kubectl top pods -n observability
kubectl top nodes
# Scale down if needed (for testing)
kubectl scale statefulset/prometheus -n observability --replicas=0
kubectl scale statefulset/loki -n observability --replicas=0
```
## Backup and Restore
### Backup Grafana Dashboards
```bash
# Export all dashboards via API
kubectl port-forward -n observability svc/grafana 3000:3000
# In another terminal
curl -H "Authorization: Bearer <API_KEY>" \
http://localhost:3000/api/search?type=dash-db | jq
```
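The search call above only lists dashboards; to actually save their JSON, iterate over the results against the Grafana HTTP API. A sketch assuming the port-forward is active and an API token is available in the `GRAFANA_TOKEN` environment variable:
```python
# Fetch each dashboard's JSON via the Grafana API and write it to disk.
import json
import os
import requests

BASE = "http://localhost:3000"  # via the port-forward above
HEADERS = {"Authorization": f"Bearer {os.environ['GRAFANA_TOKEN']}"}

for hit in requests.get(f"{BASE}/api/search", params={"type": "dash-db"}, headers=HEADERS).json():
    dash = requests.get(f"{BASE}/api/dashboards/uid/{hit['uid']}", headers=HEADERS).json()
    with open(f"{hit['uid']}.json", "w") as f:
        json.dump(dash, f, indent=2)
```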
### Backup Configurations
```bash
# Backup all ConfigMaps
kubectl get configmap -n observability -o yaml > configmaps-backup.yaml
# Backup specific config
kubectl get configmap prometheus-config -n observability -o yaml > prometheus-config-backup.yaml
```
## Useful Dashboards in Grafana
After login, import these dashboard IDs:
- **315**: Kubernetes cluster monitoring
- **7249**: Kubernetes cluster
- **13639**: Loki dashboard
- **12611**: Tempo dashboard
- **3662**: Prometheus 2.0 stats
- **1860**: Node Exporter Full
Go to: Dashboards → Import → Enter ID → Load
## Performance Tuning
### For Higher Load
Increase resources in respective YAML files:
```yaml
resources:
requests:
cpu: 1000m # from 500m
memory: 4Gi # from 2Gi
limits:
cpu: 4000m # from 2000m
memory: 8Gi # from 4Gi
```
### For Lower Resource Usage
- Increase the scrape interval in the Prometheus config (scrape less frequently)
- Reduce log retention periods
- Reduce trace sampling rate
## Security Checklist
- [ ] Change Grafana admin password
- [ ] Review RBAC permissions
- [ ] Enable audit logging
- [ ] Consider adding NetworkPolicies
- [ ] Review ingress TLS configuration
- [ ] Backup configurations regularly
## Getting Help
1. Check component logs first
2. Review configurations
3. Test network connectivity
4. Check resource availability
5. Review Grafana datasource settings

View File

@ -0,0 +1,385 @@
# State-of-the-Art Observability Stack for Kubernetes
This deployment provides a comprehensive, production-ready observability solution using the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir/Prometheus) with unified collection through Grafana Alloy.
## Architecture Overview
### Core Components
1. **Grafana** (v11.4.0) - Unified visualization platform
- Pre-configured datasources for Prometheus, Loki, and Tempo
- Automatic correlation between logs, metrics, and traces
- Modern UI with TraceQL editor support
2. **Prometheus** (v2.54.1) - Metrics collection and storage
- 7-day retention
- Comprehensive Kubernetes service discovery
- Scrapes: API server, nodes, cadvisor, pods, services
3. **Grafana Loki** (v3.2.1) - Log aggregation
- 7-day retention with compaction
- TSDB index for efficient queries
- Automatic correlation with traces
4. **Grafana Tempo** (v2.6.1) - Distributed tracing
- 7-day retention
- Multiple protocol support: OTLP, Jaeger, Zipkin
- Metrics generation from traces
- Automatic correlation with logs and metrics
5. **Grafana Alloy** (v1.5.1) - Unified observability agent
- Replaces Promtail, Vector, Fluent Bit
- Collects logs from all pods
- OTLP receiver for traces
- Runs as DaemonSet on all nodes
6. **kube-state-metrics** (v2.13.0) - Kubernetes object metrics
- Deployment, Pod, Service, Node metrics
- Essential for cluster monitoring
7. **node-exporter** (v1.8.2) - Node-level system metrics
- CPU, memory, disk, network metrics
- Runs on all nodes via DaemonSet
## Key Features
- **Unified Observability**: Logs, metrics, and traces in one platform
- **Automatic Correlation**: Click from logs to traces to metrics seamlessly
- **7-Day Retention**: Optimized for a single-node deployment
- **Local SSD Storage**: Fast, persistent storage on hetzner-2 node
- **OTLP Support**: Modern OpenTelemetry protocol support
- **TLS Enabled**: Secure access via NGINX Ingress with Let's Encrypt
- **Low Resource Footprint**: Optimized for single-node deployment
## Storage Layout
All data stored on local SSD at `/mnt/local-ssd/`:
```
/mnt/local-ssd/
├── prometheus/ (50Gi) - Metrics data
├── loki/ (100Gi) - Log data
├── tempo/ (50Gi) - Trace data
└── grafana/ (10Gi) - Dashboards and settings
```
## Deployment Instructions
### Prerequisites
1. Kubernetes cluster with NGINX Ingress Controller
2. cert-manager installed with Let's Encrypt issuer
3. DNS record: `grafana.betelgeusebytes.io` → your cluster IP
4. Node labeled: `kubernetes.io/hostname=hetzner-2`
### Step 0: Remove Existing Monitoring (If Applicable)
If you have an existing monitoring stack (Prometheus, Grafana, Loki, Fluent Bit, etc.), remove it first to avoid conflicts:
```bash
./remove-old-monitoring.sh
```
This interactive script will help you safely remove:
- Existing Prometheus/Grafana/Loki/Tempo deployments
- Helm releases for monitoring components
- Fluent Bit, Vector, or other log collectors
- Related ConfigMaps, PVCs, and RBAC resources
- Prometheus Operator CRDs (if applicable)
**Note**: The main deployment script (`deploy.sh`) will also prompt you to run cleanup if needed.
### Step 1: Prepare Storage Directories
SSH into the hetzner-2 node and create directories:
```bash
sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana}
sudo chown -R 65534:65534 /mnt/local-ssd/prometheus
sudo chown -R 10001:10001 /mnt/local-ssd/loki
sudo chown -R root:root /mnt/local-ssd/tempo
sudo chown -R 472:472 /mnt/local-ssd/grafana
```
### Step 2: Deploy the Stack
```bash
chmod +x deploy.sh
./deploy.sh
```
Or deploy manually:
```bash
kubectl apply -f 00-namespace.yaml
kubectl apply -f 01-persistent-volumes.yaml
kubectl apply -f 02-persistent-volume-claims.yaml
kubectl apply -f 03-prometheus-config.yaml
kubectl apply -f 04-loki-config.yaml
kubectl apply -f 05-tempo-config.yaml
kubectl apply -f 06-alloy-config.yaml
kubectl apply -f 07-grafana-datasources.yaml
kubectl apply -f 08-rbac.yaml
kubectl apply -f 10-prometheus.yaml
kubectl apply -f 11-loki.yaml
kubectl apply -f 12-tempo.yaml
kubectl apply -f 13-grafana.yaml
kubectl apply -f 14-alloy.yaml
kubectl apply -f 15-kube-state-metrics.yaml
kubectl apply -f 16-node-exporter.yaml
kubectl apply -f 20-grafana-ingress.yaml
```
### Step 3: Verify Deployment
```bash
kubectl get pods -n observability
kubectl get pv
kubectl get pvc -n observability
```
All pods should be in `Running` state:
- grafana-0
- loki-0
- prometheus-0
- tempo-0
- alloy-xxxxx (one per node)
- kube-state-metrics-xxxxx
- node-exporter-xxxxx (one per node)
### Step 4: Access Grafana
1. Open: https://grafana.betelgeusebytes.io
2. Login with default credentials:
- Username: `admin`
- Password: `admin`
3. **IMPORTANT**: Change the password on first login!
## Using the Stack
### Exploring Logs (Loki)
1. In Grafana, go to **Explore**
2. Select **Loki** datasource
3. Example queries:
```
{namespace="observability"}
{namespace="observability", app="prometheus"}
{namespace="default"} |= "error"
{pod="my-app-xxx"} | json | level="error"
```
### Exploring Metrics (Prometheus)
1. In Grafana, go to **Explore**
2. Select **Prometheus** datasource
3. Example queries:
```
up
node_memory_MemAvailable_bytes
rate(container_cpu_usage_seconds_total[5m])
kube_pod_status_phase{namespace="observability"}
```
### Exploring Traces (Tempo)
1. In Grafana, go to **Explore**
2. Select **Tempo** datasource
3. Search by:
- Service name
- Duration
- Tags
4. Click on a trace to see detailed span timeline
### Correlations
The stack automatically correlates:
- **Logs → Traces**: Click traceID in logs to view trace
- **Traces → Logs**: Click on trace to see related logs
- **Traces → Metrics**: Tempo generates metrics from traces
### Instrumenting Your Applications
#### For Logs
Logs are automatically collected from all pods by Alloy. Emit structured JSON logs:
```json
{"level":"info","message":"Request processed","duration_ms":42}
```
#### For Traces
Send traces to Tempo using OTLP:
```python
# Python with OpenTelemetry
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
provider = TracerProvider()
provider.add_span_processor(
BatchSpanProcessor(
OTLPSpanExporter(endpoint="http://tempo.observability.svc.cluster.local:4317")
)
)
trace.set_tracer_provider(provider)
```
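Once the provider is registered, spans are created through a tracer; nested `start_as_current_span` calls become parent/child spans in the trace timeline, for example:
```python
# Nested spans: "handle_request" becomes the parent of "query_db" in Tempo.
tracer = trace.get_tracer("my-service")

with tracer.start_as_current_span("handle_request"):
    with tracer.start_as_current_span("query_db") as child:
        child.set_attribute("db.system", "postgresql")
```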
#### For Metrics
Expose metrics in Prometheus format and add annotations to your pod:
```yaml
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
```
## Monitoring Endpoints
Internal service endpoints:
- **Prometheus**: `http://prometheus.observability.svc.cluster.local:9090`
- **Loki**: `http://loki.observability.svc.cluster.local:3100`
- **Tempo**:
- HTTP: `http://tempo.observability.svc.cluster.local:3200`
- OTLP gRPC: `tempo.observability.svc.cluster.local:4317`
- OTLP HTTP: `tempo.observability.svc.cluster.local:4318`
- **Grafana**: `http://grafana.observability.svc.cluster.local:3000`
## Troubleshooting
### Check Pod Status
```bash
kubectl get pods -n observability
kubectl describe pod <pod-name> -n observability
```
### View Logs
```bash
kubectl logs -n observability -l app=grafana
kubectl logs -n observability -l app=prometheus
kubectl logs -n observability -l app=loki
kubectl logs -n observability -l app=tempo
kubectl logs -n observability -l app=alloy
```
### Check Storage
```bash
kubectl get pv
kubectl get pvc -n observability
```
### Test Connectivity
```bash
# From inside cluster
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \
curl http://prometheus.observability.svc.cluster.local:9090/-/healthy
```
### Common Issues
**Pods stuck in Pending**
- Check if storage directories exist on hetzner-2
- Verify PV/PVC bindings: `kubectl describe pvc -n observability`
**Loki won't start**
- Check permissions on `/mnt/local-ssd/loki` (should be 10001:10001)
- View logs: `kubectl logs -n observability loki-0`
**No logs appearing**
- Check Alloy pods are running: `kubectl get pods -n observability -l app=alloy`
- View Alloy logs: `kubectl logs -n observability -l app=alloy`
**Grafana can't reach datasources**
- Verify services: `kubectl get svc -n observability`
- Check datasource URLs in Grafana UI
## Updating Configuration
### Update Prometheus Scrape Config
```bash
kubectl edit configmap prometheus-config -n observability
kubectl rollout restart statefulset/prometheus -n observability
```
### Update Loki Retention
```bash
kubectl edit configmap loki-config -n observability
kubectl rollout restart statefulset/loki -n observability
```
### Update Alloy Collection Rules
```bash
kubectl edit configmap alloy-config -n observability
kubectl rollout restart daemonset/alloy -n observability
```
## Resource Usage
Expected resource consumption:
| Component | CPU Request | CPU Limit | Memory Request | Memory Limit |
|-----------|-------------|-----------|----------------|--------------|
| Prometheus | 500m | 2000m | 2Gi | 4Gi |
| Loki | 500m | 2000m | 1Gi | 2Gi |
| Tempo | 500m | 2000m | 1Gi | 2Gi |
| Grafana | 250m | 1000m | 512Mi | 1Gi |
| Alloy (per node) | 100m | 500m | 256Mi | 512Mi |
| kube-state-metrics | 100m | 200m | 128Mi | 256Mi |
| node-exporter (per node) | 100m | 200m | 128Mi | 256Mi |
**Total (single node)**: ~2.1 CPU cores and ~5Gi memory requested; up to ~7.9 cores and ~10Gi at the configured limits
## Security Considerations
1. **Change default Grafana password** immediately after deployment
2. Consider adding authentication for internal services if exposed
3. Review and restrict RBAC permissions as needed
4. Enable audit logging in Loki for sensitive namespaces
5. Consider adding NetworkPolicies to restrict traffic
## Documentation
This deployment includes comprehensive guides:
- **README.md**: Complete deployment and configuration guide (this file)
- **MONITORING-GUIDE.md**: URLs, access, and how to monitor new applications
- **DEPLOYMENT-CHECKLIST.md**: Step-by-step deployment checklist
- **QUICKREF.md**: Quick reference for daily operations
- **demo-app.yaml**: Example fully instrumented application
- **deploy.sh**: Automated deployment script
- **status.sh**: Health check script
- **cleanup.sh**: Complete stack removal
- **remove-old-monitoring.sh**: Remove existing monitoring before deployment
- **21-optional-ingresses.yaml**: Optional external access to Prometheus/Loki/Tempo
## Future Enhancements
- Add Alertmanager for alerting
- Configure Grafana SMTP for email notifications
- Add custom dashboards for your applications
- Implement Grafana RBAC for team access
- Consider Mimir for long-term metrics storage
- Add backup/restore procedures
## Support
For issues or questions:
1. Check pod logs first
2. Review Grafana datasource configuration
3. Verify network connectivity between components
4. Check storage and resource availability
## Version Information
- Grafana: 11.4.0
- Prometheus: 2.54.1
- Loki: 3.2.1
- Tempo: 2.6.1
- Alloy: 1.5.1
- kube-state-metrics: 2.13.0
- node-exporter: 1.8.2
Last updated: January 2025

View File

@ -0,0 +1,62 @@
#!/bin/bash
set -e
echo "=================================================="
echo "Removing Observability Stack from Kubernetes"
echo "=================================================="
echo ""
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${RED}WARNING: This will delete all observability data!${NC}"
echo ""
read -p "Are you sure you want to continue? (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
echo "Cleanup cancelled."
exit 0
fi
echo -e "${YELLOW}Removing Ingress...${NC}"
kubectl delete -f 20-grafana-ingress.yaml --ignore-not-found
echo -e "${YELLOW}Removing Deployments and DaemonSets...${NC}"
kubectl delete -f 16-node-exporter.yaml --ignore-not-found
kubectl delete -f 15-kube-state-metrics.yaml --ignore-not-found
kubectl delete -f 14-alloy.yaml --ignore-not-found
kubectl delete -f 13-grafana.yaml --ignore-not-found
kubectl delete -f 12-tempo.yaml --ignore-not-found
kubectl delete -f 11-loki.yaml --ignore-not-found
kubectl delete -f 10-prometheus.yaml --ignore-not-found
echo -e "${YELLOW}Removing RBAC...${NC}"
kubectl delete -f 08-rbac.yaml --ignore-not-found
echo -e "${YELLOW}Removing ConfigMaps...${NC}"
kubectl delete -f 07-grafana-datasources.yaml --ignore-not-found
kubectl delete -f 06-alloy-config.yaml --ignore-not-found
kubectl delete -f 05-tempo-config.yaml --ignore-not-found
kubectl delete -f 04-loki-config.yaml --ignore-not-found
kubectl delete -f 03-prometheus-config.yaml --ignore-not-found
echo -e "${YELLOW}Removing PVCs...${NC}"
kubectl delete -f 02-persistent-volume-claims.yaml --ignore-not-found
echo -e "${YELLOW}Removing PVs...${NC}"
kubectl delete -f 01-persistent-volumes.yaml --ignore-not-found
echo -e "${YELLOW}Removing Namespace...${NC}"
kubectl delete -f 00-namespace.yaml --ignore-not-found
echo ""
echo -e "${RED}=================================================="
echo "Cleanup Complete!"
echo "==================================================${NC}"
echo ""
echo "Data directories on hetzner-2 node are preserved."
echo "To remove them, run on the node:"
echo " sudo rm -rf /mnt/local-ssd/{prometheus,loki,tempo,grafana}"
echo ""

View File

@ -0,0 +1,253 @@
---
# Example instrumented application to test the observability stack
# This is a simple Python Flask app with OpenTelemetry instrumentation
apiVersion: v1
kind: ConfigMap
metadata:
name: demo-app
namespace: observability
data:
app.py: |
from flask import Flask, jsonify
import logging
import json
import time
import random
# OpenTelemetry imports
from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource
from prometheus_flask_exporter import PrometheusMetrics
# Configure structured logging
logging.basicConfig(
level=logging.INFO,
format='%(message)s'
)
class JSONFormatter(logging.Formatter):
def format(self, record):
log_obj = {
'timestamp': self.formatTime(record, self.datefmt),
'level': record.levelname,
'message': record.getMessage(),
'logger': record.name,
}
            if hasattr(record, 'trace_id'):
                log_obj['trace_id'] = record.trace_id
            if hasattr(record, 'span_id'):
                log_obj['span_id'] = record.span_id
return json.dumps(log_obj)
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
# Configure OpenTelemetry
resource = Resource.create({"service.name": "demo-app"})
# Tracing
trace_provider = TracerProvider(resource=resource)
trace_provider.add_span_processor(
BatchSpanProcessor(
OTLPSpanExporter(
endpoint="http://tempo.observability.svc.cluster.local:4317",
insecure=True
)
)
)
trace.set_tracer_provider(trace_provider)
tracer = trace.get_tracer(__name__)
# Create Flask app
app = Flask(__name__)
# Prometheus metrics
metrics = PrometheusMetrics(app)
# Auto-instrument Flask
FlaskInstrumentor().instrument_app(app)
# Sample data
ITEMS = ["apple", "banana", "orange", "grape", "mango"]
@app.route('/')
def index():
span = trace.get_current_span()
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info("Index page accessed", extra={
'trace_id': trace_id,
'endpoint': '/'
})
return jsonify({
'service': 'demo-app',
'status': 'healthy',
'trace_id': trace_id
})
@app.route('/items')
def get_items():
with tracer.start_as_current_span("fetch_items") as span:
# Simulate database query
time.sleep(random.uniform(0.01, 0.1))
span.set_attribute("items.count", len(ITEMS))
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info("Items fetched", extra={
'trace_id': trace_id,
'count': len(ITEMS)
})
return jsonify({
'items': ITEMS,
'count': len(ITEMS),
'trace_id': trace_id
})
@app.route('/item/<int:item_id>')
def get_item(item_id):
with tracer.start_as_current_span("fetch_item") as span:
span.set_attribute("item.id", item_id)
trace_id = format(span.get_span_context().trace_id, '032x')
# Simulate processing
time.sleep(random.uniform(0.01, 0.05))
if item_id < 0 or item_id >= len(ITEMS):
logger.warning("Item not found", extra={
'trace_id': trace_id,
'item_id': item_id
})
return jsonify({'error': 'Item not found', 'trace_id': trace_id}), 404
item = ITEMS[item_id]
logger.info("Item fetched", extra={
'trace_id': trace_id,
'item_id': item_id,
'item': item
})
return jsonify({
'id': item_id,
'name': item,
'trace_id': trace_id
})
@app.route('/slow')
def slow_endpoint():
with tracer.start_as_current_span("slow_operation") as span:
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info("Slow operation started", extra={'trace_id': trace_id})
# Simulate slow operation
time.sleep(random.uniform(1, 3))
logger.info("Slow operation completed", extra={'trace_id': trace_id})
return jsonify({
'message': 'Operation completed',
'trace_id': trace_id
})
@app.route('/error')
def error_endpoint():
with tracer.start_as_current_span("error_operation") as span:
trace_id = format(span.get_span_context().trace_id, '032x')
logger.error("Intentional error triggered", extra={'trace_id': trace_id})
span.set_attribute("error", True)
return jsonify({
'error': 'This is an intentional error',
'trace_id': trace_id
}), 500
if __name__ == '__main__':
app.run(host='0.0.0.0', port=8080)
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: demo-app
namespace: observability
labels:
app: demo-app
spec:
replicas: 1
selector:
matchLabels:
app: demo-app
template:
metadata:
labels:
app: demo-app
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
containers:
- name: demo-app
image: python:3.11-slim
command:
- /bin/bash
- -c
- |
pip install flask opentelemetry-api opentelemetry-sdk \
opentelemetry-instrumentation-flask \
opentelemetry-exporter-otlp-proto-grpc \
prometheus-flask-exporter && \
python /app/app.py
ports:
- name: http
containerPort: 8080
volumeMounts:
- name: app-code
mountPath: /app
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: app-code
configMap:
name: demo-app
---
apiVersion: v1
kind: Service
metadata:
name: demo-app
namespace: observability
labels:
app: demo-app
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: http
protocol: TCP
name: http
selector:
app: demo-app

114
k8s/observability-stack/deploy.sh Executable file
View File

@ -0,0 +1,114 @@
#!/bin/bash
set -e
echo "=================================================="
echo "Deploying Observability Stack to Kubernetes"
echo "=================================================="
echo ""
# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color
echo -e "${YELLOW}Pre-deployment Check: Existing Monitoring Stack${NC}"
echo ""
echo "If you have an existing monitoring/prometheus/grafana deployment,"
echo "you should remove it first to avoid conflicts."
echo ""
read -p "Do you want to run the cleanup script now? (yes/no): " run_cleanup
if [ "$run_cleanup" = "yes" ]; then
if [ -f "./remove-old-monitoring.sh" ]; then
echo "Running cleanup script..."
./remove-old-monitoring.sh
echo ""
echo "Cleanup complete. Continuing with deployment..."
echo ""
else
echo -e "${RED}Error: remove-old-monitoring.sh not found${NC}"
echo "Please run it manually before deploying."
exit 1
fi
fi
echo -e "${YELLOW}Step 1: Creating storage directories on node...${NC}"
echo "Please run this on the hetzner-2 node:"
echo " sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana}"
echo " sudo chown -R 65534:65534 /mnt/local-ssd/prometheus"
echo " sudo chown -R 10001:10001 /mnt/local-ssd/loki"
echo " sudo chown -R root:root /mnt/local-ssd/tempo"
echo " sudo chown -R 472:472 /mnt/local-ssd/grafana"
echo ""
read -p "Press Enter once directories are created..."
echo -e "${GREEN}Step 2: Creating namespace...${NC}"
kubectl apply -f 00-namespace.yaml
echo -e "${GREEN}Step 3: Creating PersistentVolumes...${NC}"
kubectl apply -f 01-persistent-volumes.yaml
echo -e "${GREEN}Step 4: Creating PersistentVolumeClaims...${NC}"
kubectl apply -f 02-persistent-volume-claims.yaml
echo -e "${GREEN}Step 5: Creating ConfigMaps...${NC}"
kubectl apply -f 03-prometheus-config.yaml
kubectl apply -f 04-loki-config.yaml
kubectl apply -f 05-tempo-config.yaml
kubectl apply -f 06-alloy-config.yaml
kubectl apply -f 07-grafana-datasources.yaml
echo -e "${GREEN}Step 6: Creating RBAC resources...${NC}"
kubectl apply -f 08-rbac.yaml
echo -e "${GREEN}Step 7: Deploying Prometheus...${NC}"
kubectl apply -f 10-prometheus.yaml
echo -e "${GREEN}Step 8: Deploying Loki...${NC}"
kubectl apply -f 11-loki.yaml
echo -e "${GREEN}Step 9: Deploying Tempo...${NC}"
kubectl apply -f 12-tempo.yaml
echo -e "${GREEN}Step 10: Deploying Grafana...${NC}"
kubectl apply -f 13-grafana.yaml
echo -e "${GREEN}Step 11: Deploying Grafana Alloy...${NC}"
kubectl apply -f 14-alloy.yaml
echo -e "${GREEN}Step 12: Deploying kube-state-metrics...${NC}"
kubectl apply -f 15-kube-state-metrics.yaml
echo -e "${GREEN}Step 13: Deploying node-exporter...${NC}"
kubectl apply -f 16-node-exporter.yaml
echo -e "${GREEN}Step 14: Creating Grafana Ingress...${NC}"
kubectl apply -f 20-grafana-ingress.yaml
echo ""
echo -e "${GREEN}=================================================="
echo "Deployment Complete!"
echo "==================================================${NC}"
echo ""
echo "Waiting for pods to be ready..."
kubectl wait --for=condition=ready pod -l app=prometheus -n observability --timeout=300s
kubectl wait --for=condition=ready pod -l app=loki -n observability --timeout=300s
kubectl wait --for=condition=ready pod -l app=tempo -n observability --timeout=300s
kubectl wait --for=condition=ready pod -l app=grafana -n observability --timeout=300s
echo ""
echo -e "${GREEN}All pods are ready!${NC}"
echo ""
echo "Access Grafana at: https://grafana.betelgeusebytes.io"
echo "Default credentials: admin / admin"
echo ""
echo "To check status:"
echo " kubectl get pods -n observability"
echo ""
echo "To view logs:"
echo " kubectl logs -n observability -l app=grafana"
echo " kubectl logs -n observability -l app=prometheus"
echo " kubectl logs -n observability -l app=loki"
echo " kubectl logs -n observability -l app=tempo"
echo ""

View File

@ -0,0 +1,319 @@
#!/bin/bash
set -e
echo "=========================================================="
echo "Removing Existing Monitoring Stack"
echo "=========================================================="
echo ""
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color
echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}"
echo " - Prometheus (standalone or operator)"
echo " - Grafana"
echo " - Fluent Bit"
echo " - Vector"
echo " - Loki"
echo " - Tempo"
echo " - Node exporters"
echo " - kube-state-metrics"
echo " - Any monitoring/prometheus/grafana namespaces"
echo ""
echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}"
echo ""
read -p "Are you sure you want to continue? (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
echo "Cleanup cancelled."
exit 0
fi
echo ""
echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}"
# Common namespace names for monitoring
NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging")
for ns in "${NAMESPACES[@]}"; do
if kubectl get namespace "$ns" &> /dev/null; then
echo -e "${GREEN}Found namespace: $ns${NC}"
# Show what's in the namespace
echo " Resources in $ns:"
kubectl get all -n "$ns" 2>/dev/null | head -20 || true
echo ""
read -p " Delete namespace '$ns'? (yes/no): " delete_ns
if [ "$delete_ns" = "yes" ]; then
echo " Deleting namespace $ns..."
kubectl delete namespace "$ns" --timeout=120s || {
echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}"
kubectl delete namespace "$ns" --grace-period=0 --force &
}
fi
fi
done
echo ""
echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}"
# Check if helm is available
if command -v helm &> /dev/null; then
echo "Checking for Helm releases..."
# Common Helm release names
RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack")
for release in "${RELEASES[@]}"; do
# Check all namespaces for the release
if helm list -A | grep -q "$release"; then
ns=$(helm list -A | grep "$release" | awk '{print $2}')
echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}"
read -p " Uninstall Helm release '$release'? (yes/no): " uninstall
if [ "$uninstall" = "yes" ]; then
echo " Uninstalling $release..."
helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}"
fi
fi
done
else
echo "Helm not found, skipping Helm releases check"
fi
echo ""
echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}"
# Remove common DaemonSets in kube-system or default
echo "Checking for monitoring DaemonSets..."
for ns in kube-system default; do
if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then
echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}"
kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector"
read -p " Delete these DaemonSets? (yes/no): " delete_ds
if [ "$delete_ds" = "yes" ]; then
kubectl delete daemonset -n "$ns" -l app=node-exporter --ignore-not-found
kubectl delete daemonset -n "$ns" -l app=fluent-bit --ignore-not-found
kubectl delete daemonset -n "$ns" -l app=fluentd --ignore-not-found
kubectl delete daemonset -n "$ns" -l app=vector --ignore-not-found
kubectl delete daemonset -n "$ns" node-exporter --ignore-not-found
kubectl delete daemonset -n "$ns" fluent-bit --ignore-not-found
kubectl delete daemonset -n "$ns" fluentd --ignore-not-found
kubectl delete daemonset -n "$ns" vector --ignore-not-found
fi
fi
done
# Remove common Deployments
echo ""
echo "Checking for monitoring Deployments..."
for ns in kube-system default; do
if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring Deployments in $ns${NC}"
kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo"
read -p " Delete these Deployments? (yes/no): " delete_deploy
if [ "$delete_deploy" = "yes" ]; then
kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found
kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found
kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found
kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found
kubectl delete deployment -n "$ns" prometheus --ignore-not-found
kubectl delete deployment -n "$ns" grafana --ignore-not-found
kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found
fi
fi
done
# Remove common StatefulSets
echo ""
echo "Checking for monitoring StatefulSets..."
for ns in kube-system default; do
if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}"
kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
read -p " Delete these StatefulSets? (yes/no): " delete_sts
if [ "$delete_sts" = "yes" ]; then
kubectl delete statefulset -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete statefulset -n "$ns" -l app=grafana --ignore-not-found
kubectl delete statefulset -n "$ns" -l app=loki --ignore-not-found
kubectl delete statefulset -n "$ns" -l app=tempo --ignore-not-found
kubectl delete statefulset -n "$ns" prometheus --ignore-not-found
kubectl delete statefulset -n "$ns" grafana --ignore-not-found
kubectl delete statefulset -n "$ns" loki --ignore-not-found
kubectl delete statefulset -n "$ns" tempo --ignore-not-found
fi
fi
done
echo ""
echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}"
# Ask before removing ConfigMaps (they might contain important configs)
echo "Checking for monitoring ConfigMaps..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then
echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}"
kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent"
read -p " Delete these ConfigMaps? (yes/no): " delete_cm
if [ "$delete_cm" = "yes" ]; then
kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found
kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found
kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}"
# Remove monitoring-related RBAC
echo "Checking for monitoring ClusterRoles..."
if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoles${NC}"
kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoles? (yes/no): " delete_cr
if [ "$delete_cr" = "yes" ]; then
kubectl delete clusterrole prometheus --ignore-not-found
kubectl delete clusterrole grafana --ignore-not-found
kubectl delete clusterrole kube-state-metrics --ignore-not-found
kubectl delete clusterrole fluent-bit --ignore-not-found
kubectl delete clusterrole node-exporter --ignore-not-found
fi
fi
echo "Checking for monitoring ClusterRoleBindings..."
if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}"
kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb
if [ "$delete_crb" = "yes" ]; then
kubectl delete clusterrolebinding prometheus --ignore-not-found
kubectl delete clusterrolebinding grafana --ignore-not-found
kubectl delete clusterrolebinding kube-state-metrics --ignore-not-found
kubectl delete clusterrolebinding fluent-bit --ignore-not-found
kubectl delete clusterrolebinding node-exporter --ignore-not-found
fi
fi
echo ""
echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}"
# Check for monitoring PVCs
echo "Checking for monitoring PersistentVolumeClaims..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring PVCs in $ns${NC}"
kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}"
read -p " Delete these PVCs? (yes/no): " delete_pvc
if [ "$delete_pvc" = "yes" ]; then
kubectl delete pvc -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete pvc -n "$ns" -l app=grafana --ignore-not-found
kubectl delete pvc -n "$ns" -l app=loki --ignore-not-found
kubectl delete pvc -n "$ns" -l app=tempo --ignore-not-found
# Also try by name patterns
kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true
fi
fi
fi
done
# Check for monitoring PVs
echo ""
echo "Checking for monitoring PersistentVolumes..."
if kubectl get pv 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|monitoring"; then
echo -e "${GREEN}Found monitoring PVs${NC}"
kubectl get pv | grep -E "prometheus|grafana|loki|tempo|monitoring"
echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}"
read -p " Delete these PVs? (yes/no): " delete_pv
if [ "$delete_pv" = "yes" ]; then
kubectl get pv -o name | grep -E "prometheus|grafana|loki|tempo|monitoring" | xargs -r kubectl delete || true
fi
fi
echo ""
echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}"
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then
echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}"
kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki"
read -p " Delete these Ingresses? (yes/no): " delete_ing
if [ "$delete_ing" = "yes" ]; then
kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found
kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found
kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}"
# Check for Prometheus Operator CRDs
if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then
echo -e "${GREEN}Found Prometheus Operator CRDs${NC}"
kubectl get crd | grep "monitoring.coreos.com"
echo ""
echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}"
read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd
if [ "$delete_crd" = "yes" ]; then
kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found
kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found
kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found
kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found
kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found
kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found
kubectl delete crd probes.monitoring.coreos.com --ignore-not-found
kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found
fi
fi
echo ""
echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}"
echo ""
echo "You may have monitoring data stored on your nodes at:"
echo " - /mnt/local-ssd/prometheus"
echo " - /mnt/local-ssd/grafana"
echo " - /mnt/local-ssd/loki"
echo " - /mnt/local-ssd/tempo"
echo " - /var/lib/prometheus"
echo " - /var/lib/grafana"
echo ""
echo "To remove these, SSH to each node and run:"
echo " sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}"
echo " sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}"
echo ""
read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs
echo ""
echo -e "${GREEN}=========================================================="
echo "Existing Monitoring Stack Cleanup Complete!"
echo "==========================================================${NC}"
echo ""
echo "Summary of actions taken:"
echo " - Removed monitoring namespaces (if confirmed)"
echo " - Uninstalled Helm releases (if found and confirmed)"
echo " - Removed standalone monitoring components"
echo " - Removed monitoring ConfigMaps"
echo " - Removed RBAC resources"
echo " - Removed PVCs and PVs (if confirmed)"
echo " - Removed Ingresses"
echo " - Removed Prometheus Operator CRDs (if confirmed)"
echo ""
echo -e "${YELLOW}Next Steps:${NC}"
echo "1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'"
echo "2. Clean up node data directories (see above)"
echo "3. Deploy new observability stack: ./deploy.sh"
echo ""

115
k8s/observability-stack/status.sh Executable file
View File

@ -0,0 +1,115 @@
#!/bin/bash
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}=================================================="
echo "Observability Stack Status Check"
echo "==================================================${NC}"
echo ""
# Check namespace
echo -e "${YELLOW}Checking namespace...${NC}"
if kubectl get namespace observability &> /dev/null; then
echo -e "${GREEN}✓ Namespace 'observability' exists${NC}"
else
echo -e "${RED}✗ Namespace 'observability' not found${NC}"
exit 1
fi
echo ""
# Check PVs
echo -e "${YELLOW}Checking PersistentVolumes...${NC}"
pvs=$(kubectl get pv 2>/dev/null | grep -E "(prometheus|loki|tempo|grafana)-data-pv" | wc -l)
if [ "$pvs" -eq 4 ]; then
echo -e "${GREEN}✓ All 4 PersistentVolumes found${NC}"
kubectl get pv | grep -E "(prometheus|loki|tempo|grafana)-data-pv"
else
echo -e "${RED}✗ Expected 4 PVs, found $pvs${NC}"
fi
echo ""
# Check PVCs
echo -e "${YELLOW}Checking PersistentVolumeClaims...${NC}"
pvcs=$(kubectl get pvc -n observability 2>/dev/null | grep -v NAME | wc -l)
if [ "$pvcs" -eq 4 ]; then
echo -e "${GREEN}✓ All 4 PersistentVolumeClaims found${NC}"
kubectl get pvc -n observability
else
echo -e "${RED}✗ Expected 4 PVCs, found $pvcs${NC}"
fi
echo ""
# Check Pods
echo -e "${YELLOW}Checking Pods...${NC}"
kubectl get pods -n observability -o wide
echo ""
# Count running pods
total_pods=$(kubectl get pods -n observability --no-headers 2>/dev/null | wc -l)
running_pods=$(kubectl get pods -n observability --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l)
if [ "$total_pods" -eq 0 ]; then
echo -e "${RED}✗ No pods found in observability namespace${NC}"
else
if [ "$running_pods" -eq "$total_pods" ]; then
echo -e "${GREEN}✓ All $total_pods pods are running${NC}"
else
echo -e "${YELLOW}$running_pods/$total_pods pods are running${NC}"
fi
fi
echo ""
# Check Services
echo -e "${YELLOW}Checking Services...${NC}"
kubectl get svc -n observability
echo ""
# Check Ingress
echo -e "${YELLOW}Checking Ingress...${NC}"
if kubectl get ingress -n observability grafana-ingress &> /dev/null; then
echo -e "${GREEN}✓ Grafana Ingress found${NC}"
kubectl get ingress -n observability grafana-ingress
else
echo -e "${RED}✗ Grafana Ingress not found${NC}"
fi
echo ""
# Check ConfigMaps
echo -e "${YELLOW}Checking ConfigMaps...${NC}"
configmaps=$(kubectl get configmap -n observability 2>/dev/null | grep -v NAME | wc -l)
echo "Found $configmaps ConfigMaps:"
kubectl get configmap -n observability --no-headers | awk '{print " - " $1}'
echo ""
# Test endpoints
echo -e "${YELLOW}Testing service endpoints...${NC}"
check_endpoint() {
local name=$1
local url=$2
if kubectl run -it --rm test-$RANDOM --image=curlimages/curl --restart=Never -- \
curl -s -o /dev/null -w "%{http_code}" --max-time 5 $url 2>/dev/null | grep -q "200\|302\|401"; then
echo -e "${GREEN}$name is responding${NC}"
else
echo -e "${RED}$name is not responding${NC}"
fi
}
check_endpoint "Prometheus" "http://prometheus.observability.svc.cluster.local:9090/-/healthy"
check_endpoint "Loki" "http://loki.observability.svc.cluster.local:3100/ready"
check_endpoint "Tempo" "http://tempo.observability.svc.cluster.local:3200/ready"
check_endpoint "Grafana" "http://grafana.observability.svc.cluster.local:3000/api/health"
echo ""
echo -e "${BLUE}=================================================="
echo "Status Check Complete"
echo "==================================================${NC}"
echo ""
echo "Access Grafana at: https://grafana.betelgeusebytes.io"
echo "Default credentials: admin / admin"
echo ""

View File

@ -0,0 +1,46 @@
apiVersion: v1
kind: ServiceAccount
metadata: { name: fluent-bit, namespace: observability }
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata: { name: fluent-bit-read }
rules:
- apiGroups: [""]
resources: ["pods", "namespaces"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata: { name: fluent-bit-read }
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: fluent-bit-read
subjects:
- kind: ServiceAccount
name: fluent-bit
namespace: observability
---
apiVersion: apps/v1
kind: DaemonSet
metadata: { name: fluent-bit, namespace: observability }
spec:
selector: { matchLabels: { app: fluent-bit } }
template:
metadata: { labels: { app: fluent-bit } }
spec:
serviceAccountName: fluent-bit
containers:
- name: fluent-bit
image: cr.fluentbit.io/fluent/fluent-bit:2.2.2
volumeMounts:
- { name: varlog, mountPath: /var/log }
- { name: containers, mountPath: /var/lib/docker/containers, readOnly: true }
env:
- { name: FLUENT_ELASTICSEARCH_HOST, value: elasticsearch.elastic.svc.cluster.local }
- { name: FLUENT_ELASTICSEARCH_PORT, value: "9200" }
args: ["-i","tail","-p","path=/var/log/containers/*.log","-F","kubernetes","-o","es","-p","host=${FLUENT_ELASTICSEARCH_HOST}","-p","port=${FLUENT_ELASTICSEARCH_PORT}","-p","logstash_format=On","-p","logstash_prefix=k8s-logs"]
volumes:
- { name: varlog, hostPath: { path: /var/log } }
- { name: containers, hostPath: { path: /var/lib/docker/containers, type: DirectoryOrCreate } }

View File

@ -0,0 +1,73 @@
apiVersion: v1
kind: Service
metadata: { name: otel-collector, namespace: observability }
spec:
selector: { app: otel-collector }
ports:
- { name: otlp-http, port: 4318, targetPort: 4318 }
- { name: otlp-grpc, port: 4317, targetPort: 4317 }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: otel-collector, namespace: observability }
spec:
replicas: 2
selector: { matchLabels: { app: otel-collector } }
template:
metadata: { labels: { app: otel-collector } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:0.102.0
args: ["--config=/etc/otel/config.yaml"]
ports:
- { containerPort: 4318 }
- { containerPort: 4317 }
volumeMounts:
- { name: cfg, mountPath: /etc/otel }
volumes:
- { name: cfg, configMap: { name: otel-config } }
---
apiVersion: v1
kind: ConfigMap
metadata: { name: otel-config, namespace: observability }
data:
config.yaml: |
receivers:
otlp:
protocols: { http: {}, grpc: {} }
processors: { batch: {} }
exporters:
logging: {}
elasticsearch:
endpoints: ["http://elasticsearch.elastic.svc.cluster.local:9200"]
logs_index: "k8s-logs"
service:
pipelines:
logs: { receivers: [otlp], processors: [batch], exporters: [elasticsearch, logging] }
traces: { receivers: [otlp], processors: [batch], exporters: [logging] }
metrics: { receivers: [otlp], processors: [batch], exporters: [logging] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: otlp
namespace: observability
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["otlp.betelgeusebytes.io"], secretName: otlp-tls }]
rules:
- host: otlp.betelgeusebytes.io
http:
paths:
- path: /v1/traces
pathType: Prefix
backend: { service: { name: otel-collector, port: { number: 4318 } } }
- path: /v1/metrics
pathType: Prefix
backend: { service: { name: otel-collector, port: { number: 4318 } } }
- path: /v1/logs
pathType: Prefix
backend: { service: { name: otel-collector, port: { number: 4318 } } }

BIN
k8s/postgres/.DS_Store vendored Normal file

Binary file not shown.

217
k8s/postgres/pg.yaml Normal file
View File

@ -0,0 +1,217 @@
# k8s/postgres/pg-init-sql-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: pg-init-sql
namespace: db
data:
00_extensions.sql: |
\connect gitea
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS postgis_topology;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
DO $$ BEGIN
CREATE EXTENSION IF NOT EXISTS plpython3u;
EXCEPTION WHEN undefined_file THEN
RAISE NOTICE 'plpython3u not available in this image';
END $$;
01_tune.sql: |
ALTER SYSTEM SET shared_buffers = '1GB';
ALTER SYSTEM SET work_mem = '32MB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET max_connections = 200;
SELECT pg_reload_conf();
---
# k8s/postgres/pg-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: pg-conf
namespace: db
data:
pg_hba.conf: |
# Local connections
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# TLS-only access from ANY external IP (harden as needed)
hostssl all all 0.0.0.0/0 md5
hostssl all all ::/0 md5
---
# k8s/postgres/pg-secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: pg18-secret
namespace: db
type: Opaque
stringData:
POSTGRES_PASSWORD: "pa$$word"
---
# k8s/postgres/pg-certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: pg-tls
namespace: db
spec:
secretName: pg-tls
dnsNames:
- pg.betelgeusebytes.io
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
---
# k8s/postgres/postgres-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: postgres
namespace: db
spec:
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
apiVersion: v1
kind: Service
metadata:
name: postgres-hl
namespace: db
spec:
clusterIP: None
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
# k8s/postgres/postgres.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
namespace: db
spec:
serviceName: postgres-hl
replicas: 1
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
securityContext:
runAsUser: 999
runAsGroup: 999
fsGroup: 999
fsGroupChangePolicy: "Always"
initContainers:
- name: install-certs
image: busybox:1.36
command:
- sh
- -c
- |
cp /in/tls.crt /out/server.crt
cp /in/tls.key /out/server.key
chown 999:999 /out/* || true
chmod 600 /out/server.key
securityContext:
runAsUser: 0
volumeMounts:
- { name: pg-tls, mountPath: /in, readOnly: true }
- { name: pg-certs, mountPath: /out }
containers:
- name: postgres
image: axxs/postgres:18-postgis-vector
imagePullPolicy: IfNotPresent
args:
- -c
- ssl=on
- -c
- ssl_cert_file=/certs/server.crt
- -c
- ssl_key_file=/certs/server.key
- -c
- hba_file=/etc/postgresql-custom/pg_hba.conf
env:
- name: POSTGRES_USER
value: "app"
- name: POSTGRES_DB
value: "gitea"
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: pg18-secret
key: POSTGRES_PASSWORD
- name: TZ
value: "Europe/Paris"
ports:
- name: postgres
containerPort: 5432
volumeMounts:
- { name: data, mountPath: /var/lib/postgresql } # PG18 expects parent, creates /var/lib/postgresql/18/main
- { name: init, mountPath: /docker-entrypoint-initdb.d, readOnly: true }
- { name: pg-certs, mountPath: /certs }
- { name: pg-conf, mountPath: /etc/postgresql-custom }
readinessProbe:
exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] }
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
livenessProbe:
exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] }
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
resources:
requests: { cpu: "250m", memory: "512Mi" }
limits: { cpu: "1", memory: "2Gi" }
volumes:
- name: init
configMap:
name: pg-init-sql
defaultMode: 0444
- name: pg-tls
secret:
secretName: pg-tls
- name: pg-certs
emptyDir: {}
- name: pg-conf
configMap:
name: pg-conf
defaultMode: 0444
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources:
requests:
storage: 80Gi
# kubectl -n ingress-nginx create configmap tcp-services \
# --from-literal="5432=db/postgres:5432" \
# -o yaml --dry-run=client | kubectl apply -f -
# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \
# --type='json' -p='[
# {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}
# ]'
# # controller must listen on hostPort:5432 (we already patched earlier)

View File

@ -0,0 +1,275 @@
---
apiVersion: v1
kind: Namespace
metadata:
name: db
---
# Password secret (replace with your own or generate one)
apiVersion: v1
kind: Secret
metadata:
name: pg18-secret
namespace: db
type: Opaque
stringData:
POSTGRES_PASSWORD: "pa$$word"
---
# Init SQL: keeps your original name and keeps enabling PostGIS + vector
apiVersion: v1
kind: ConfigMap
metadata:
name: pg-init-sql
namespace: db
data:
00_extensions.sql: |
-- enable common extensions in the default DB and template1 so future DBs inherit them
\connect gitea
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS vector;
CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false);
CREATE EXTENSION IF NOT EXISTS tablefunc;
-- postpone pg_stat_statements CREATE to postStart (needs preload)
CREATE EXTENSION IF NOT EXISTS postgis_topology;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- PL/Python (available in your image)
DO $$ BEGIN
CREATE EXTENSION IF NOT EXISTS plpython3u;
EXCEPTION WHEN undefined_file THEN
RAISE NOTICE 'plpython3u not available in this image';
END $$;
-- Also on template1 for new DBs (heavier, but intentional)
\connect template1
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- Arabic-friendly ICU collation, non-deterministic for case/diacritics
DO $$
BEGIN
PERFORM 1 FROM pg_collation WHERE collname='arabic';
IF NOT FOUND THEN
CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false);
END IF;
END$$;
01_tune.sql: |
    -- Enable pg_stat_statements on the next server start.
    -- ALTER SYSTEM cannot run inside a DO block/function, so set it directly;
    -- on a fresh initdb shared_preload_libraries is empty, so this overwrite is safe.
    ALTER SYSTEM SET shared_preload_libraries = 'pg_stat_statements';
-- Optional tuning (adjust to your limits)
ALTER SYSTEM SET shared_buffers = '1GB';
ALTER SYSTEM SET work_mem = '32MB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET max_connections = 200;
-- Reload applies some settings immediately; others need restart (OK after init completes)
SELECT pg_reload_conf();
ALTER SYSTEM SET pg_stat_statements.max = 10000;
ALTER SYSTEM SET pg_stat_statements.track = 'all';
ALTER SYSTEM SET pg_stat_statements.save = on;
pg_hba.conf: |
# Allow loopback
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# Allow TLS connections from your IP(s) only
hostssl all all YOUR_PUBLIC_IP/32 md5
# (Optional) Add more CIDRs or a private network range here:
# hostssl all all 10.0.0.0/8 md5
---
# Headless service required by StatefulSet for stable network IDs
apiVersion: v1
kind: Service
metadata:
name: postgres-hl
namespace: db
spec:
clusterIP: None
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
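# Each replica gets a stable DNS name through this headless Service,
# e.g. postgres-0.postgres-hl.db.svc.cluster.local for direct in-cluster access.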
---
# Regular ClusterIP service for clients (keeps your original name)
apiVersion: v1
kind: Service
metadata:
name: postgres
namespace: db
spec:
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
namespace: db
spec:
serviceName: postgres-hl
replicas: 1
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
securityContext:
runAsUser: 999
runAsGroup: 999
fsGroup: 999
fsGroupChangePolicy: "Always"
initContainers:
# Copy cert-manager certs to a writable path with correct perms for Postgres
- name: install-certs
image: busybox:1.36
command:
- sh
- -c
- |
cp /in/tls.crt /out/server.crt
cp /in/tls.key /out/server.key
cp /in/ca.crt /out/ca.crt || true
chown 999:999 /out/* || true
chmod 600 /out/server.key
securityContext:
runAsUser: 0
volumeMounts:
- { name: pg-tls, mountPath: /in, readOnly: true }
- { name: pg-certs, mountPath: /out }
containers:
- name: postgres
image: axxs/postgres:18-postgis-vector
imagePullPolicy: IfNotPresent
args:
- -c
- ssl=on
- -c
- ssl_cert_file=/certs/server.crt
- -c
- ssl_key_file=/certs/server.key
- -c
- ssl_ca_file=/certs/ca.crt
- -c
- hba_file=/etc/postgresql-custom/pg_hba.conf
lifecycle:
postStart:
exec:
command:
- /bin/sh
- -c
- |
set -e
# Wait until server accepts connections
for i in $(seq 1 30); do
pg_isready -h 127.0.0.1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" && break
sleep 1
done
psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "CREATE EXTENSION IF NOT EXISTS pg_stat_statements;"
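          # Verification sketch: once shared_preload_libraries includes pg_stat_statements
          # (set in 01_tune.sql) and the server has restarted, confirm the extension works, e.g.
          #   kubectl -n db exec postgres-0 -- psql -U app -d gitea -c "SELECT count(*) FROM pg_stat_statements;"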
env:
- name: POSTGRES_USER
value: "app"
- name: POSTGRES_DB
value: "gitea" # matches your \connect gitea
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: pg18-secret
key: POSTGRES_PASSWORD
- name: TZ
value: "Europe/Paris"
ports:
- name: postgres
containerPort: 5432
volumeMounts:
# ✅ PG 18 requires this parent path; it will create /var/lib/postgresql/18/main
- name: data
mountPath: /var/lib/postgresql
# your init scripts ConfigMap
- name: init
mountPath: /docker-entrypoint-initdb.d
readOnly: true
- name: pg-certs
mountPath: /certs
# pg_hba.conf
- name: pg-conf
mountPath: /etc/postgresql-custom
readinessProbe:
exec:
command:
- /bin/sh
- -c
- pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
livenessProbe:
exec:
command:
- /bin/sh
- -c
- pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
resources:
requests:
cpu: "250m"
memory: "512Mi"
limits:
cpu: "1"
memory: "2Gi"
volumes:
- name: init
configMap:
name: pg-init-sql
defaultMode: 0444
- name: pg-tls
secret:
secretName: pg-tls
- name: pg-certs
emptyDir: {}
- name: pg-conf
configMap:
name: pg-conf
defaultMode: 0444
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
# storageClassName: <your-storageclass> # optionally pin this

122
k8s/postgres/postgres.yaml Normal file

@ -0,0 +1,122 @@
apiVersion: v1
kind: Service
metadata: { name: postgres, namespace: db }
spec:
ports: [{ port: 5432, targetPort: 5432 }]
selector: { app: postgres }
---
apiVersion: v1
kind: ConfigMap
metadata: { name: pg-init-sql, namespace: db }
data:
00_extensions.sql: |
-- enable common extensions in the default DB and template1 so future DBs inherit them
\connect gitea
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS vector;
CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false);
CREATE EXTENSION IF NOT EXISTS tablefunc;
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
CREATE EXTENSION IF NOT EXISTS postgis_topology;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- PL/Python (optional; requires image with plpython3u, postgis image has it)
DO $$ BEGIN
CREATE EXTENSION IF NOT EXISTS plpython3u;
EXCEPTION WHEN undefined_file THEN
RAISE NOTICE 'plpython3u not available in this image';
END $$;
-- Also on template1 for new DBs:
\connect template1
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- Arabic-friendly ICU collation (PostgreSQL >= 13)
-- Non-deterministic collation helps proper case/diacritics comparisons
DO $$
BEGIN
PERFORM 1 FROM pg_collation WHERE collname='arabic';
IF NOT FOUND THEN
CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false);
END IF;
END$$;
-- Example: ensure gitea DB uses UTF8; Arabic text search often needs unaccent + custom dictionaries.
-- You can create additional DBs with: CREATE DATABASE mydb TEMPLATE template1 ENCODING 'UTF8';
01_tune.sql: |
-- small safe defaults; adjust later
ALTER SYSTEM SET shared_buffers = '1GB';
ALTER SYSTEM SET work_mem = '32MB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET max_connections = 200;
SELECT pg_reload_conf();
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: postgres, namespace: db }
spec:
serviceName: postgres
replicas: 1
selector: { matchLabels: { app: postgres } }
template:
metadata: { labels: { app: postgres } }
spec:
nodeSelector:
node: hetzner-2
securityContext:
fsGroup: 999 # Debian postgres user/group in postgis image
fsGroupChangePolicy: OnRootMismatch
initContainers:
- name: fix-perms
image: busybox:1.36
command: ["sh","-c","chown -R 999:999 /var/lib/postgresql/data || true"]
securityContext: { runAsUser: 0 }
volumeMounts: [{ name: data, mountPath: /var/lib/postgresql/data }]
containers:
- name: postgres
        image: postgis/postgis:16-3.4  # PostGIS build of PostgreSQL 16 (required for the postgis extension above)
env:
- name: POSTGRES_PASSWORD
valueFrom: { secretKeyRef: { name: postgres-auth, key: POSTGRES_PASSWORD } }
- { name: POSTGRES_USER, value: gitea }
- { name: POSTGRES_DB, value: gitea }
- name: POSTGRES_INITDB_ARGS
value: "--encoding=UTF8 --locale=C.UTF-8"
ports: [{ containerPort: 5432 }]
volumeMounts:
- { name: data, mountPath: /var/lib/postgresql/data }
- { name: init, mountPath: /docker-entrypoint-initdb.d }
      # Mount the init scripts
      volumes:
      - name: init
        configMap:
          name: pg-init-sql
          defaultMode: 0444
  volumeClaimTemplates:
  - metadata: { name: data }
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: local-ssd-hetzner
      resources: { requests: { storage: 80Gi } }

7
k8s/postgres/secret.yaml Normal file

@ -0,0 +1,7 @@
apiVersion: v1
kind: Secret
metadata: { name: postgres-auth, namespace: db }
type: Opaque
stringData:
POSTGRES_PASSWORD: "PG-ADM1N"
GITEA_DB_PASSWORD: "G1TEA"


@ -0,0 +1,13 @@
apiVersion: v1
kind: ConfigMap
metadata: { name: prometheus-config, namespace: monitoring }
data:
prometheus.yml: |
global: { scrape_interval: 15s }
scrape_configs:
- job_name: 'kubernetes-pods'
kubernetes_sd_configs: [ { role: pod } ]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
regex: 'true'
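# Usage note: with the relabel rule above, Prometheus only scrapes pods that opt in via the
# annotation below (custom scrape port/path would need extra relabel rules, not configured here):
#   metadata:
#     annotations:
#       prometheus.io/scrape: "true"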


@ -0,0 +1,55 @@
apiVersion: v1
kind: Service
metadata: { name: prometheus, namespace: monitoring }
spec:
ports: [{ port: 9090, targetPort: 9090 }]
selector: { app: prometheus }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: prometheus, namespace: monitoring }
spec:
serviceName: prometheus
replicas: 1
selector: { matchLabels: { app: prometheus } }
template:
metadata: { labels: { app: prometheus } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: prometheus
image: prom/prometheus:v2.53.0
args: ["--config.file=/etc/prometheus/prometheus.yml","--storage.tsdb.path=/prometheus"]
ports: [{ containerPort: 9090 }]
volumeMounts:
- { name: data, mountPath: /prometheus }
- { name: config, mountPath: /etc/prometheus }
volumes:
- { name: config, configMap: { name: prometheus-config } }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 50Gi } }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
namespace: monitoring
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
nginx.ingress.kubernetes.io/auth-type: basic
nginx.ingress.kubernetes.io/auth-secret: basic-auth-prometheus
nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["prometheus.betelgeusebytes.io"], secretName: prometheus-tls }]
rules:
- host: prometheus.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: prometheus, port: { number: 9090 } } }
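# The basic-auth secret referenced above can be created with htpasswd (illustrative user "admin"):
#   htpasswd -c auth admin
#   kubectl -n monitoring create secret generic basic-auth-prometheus --from-file=auth
# ingress-nginx expects the htpasswd data under the key "auth".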

21
k8s/redis/redis-pv.yaml Normal file

@ -0,0 +1,21 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-redis
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/redis
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2

40
k8s/redis/redis.yaml Normal file

@ -0,0 +1,40 @@
apiVersion: v1
kind: Service
metadata: { name: redis, namespace: db }
spec:
ports: [{ port: 6379, targetPort: 6379 }]
selector: { app: redis }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: redis, namespace: db }
spec:
serviceName: redis
replicas: 1
selector: { matchLabels: { app: redis } }
template:
metadata: { labels: { app: redis } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: redis
image: redis:7
args: ["--requirepass", "$(REDIS_PASSWORD)"]
env:
- name: REDIS_PASSWORD
valueFrom: { secretKeyRef: { name: redis-auth, key: REDIS_PASSWORD } }
ports: [{ containerPort: 6379 }]
volumeMounts:
- { name: data, mountPath: /data }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 10Gi } }
---
apiVersion: v1
kind: Secret
metadata: { name: redis-auth, namespace: db }
type: Opaque
stringData: { REDIS_PASSWORD: "RED1S" }
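# Quick connectivity check (the StatefulSet's single pod is redis-0):
#   kubectl -n db exec -it redis-0 -- redis-cli -a "RED1S" ping   # expect: PONG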

319
k8s/scripts/cleanup.sh Executable file

@ -0,0 +1,319 @@
#!/bin/bash
set -e
echo "=========================================================="
echo "Removing Existing Monitoring Stack"
echo "=========================================================="
echo ""
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color
echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}"
echo " - Prometheus (standalone or operator)"
echo " - Grafana"
echo " - Fluent Bit"
echo " - Vector"
echo " - Loki"
echo " - Tempo"
echo " - Node exporters"
echo " - kube-state-metrics"
echo " - Any monitoring/prometheus/grafana namespaces"
echo ""
echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}"
echo ""
read -p "Are you sure you want to continue? (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
echo "Cleanup cancelled."
exit 0
fi
echo ""
echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}"
# Common namespace names for monitoring
NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging")
for ns in "${NAMESPACES[@]}"; do
if kubectl get namespace "$ns" &> /dev/null; then
echo -e "${GREEN}Found namespace: $ns${NC}"
# Show what's in the namespace
echo " Resources in $ns:"
kubectl get all -n "$ns" 2>/dev/null | head -20 || true
echo ""
read -p " Delete namespace '$ns'? (yes/no): " delete_ns
if [ "$delete_ns" = "yes" ]; then
echo " Deleting namespace $ns..."
kubectl delete namespace "$ns" --timeout=120s || {
echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}"
kubectl delete namespace "$ns" --grace-period=0 --force &
}
fi
fi
done
echo ""
echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}"
# Check if helm is available
if command -v helm &> /dev/null; then
echo "Checking for Helm releases..."
# Common Helm release names
RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack")
for release in "${RELEASES[@]}"; do
# Check all namespaces for the release
if helm list -A | grep -q "$release"; then
ns=$(helm list -A | grep "$release" | awk '{print $2}')
echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}"
read -p " Uninstall Helm release '$release'? (yes/no): " uninstall
if [ "$uninstall" = "yes" ]; then
echo " Uninstalling $release..."
helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}"
fi
fi
done
else
echo "Helm not found, skipping Helm releases check"
fi
echo ""
echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}"
# Remove common DaemonSets in kube-system or default
echo "Checking for monitoring DaemonSets..."
for ns in kube-system default; do
if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then
echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}"
kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector"
read -p " Delete these DaemonSets? (yes/no): " delete_ds
if [ "$delete_ds" = "yes" ]; then
kubectl delete daemonset -n "$ns" -l app=node-exporter --ignore-not-found
kubectl delete daemonset -n "$ns" -l app=fluent-bit --ignore-not-found
kubectl delete daemonset -n "$ns" -l app=fluentd --ignore-not-found
kubectl delete daemonset -n "$ns" -l app=vector --ignore-not-found
kubectl delete daemonset -n "$ns" node-exporter --ignore-not-found
kubectl delete daemonset -n "$ns" fluent-bit --ignore-not-found
kubectl delete daemonset -n "$ns" fluentd --ignore-not-found
kubectl delete daemonset -n "$ns" vector --ignore-not-found
fi
fi
done
# Remove common Deployments
echo ""
echo "Checking for monitoring Deployments..."
for ns in kube-system default; do
if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring Deployments in $ns${NC}"
kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo"
read -p " Delete these Deployments? (yes/no): " delete_deploy
if [ "$delete_deploy" = "yes" ]; then
kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found
kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found
kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found
kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found
kubectl delete deployment -n "$ns" prometheus --ignore-not-found
kubectl delete deployment -n "$ns" grafana --ignore-not-found
kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found
fi
fi
done
# Remove common StatefulSets
echo ""
echo "Checking for monitoring StatefulSets..."
for ns in kube-system default; do
if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}"
kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
read -p " Delete these StatefulSets? (yes/no): " delete_sts
if [ "$delete_sts" = "yes" ]; then
kubectl delete statefulset -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete statefulset -n "$ns" -l app=grafana --ignore-not-found
kubectl delete statefulset -n "$ns" -l app=loki --ignore-not-found
kubectl delete statefulset -n "$ns" -l app=tempo --ignore-not-found
kubectl delete statefulset -n "$ns" prometheus --ignore-not-found
kubectl delete statefulset -n "$ns" grafana --ignore-not-found
kubectl delete statefulset -n "$ns" loki --ignore-not-found
kubectl delete statefulset -n "$ns" tempo --ignore-not-found
fi
fi
done
echo ""
echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}"
# Ask before removing ConfigMaps (they might contain important configs)
echo "Checking for monitoring ConfigMaps..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then
echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}"
kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent"
read -p " Delete these ConfigMaps? (yes/no): " delete_cm
if [ "$delete_cm" = "yes" ]; then
kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found
kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found
kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}"
# Remove monitoring-related RBAC
echo "Checking for monitoring ClusterRoles..."
if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoles${NC}"
kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoles? (yes/no): " delete_cr
if [ "$delete_cr" = "yes" ]; then
kubectl delete clusterrole prometheus --ignore-not-found
kubectl delete clusterrole grafana --ignore-not-found
kubectl delete clusterrole kube-state-metrics --ignore-not-found
kubectl delete clusterrole fluent-bit --ignore-not-found
kubectl delete clusterrole node-exporter --ignore-not-found
fi
fi
echo "Checking for monitoring ClusterRoleBindings..."
if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}"
kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb
if [ "$delete_crb" = "yes" ]; then
kubectl delete clusterrolebinding prometheus --ignore-not-found
kubectl delete clusterrolebinding grafana --ignore-not-found
kubectl delete clusterrolebinding kube-state-metrics --ignore-not-found
kubectl delete clusterrolebinding fluent-bit --ignore-not-found
kubectl delete clusterrolebinding node-exporter --ignore-not-found
fi
fi
echo ""
echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}"
# Check for monitoring PVCs
echo "Checking for monitoring PersistentVolumeClaims..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring PVCs in $ns${NC}"
kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}"
read -p " Delete these PVCs? (yes/no): " delete_pvc
if [ "$delete_pvc" = "yes" ]; then
kubectl delete pvc -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete pvc -n "$ns" -l app=grafana --ignore-not-found
kubectl delete pvc -n "$ns" -l app=loki --ignore-not-found
kubectl delete pvc -n "$ns" -l app=tempo --ignore-not-found
# Also try by name patterns
kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true
fi
fi
fi
done
# Check for monitoring PVs
echo ""
echo "Checking for monitoring PersistentVolumes..."
if kubectl get pv 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|monitoring"; then
echo -e "${GREEN}Found monitoring PVs${NC}"
kubectl get pv | grep -E "prometheus|grafana|loki|tempo|monitoring"
echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}"
read -p " Delete these PVs? (yes/no): " delete_pv
if [ "$delete_pv" = "yes" ]; then
kubectl get pv -o name | grep -E "prometheus|grafana|loki|tempo|monitoring" | xargs -r kubectl delete || true
fi
fi
echo ""
echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}"
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then
echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}"
kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki"
read -p " Delete these Ingresses? (yes/no): " delete_ing
if [ "$delete_ing" = "yes" ]; then
kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found
kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found
kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}"
# Check for Prometheus Operator CRDs
if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then
echo -e "${GREEN}Found Prometheus Operator CRDs${NC}"
kubectl get crd | grep "monitoring.coreos.com"
echo ""
echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}"
read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd
if [ "$delete_crd" = "yes" ]; then
kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found
kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found
kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found
kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found
kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found
kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found
kubectl delete crd probes.monitoring.coreos.com --ignore-not-found
kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found
fi
fi
echo ""
echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}"
echo ""
echo "You may have monitoring data stored on your nodes at:"
echo " - /mnt/local-ssd/prometheus"
echo " - /mnt/local-ssd/grafana"
echo " - /mnt/local-ssd/loki"
echo " - /mnt/local-ssd/tempo"
echo " - /var/lib/prometheus"
echo " - /var/lib/grafana"
echo ""
echo "To remove these, SSH to each node and run:"
echo " sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}"
echo " sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}"
echo ""
read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs
echo ""
echo -e "${GREEN}=========================================================="
echo "Existing Monitoring Stack Cleanup Complete!"
echo "==========================================================${NC}"
echo ""
echo "Summary of actions taken:"
echo " - Removed monitoring namespaces (if confirmed)"
echo " - Uninstalled Helm releases (if found and confirmed)"
echo " - Removed standalone monitoring components"
echo " - Removed monitoring ConfigMaps"
echo " - Removed RBAC resources"
echo " - Removed PVCs and PVs (if confirmed)"
echo " - Removed Ingresses"
echo " - Removed Prometheus Operator CRDs (if confirmed)"
echo ""
echo -e "${YELLOW}Next Steps:${NC}"
echo "1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'"
echo "2. Clean up node data directories (see above)"
echo "3. Deploy new observability stack: ./deploy.sh"
echo ""

98
k8s/sso/sso.yaml Normal file

@ -0,0 +1,98 @@
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-auth
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/auth
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# k8s/auth/keycloak/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: keycloak-admin, namespace: db }
type: Opaque
stringData: { KEYCLOAK_ADMIN: "admin", KEYCLOAK_ADMIN_PASSWORD: "admin" }
---
# k8s/auth/keycloak/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: keycloak-data, namespace: db }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 10Gi } }
---
# k8s/auth/keycloak/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: keycloak, namespace: db }
spec:
replicas: 1
selector: { matchLabels: { app: keycloak } }
template:
metadata: { labels: { app: keycloak } }
spec:
# Ensure the PV is owned by the Keycloak UID/GID
securityContext:
fsGroup: 1000
initContainers:
- name: fix-permissions
image: busybox
command: ['sh', '-c', 'chown -R 1000:1000 /opt/keycloak/data && chmod -R 755 /opt/keycloak/data']
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:latest
args: ["start","--http-enabled=true","--proxy-headers=xforwarded","--hostname-strict=false"]
env:
- { name: KEYCLOAK_ADMIN, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN } } }
- { name: KEYCLOAK_ADMIN_PASSWORD, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN_PASSWORD } } }
ports: [{ containerPort: 8080 }]
volumeMounts: [{ name: data, mountPath: /opt/keycloak/data }]
securityContext:
runAsUser: 1000
runAsGroup: 1000
volumes:
- name: data
persistentVolumeClaim: { claimName: keycloak-data }
---
apiVersion: v1
kind: Service
metadata: { name: keycloak, namespace: db }
spec: { selector: { app: keycloak }, ports: [ { port: 80, targetPort: 8080 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: keycloak
namespace: db
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["auth.betelgeusebytes.io"], secretName: keycloak-tls }]
rules:
- host: auth.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: keycloak, port: { number: 80 } } }
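# To reach the admin console before DNS/TLS is in place, a port-forward sketch:
#   kubectl -n db port-forward deploy/keycloak 8080:8080
# then browse http://localhost:8080 and log in with the keycloak-admin secret values.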


@ -0,0 +1,175 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-postgres
spec:
capacity:
storage: 80Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/postgres
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-elasticsearch
spec:
capacity:
storage: 300Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/elasticsearch
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-gitea
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/gitea
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-jupyter
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/jupyter
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-kafka
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/kafka
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-zookeeper-data
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/zookeeper-data
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-zookeeper-log
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/zookeeper-log
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-prometheus
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/prometheus
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
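# Note: local PVs do not create their host paths. The directories must already exist on
# hetzner-2 before pods are scheduled, e.g. (illustrative):
#   ssh hetzner-2 'sudo mkdir -p /mnt/local-ssd/{postgres,elasticsearch,gitea,jupyter,kafka,zookeeper-data,zookeeper-log,prometheus}'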


@ -0,0 +1,6 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: local-ssd-hetzner
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer

37
k8s/tei/tei.yaml Normal file

@ -0,0 +1,37 @@
# k8s/ai/tei/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: tei, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: tei } }
template:
metadata: { labels: { app: tei } }
spec:
containers:
- name: tei
image: ghcr.io/huggingface/text-embeddings-inference:cpu-latest
env: [{ name: MODEL_ID, value: "mixedbread-ai/mxbai-embed-large-v1" }]
ports: [{ containerPort: 80 }]
---
apiVersion: v1
kind: Service
metadata: { name: tei, namespace: ml }
spec: { selector: { app: tei }, ports: [ { port: 80, targetPort: 80 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: tei
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["embeddings.betelgeusebytes.io"], secretName: tei-tls }]
rules:
- host: embeddings.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: tei, port: { number: 80 } } }
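# Usage sketch for the TEI embeddings endpoint (the model is downloaded on first start,
# so the first request may be slow):
#   curl -s https://embeddings.betelgeusebytes.io/embed \
#     -H 'Content-Type: application/json' \
#     -d '{"inputs": "hello world"}'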

541
k8s/trading/ib-gateway.yaml Normal file

@ -0,0 +1,541 @@
apiVersion: v1
kind: Namespace
metadata:
name: trading
labels:
name: trading
environment: production
---
# OPTIONAL: Use this if you want to persist IB Gateway settings/logs
# across pod restarts. For most use cases, this is NOT needed since
# IB Gateway is mostly stateless and credentials are in Secrets.
#
# Only create this PV/PVC if you need to persist:
# - TWS session data
# - Custom workspace layouts
# - Historical API usage logs
apiVersion: v1
kind: PersistentVolume
metadata:
name: ib-gateway-data
labels:
type: local
app: ib-gateway
spec:
capacity:
storage: 5Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-storage
local:
path: /mnt/local-ssd/ib-gateway # Adjust to your local SSD path
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ib-gateway-data
namespace: trading
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
storageClassName: local-storage
selector:
matchLabels:
app: ib-gateway
# To use this PVC, add to Deployment volumeMounts:
# - name: data
# mountPath: /root/Jts
# And to volumes:
# - name: data
# persistentVolumeClaim:
# claimName: ib-gateway-data
---
apiVersion: v1
kind: Secret
metadata:
name: ib-credentials
namespace: trading
type: Opaque
stringData:
# IMPORTANT: Replace these with your actual IB credentials
# For paper trading, use your paper trading account
username: "saladin85"
password: "3Lcd@05041985"
# Trading mode: "paper" or "live"
trading-mode: "paper"
# IB Gateway config (jts.ini equivalent)
# This enables headless mode and configures ports
ibgateway.conf: |
[IBGateway]
TradingMode=paper
ApiOnly=true
ReadOnlyApi=false
TrustedIPs=127.0.0.1
[IBGatewayAPI]
ApiPortNumber=4002
[Logon]
UseRemoteSettings=no
Locale=en
ColorPaletteName=dark
[Display]
ShowSplashScreen=no
---
apiVersion: v1
kind: ConfigMap
metadata:
name: ib-gateway-config
namespace: trading
data:
# Startup script to configure IB Gateway for headless operation
startup.sh: |
#!/bin/bash
set -e
echo "Starting IB Gateway in headless mode..."
echo "Trading Mode: ${TRADING_MODE}"
echo "Port: ${TWS_PORT}"
# Configure based on trading mode
if [ "${TRADING_MODE}" == "live" ]; then
export TWS_PORT=4001
echo "⚠️ LIVE TRADING MODE - USE WITH CAUTION ⚠️"
else
export TWS_PORT=4002
echo "📝 Paper Trading Mode (Safe)"
fi
# IMPORTANT: use the env vars provided by the Deployment
export IB_USERNAME="${TWS_USERID}"
export IB_PASSWORD="${TWS_PASSWORD}"
# Start IB Gateway
exec /opt/ibgateway/ibgateway-latest-standalone-linux-x64.sh \
--tws-path=/root/Jts \
--tws-settings-path=/root \
--user="${IB_USERNAME}" \
--pw="${IB_PASSWORD}" \
--mode="${TRADING_MODE}" \
--port="${TWS_PORT}"
# Health check script
healthcheck.sh: |
    #!/bin/sh
    # Check whether the TWS API port is listening.
    # Old nc-based check, kept for reference:
    #   PORT=${TWS_PORT:-4002}; nc -z localhost "$PORT"; exit $?
    # Pure-Python TCP check below (no nc required in the image)
    PORT="${TWS_PORT:-4002}"
python - <<'PY'
import os, socket, sys
port = int(os.environ.get("TWS_PORT", os.environ.get("PORT", "4002")))
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(2)
try:
s.connect(("127.0.0.1", port))
sys.exit(0)
except Exception:
sys.exit(1)
finally:
s.close()
PY
---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: ib-gateway
# namespace: trading
# labels:
# app: ib-gateway
# component: trading-infrastructure
# spec:
# replicas: 1 # IB Gateway should only have 1 instance per account
# strategy:
# type: Recreate # Avoid multiple simultaneous logins
# selector:
# matchLabels:
# app: ib-gateway
# template:
# metadata:
# labels:
# app: ib-gateway
# annotations:
# prometheus.io/scrape: "false" # No metrics endpoint by default
# spec:
# # Pin to hetzner-2 (matches your existing pattern)
# nodeSelector:
# kubernetes.io/hostname: hetzner-2
# # Security context
# securityContext:
# runAsNonRoot: false # IB Gateway requires root for VNC (even if unused)
# fsGroup: 1000
# containers:
# - name: ib-gateway
# # Using community-maintained IB Gateway image
# # Alternative: waytrade/ib-gateway:latest
# image: ghcr.io/gnzsnz/ib-gateway:stable
# imagePullPolicy: IfNotPresent
# env:
# - name: TWS_USERID
# valueFrom:
# secretKeyRef:
# name: ib-credentials
# key: username
# - name: TWS_PASSWORD
# valueFrom:
# secretKeyRef:
# name: ib-credentials
# key: password
# - name: TRADING_MODE
# valueFrom:
# secretKeyRef:
# name: ib-credentials
# key: trading-mode
# - name: TWS_PORT
# value: "4002" # Default to paper trading
# - name: READ_ONLY_API
# value: "no"
# # Ports
# ports:
# - name: paper-trading
# containerPort: 4002
# protocol: TCP
# - name: live-trading
# containerPort: 4001
# protocol: TCP
# - name: vnc
# containerPort: 5900
# protocol: TCP # VNC (not exposed externally)
# # Resource limits
# resources:
# requests:
# memory: "1Gi"
# cpu: "500m"
# limits:
# memory: "2Gi"
# cpu: "1000m"
# # Liveness probe (check if API port is responsive)
# startupProbe:
# tcpSocket:
# port: 4002
# initialDelaySeconds: 60 # Wait 60s before first check
# periodSeconds: 10 # Check every 10s
# timeoutSeconds: 5
# failureThreshold: 18 # 60s + (10s * 18) = 240s total startup time
# livenessProbe:
# tcpSocket:
# port: 4002
# initialDelaySeconds: 0 # IB Gateway takes time to start
# periodSeconds: 60
# timeoutSeconds: 5
# failureThreshold: 3
# # Readiness probe
# readinessProbe:
# tcpSocket:
# port: 4002
# initialDelaySeconds: 0
# periodSeconds: 10
# timeoutSeconds: 5
# failureThreshold: 2
# # Volume mounts for config
# volumeMounts:
# - name: ib-config
# mountPath: /root/Jts/jts.ini
# subPath: ibgateway.conf
# - name: startup-script
# mountPath: /startup.sh
# subPath: startup.sh
# - name: data
# mountPath: /root/Jts
# # Logging to stdout (Fluent Bit will collect)
# # IB Gateway logs go to /root/Jts/log by default
# lifecycle:
# postStart:
# exec:
# command:
# - /bin/sh
# - -c
# - |
# mkdir -p /root/Jts/log
# ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true
# volumes:
# - name: ib-config
# secret:
# secretName: ib-credentials
# defaultMode: 0644
# - name: startup-script
# configMap:
# name: ib-gateway-config
# defaultMode: 0755
# - name: data
# persistentVolumeClaim:
# claimName: ib-gateway-data
# # Restart policy
# restartPolicy: Always
# # DNS policy for internal cluster resolution
# dnsPolicy: ClusterFirst
apiVersion: apps/v1
kind: Deployment
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
component: trading-infrastructure
spec:
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app: ib-gateway
template:
metadata:
labels:
app: ib-gateway
annotations:
prometheus.io/scrape: "false"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
runAsNonRoot: false
fsGroup: 1000
# Seed writable jts.ini into the PVC once
initContainers:
- name: seed-jts-config
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /data
if [ ! -f /data/jts.ini ]; then
echo "Seeding jts.ini into PVC"
cp /config/ibgateway.conf /data/jts.ini
chmod 644 /data/jts.ini
else
echo "jts.ini already exists in PVC"
fi
volumeMounts:
- name: ib-config
mountPath: /config
readOnly: true
- name: data
mountPath: /data
containers:
# ------------------------------------------------------------------
# IB Gateway
# ------------------------------------------------------------------
- name: ib-gateway
image: ghcr.io/gnzsnz/ib-gateway:stable
imagePullPolicy: IfNotPresent
env:
- name: TWS_USERID
valueFrom:
secretKeyRef:
name: ib-credentials
key: username
- name: TWS_PASSWORD
valueFrom:
secretKeyRef:
name: ib-credentials
key: password
- name: TRADING_MODE
valueFrom:
secretKeyRef:
name: ib-credentials
key: trading-mode
- name: TWS_PORT
value: "4002"
- name: READ_ONLY_API
value: "no"
ports:
- name: ib-api-local
containerPort: 4002
protocol: TCP
- name: live-trading
containerPort: 4001
protocol: TCP
- name: vnc
containerPort: 5900
protocol: TCP
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# IMPORTANT: Probes should check the local IB port (4002)
startupProbe:
tcpSocket:
port: 4002
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 18
livenessProbe:
tcpSocket:
port: 4002
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
tcpSocket:
port: 4002
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 2
volumeMounts:
- name: data
mountPath: /root/Jts
lifecycle:
postStart:
exec:
command:
- sh
- -c
- |
mkdir -p /root/Jts/log
ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true
# ------------------------------------------------------------------
# Sidecar TCP proxy: accepts cluster traffic, forwards to localhost:4002
# ------------------------------------------------------------------
- name: ib-api-proxy
image: alpine/socat:1.8.0.0
imagePullPolicy: IfNotPresent
args:
- "-d"
- "-d"
- "TCP-LISTEN:4003,fork,reuseaddr"
- "TCP:127.0.0.1:4002"
ports:
- name: ib-api
containerPort: 4003
protocol: TCP
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "128Mi"
cpu: "100m"
# basic probe: is proxy listening
readinessProbe:
tcpSocket:
port: 4003
periodSeconds: 5
timeoutSeconds: 2
failureThreshold: 3
volumes:
- name: ib-config
secret:
secretName: ib-credentials
defaultMode: 0644
- name: data
persistentVolumeClaim:
claimName: ib-gateway-data
restartPolicy: Always
dnsPolicy: ClusterFirst
---
# apiVersion: v1
# kind: Service
# metadata:
# name: ib-gateway
# namespace: trading
# labels:
# app: ib-gateway
# spec:
# type: ClusterIP # Internal-only, not exposed publicly
# clusterIP: None # Headless service (optional, remove if you want a stable ClusterIP)
# selector:
# app: ib-gateway
# ports:
# - name: paper-trading
# port: 4002
# targetPort: 4002
# protocol: TCP
# - name: live-trading
# port: 4001
# targetPort: 4001
# protocol: TCP
# sessionAffinity: ClientIP # Stick to same pod (important for stateful TWS sessions)
# sessionAffinityConfig:
# clientIP:
# timeoutSeconds: 3600 # 1 hour session stickiness
apiVersion: v1
kind: Service
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
spec:
type: ClusterIP
selector:
app: ib-gateway
ports:
- name: paper-trading
port: 4002
targetPort: 4003 # <-- proxy sidecar, not the gateway directly
protocol: TCP
- name: live-trading
port: 4001
targetPort: 4001
protocol: TCP
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 3600
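# In-cluster clients connect to ib-gateway.trading.svc.cluster.local:4002; the Service
# forwards to the socat sidecar (4003), which relays to the gateway on localhost:4002.
# Quick reachability sketch from another pod (assumes a shell with nc available):
#   nc -zv ib-gateway.trading.svc.cluster.local 4002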


@ -0,0 +1,169 @@
apiVersion: v1
kind: Namespace
metadata:
name: trading
labels:
name: trading
environment: production
---
apiVersion: v1
kind: Secret
metadata:
name: ib-credentials
namespace: trading
type: Opaque
stringData:
# Rotate your creds (you pasted them earlier).
username: "saladin85"
password: "3Lcd@05041985"
trading-mode: "paper"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
component: trading-infrastructure
spec:
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app: ib-gateway
template:
metadata:
labels:
app: ib-gateway
annotations:
prometheus.io/scrape: "false"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
# Keep your original security context
securityContext:
runAsNonRoot: false
fsGroup: 1000
containers:
- name: ib-gateway
image: ghcr.io/gnzsnz/ib-gateway:stable
imagePullPolicy: IfNotPresent
# IMPORTANT: use env vars this image expects
env:
- name: TWS_USERID
valueFrom:
secretKeyRef:
name: ib-credentials
key: username
- name: TWS_PASSWORD
valueFrom:
secretKeyRef:
name: ib-credentials
key: password
- name: TRADING_MODE
valueFrom:
secretKeyRef:
name: ib-credentials
key: trading-mode
- name: READ_ONLY_API
value: "no"
# These two match what your log shows the image uses
- name: API_PORT
value: "4002"
- name: SOCAT_PORT
value: "4004"
# optional but nice
- name: TIME_ZONE
value: "Etc/UTC"
- name: TWOFA_TIMEOUT_ACTION
value: "exit"
ports:
# IB API ports (inside container / localhost use)
- name: api-paper
containerPort: 4002
protocol: TCP
- name: api-live
containerPort: 4001
protocol: TCP
# socat relay port for non-localhost clients (what we expose via Service)
- name: api-socat
containerPort: 4004
protocol: TCP
# optional UI/VNC
- name: vnc
containerPort: 5900
protocol: TCP
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# Probe the socat port (represents remote connectivity)
startupProbe:
tcpSocket:
port: 4004
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 18
readinessProbe:
tcpSocket:
port: 4004
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 2
livenessProbe:
tcpSocket:
port: 4004
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 3
restartPolicy: Always
dnsPolicy: ClusterFirst
---
apiVersion: v1
kind: Service
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
spec:
type: ClusterIP
selector:
app: ib-gateway
ports:
# Clients connect to 4002, but we forward to SOCAT_PORT=4004
- name: paper-trading
port: 4002
targetPort: 4004
protocol: TCP
# If you truly need live, you should relay live via another socat port too.
# For now keep it direct (or remove it entirely for safety).
- name: live-trading
port: 4001
targetPort: 4001
protocol: TCP
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 3600

80
k8s/vector/qdrant.yaml Normal file

@ -0,0 +1,80 @@
# k8s/vec/qdrant/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: qdrant-data, namespace: db}
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 20Gi } }
---
# k8s/vec/qdrant/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: qdrant, namespace: db}
spec:
replicas: 1
selector: { matchLabels: { app: qdrant } }
template:
metadata: { labels: { app: qdrant } }
spec:
containers:
- name: qdrant
image: qdrant/qdrant:latest
ports:
- { containerPort: 6333 } # HTTP + Web UI
- { containerPort: 6334 } # gRPC
volumeMounts:
- { name: data, mountPath: /qdrant/storage }
volumes:
- name: data
persistentVolumeClaim: { claimName: qdrant-data }
---
apiVersion: v1
kind: Service
metadata: { name: qdrant, namespace: db}
spec:
selector: { app: qdrant }
ports:
- { name: http, port: 80, targetPort: 6333 }
- { name: grpc, port: 6334, targetPort: 6334 }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: qdrant
namespace: db
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["vector.betelgeusebytes.io"], secretName: qdrant-tls }]
rules:
- host: vector.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: qdrant, port: { number: 80 } } }
---
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-qdrant
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/qdrant
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
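# Quick API check through the Ingress once the certificate is issued (Qdrant's REST API
# on 6333 is what the Service exposes as port 80):
#   curl -s https://vector.betelgeusebytes.io/collections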

142
k8s/vllm/vllm.yaml Normal file

@ -0,0 +1,142 @@
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-vllm
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/vllm
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# k8s/ai/vllm/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: vllm-auth, namespace: ml }
type: Opaque
stringData: { API_KEY: "replace_me" }
---
# k8s/ai/ollama/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: ollama, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: ollama } }
template:
metadata: { labels: { app: ollama } }
spec:
securityContext:
runAsUser: 0 # needed so the init can write into /root/.ollama
initContainers:
- name: warm-models
image: ollama/ollama:latest
command: ["/bin/sh","-c"]
args:
- |
ollama serve & # start a temp daemon
sleep 2
# pull one or more small, quantized models for CPU
ollama pull qwen2.5:3b-instruct-q4_K_M || true
ollama pull llama3.2:3b-instruct-q4_K_M || true
pkill ollama || true
volumeMounts:
- { name: data, mountPath: /root/.ollama }
containers:
- name: ollama
image: ollama/ollama:latest
env:
- { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser
ports:
- { containerPort: 11434 }
volumeMounts:
- { name: data, mountPath: /root/.ollama }
resources:
requests: { cpu: "2", memory: "4Gi" }
limits: { cpu: "4", memory: "8Gi" }
volumes:
- name: data
persistentVolumeClaim: { claimName: ollama-data }
---
# k8s/ai/ollama/svc-ing.yaml
apiVersion: v1
kind: Service
metadata: { name: ollama, namespace: ml }
spec:
selector: { app: ollama }
ports: [ { name: http, port: 80, targetPort: 11434 } ]
# ---
# # old k8s/ai/vllm/deploy.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata: { name: vllm, namespace: ml }
# spec:
# replicas: 1
# selector: { matchLabels: { app: vllm } }
# template:
# metadata: { labels: { app: vllm } }
# spec:
# containers:
# - name: vllm
# image: vllm/vllm-openai:latest
# args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"]
# env:
# - name: VLLM_API_KEY
# valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } }
# ports: [{ containerPort: 8000 }]
# resources:
# limits:
# nvidia.com/gpu: 1
# requests:
# nvidia.com/gpu: 1
# volumeMounts:
# - { name: cache, mountPath: /root/.cache/huggingface }
# volumes:
# - name: cache
# persistentVolumeClaim: { claimName: vllm-cache-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: ollama-data, namespace: ml }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 50Gi } }
# ---
#old k8s/ai/vllm/svc-ing.yaml
# apiVersion: v1
# kind: Service
# metadata: { name: vllm, namespace: ml }
# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: vllm
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }]
rules:
- host: llm.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
            backend: { service: { name: ollama, port: { number: 80 } } } # the vllm Service above is commented out; route to the running ollama Service
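# Usage sketch against the Ollama API behind this Ingress (model names must match what the
# warm-models init container pulled):
#   curl -s https://llm.betelgeusebytes.io/api/generate \
#     -d '{"model": "qwen2.5:3b-instruct-q4_K_M", "prompt": "Say hello", "stream": false}'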