From 404deb1d52948277d3222284fc1564059106a07b Mon Sep 17 00:00:00 2001 From: salah Date: Wed, 28 Jan 2026 11:07:16 +0100 Subject: [PATCH] Add observability stack and supporting scripts - Introduced combine.sh script to aggregate .txt, .py, .yml, .yaml, .ini files into betelgeusebytes.txt. - Updated Loki configuration to disable retention settings. - Modified Tempo configuration to change storage paths from /tmp to /var. - Refactored Alloy configuration to streamline Prometheus integration and removed unnecessary metrics export. - Enhanced RBAC permissions to include pod log access. - Added security context to Tempo deployment for improved security. - Created README_old.md for documentation of the observability stack. - Developed me.md as an authoritative guide for the AI infrastructure stack. - Implemented test-loki-logs.sh script to validate Loki log collection and connectivity. --- ARCHITECTURE.md | 93 + DEPLOYMENT.md | 46 + FUTURE-PROJECTS.md | 34 + INFRASTRUCTURE.md | 102 + OBSERVABILITY.md | 32 + README.md | 148 +- README_old.md | 43 + ROADMAP.md | 26 + STACK.md | 153 + betelgeusebytes.txt | 5958 +++++++++++++++++ combine.sh | 5 + k8s/observability-stack/04-loki-config.yaml | 7 +- k8s/observability-stack/05-tempo-config.yaml | 13 +- k8s/observability-stack/06-alloy-config.yaml | 22 +- k8s/observability-stack/08-rbac.yaml | 3 +- k8s/observability-stack/12-tempo.yaml | 9 +- .../{README.md => README_old.md} | 0 k8s/observability-stack/me.md | 388 ++ k8s/observability-stack/test-loki-logs.sh | 158 + 19 files changed, 7171 insertions(+), 69 deletions(-) create mode 100644 ARCHITECTURE.md create mode 100644 DEPLOYMENT.md create mode 100644 FUTURE-PROJECTS.md create mode 100644 INFRASTRUCTURE.md create mode 100644 OBSERVABILITY.md create mode 100644 README_old.md create mode 100644 ROADMAP.md create mode 100644 STACK.md create mode 100644 betelgeusebytes.txt create mode 100644 combine.sh rename k8s/observability-stack/{README.md => README_old.md} (100%) create mode 100644 k8s/observability-stack/me.md create mode 100644 k8s/observability-stack/test-loki-logs.sh diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..1b60f5c --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,93 @@ +# BetelgeuseBytes – Architecture Overview + +## High-Level Architecture + +This platform is a **self-hosted, production-grade Kubernetes stack** designed for: + +* AI / ML experimentation and serving +* Data engineering & observability +* Knowledge graphs & vector search +* Automation, workflows, and research tooling + +The architecture follows a **hub-and-spoke model**: + +* **Core Infrastructure**: Kubernetes + networking + storage +* **Platform Services**: databases, messaging, auth, observability +* **ML / AI Services**: labeling, embeddings, LLM serving, notebooks +* **Automation & Workflows**: Argo Workflows, n8n +* **Access Layer**: DNS, Ingress, TLS + +--- + +## Logical Architecture Diagram (Textual) + +``` +Internet + │ + ▼ +DNS (betelgeusebytes.io) + │ + ▼ +Ingress-NGINX (TLS via cert-manager) + │ + ├── Platform UIs (Grafana, Kibana, Gitea, Neo4j, MinIO, etc.) 
+ ├── ML UIs (Jupyter, Label Studio, MLflow) + ├── Automation (n8n, Argo) + └── APIs (Postgres TCP, Neo4j Bolt, Kafka) + +Kubernetes Cluster + ├── Control Plane + ├── Worker Nodes + ├── Stateful Workloads (local SSD) + └── Observability Stack +``` + +--- + +## Key Design Principles + +* **Bare‑metal friendly** (Hetzner dedicated servers) +* **Local SSD storage** for stateful workloads +* **Everything observable** (logs, metrics, traces) +* **CPU-first ML** with optional GPU expansion +* **Single-tenant but multi-project ready** + +--- + +## Networking + +* Cilium CNI (eBPF-based networking) +* NGINX Ingress Controller +* TCP services exposed via Ingress patch (Postgres, Neo4j Bolt) +* WireGuard mesh between nodes + +--- + +## Security Model + +* TLS everywhere (cert-manager + Let’s Encrypt) +* Namespace isolation per domain (db, ml, graph, observability…) +* Secrets stored in Kubernetes Secrets +* Optional Basic Auth on sensitive UIs +* Keycloak available for future SSO + +--- + +## Scalability Notes + +* Currently single control-plane + workers +* Designed to add: + + * More workers + * Dedicated control-plane VPS nodes + * GPU nodes (for vLLM / training) + +--- + +## What This Enables + +* Research platforms +* Knowledge graph + LLM pipelines +* End-to-end ML lifecycle +* Automated data pipelines +* Production observability-first apps diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000..9fc992c --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,46 @@ +# Deployment & Operations Guide + +## Deployment Model + +* Declarative Kubernetes manifests +* Applied via `kubectl` or Argo CD +* No Helm dependency + +--- + +## General Rules + +* Stateless apps by default +* PVCs required for state +* Secrets via Kubernetes Secrets +* Config via environment variables + +--- + +## Deployment Order (Recommended) + +1. Networking (Cilium, Ingress) +2. cert-manager +3. Storage (PVs) +4. Databases (Postgres, Redis, Kafka) +5. Observability stack +6. ML tooling +7. Automation tools +8. Custom applications + +--- + +## Operations + +* Monitor via Grafana +* Debug via logs & traces +* Upgrade via Git commits +* Rollback via Argo CD + +--- + +## Backup Strategy + +* MinIO buckets versioned +* Database snapshots +* Git repositories mirrored diff --git a/FUTURE-PROJECTS.md b/FUTURE-PROJECTS.md new file mode 100644 index 0000000..bca3341 --- /dev/null +++ b/FUTURE-PROJECTS.md @@ -0,0 +1,34 @@ +# Future Use Cases & Projects + +This platform is intentionally **general‑purpose**. 
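+
+For example, the embeddings service, the Qdrant vector database, and the LLM endpoint catalogued in [STACK.md](STACK.md) can already be chained into a minimal RAG loop. A rough sketch (the collection name, model name, and placeholder vector are illustrative; the last call assumes the Ollama API):
+
+```bash
+# 1) Embed the user query (Text Embeddings Inference exposes POST /embed)
+curl -s https://embeddings.betelgeusebytes.io/embed \
+  -H 'Content-Type: application/json' \
+  -d '{"inputs": "Which narrators appear in this chain?"}'
+
+# 2) Search Qdrant with the returned vector (replace the placeholder values)
+curl -s https://vector.betelgeusebytes.io/collections/hadith/points/search \
+  -H 'Content-Type: application/json' \
+  -d '{"vector": [0.12, -0.03, 0.88], "limit": 5}'
+
+# 3) Generate an answer from the retrieved context via the LLM endpoint
+curl -s https://llm.betelgeusebytes.io/api/generate \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "llama3", "prompt": "Answer using this context: ...", "stream": false}'
+```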
+ +## AI & ML + +* RAG platforms +* Offline assistants +* Agent systems +* NLP research + +## Knowledge Graphs + +* Academic citation graphs +* Trust & provenance systems +* Dependency analysis + +## Data Platforms + +* Event‑driven ETL +* Feature stores +* Research data lakes + +## Observability & Ops + +* Internal platform monitoring +* Security analytics +* Audit systems + +## Sovereign Deployments + +* On‑prem AI for enterprises +* NGO / government tooling +* Privacy‑preserving analytics diff --git a/INFRASTRUCTURE.md b/INFRASTRUCTURE.md new file mode 100644 index 0000000..ea30086 --- /dev/null +++ b/INFRASTRUCTURE.md @@ -0,0 +1,102 @@ +# BetelgeuseBytes – Infrastructure & Cluster Configuration + +## Hosting Provider + +* **Provider**: Hetzner +* **Server Type**: Dedicated servers +* **Region**: EU +* **Network**: Private LAN + WireGuard + +--- + +## Nodes + +### Current Nodes + +| Node | Role | Notes | +| --------- | ---------------------- | ------------------- | +| hetzner-1 | control-plane + worker | runs core workloads | +| hetzner-2 | worker + storage | hosts local SSD PVs | + +--- + +## Kubernetes Setup + +* Kubernetes installed via kubeadm +* Single cluster +* Control plane is also schedulable + +### CNI + +* **Cilium** + + * eBPF dataplane + * kube-proxy replacement + * Network policy support + +--- + +## Storage + +### Persistent Volumes + +* Backed by **local NVMe / SSD** +* Manually provisioned PVs +* Bound via PVCs + +### Storage Layout + +``` +/mnt/local-ssd/ +├── postgres/ +├── neo4j/ +├── elasticsearch/ +├── prometheus/ +├── loki/ +├── tempo/ +├── grafana/ +├── minio/ +└── qdrant/ +``` + +--- + +## Networking + +* Ingress Controller: nginx +* External DNS records → ingress IP +* TCP mappings for: + + * PostgreSQL + * Neo4j Bolt + +--- + +## TLS & Certificates + +* cert-manager +* ClusterIssuer: Let’s Encrypt +* Automatic renewal + +--- + +## Namespaces + +| Namespace | Purpose | +| ------------- | ---------------------------------- | +| db | Databases (Postgres, Redis) | +| graph | Neo4j | +| broker | Kafka | +| ml | ML tooling (Jupyter, Argo, MLflow) | +| observability | Grafana, Prometheus, Loki, Tempo | +| automation | n8n | +| devops | Gitea, Argo CD | + +--- + +## What This Infra Enables + +* Full on‑prem AI platform +* Predictable performance +* Low-latency data access +* Independence from cloud providers diff --git a/OBSERVABILITY.md b/OBSERVABILITY.md new file mode 100644 index 0000000..77f76d7 --- /dev/null +++ b/OBSERVABILITY.md @@ -0,0 +1,32 @@ +# 🔭 Observability Stack + +--- + +## Components + +- Grafana +- Prometheus +- Loki +- Tempo +- Grafana Alloy +- kube-state-metrics +- node-exporter + +--- + +## Capabilities + +- Logs ↔ traces ↔ metrics correlation +- OTLP-native instrumentation +- Centralized dashboards +- Alerting-ready + +--- + +## Instrumentation Rules + +All apps must: +- expose `/metrics` +- emit structured JSON logs +- export OTLP traces + diff --git a/README.md b/README.md index d145404..c916850 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,123 @@ -# BetelgeuseBytes K8s — Full Stack (kubectl-only) +# 🧠 BetelgeuseBytes AI Platform — Documentation -**Nodes** -- Control-plane + worker: hetzner-1 (95.217.89.53) -- Worker: hetzner-2 (138.201.254.97) +This documentation describes a **self-hosted, CPU-first AI platform** running on Kubernetes, +designed to power an **Islamic Hadith Scholar AI** and future AI/data projects. 
-## Bring up the cluster -```bash -ansible -i ansible/inventories/prod/hosts.ini all -m ping -ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml -``` +## 📚 Documentation Index -## Apply apps (edit secrets first) -```bash -kubectl apply -f k8s/00-namespaces.yaml -kubectl apply -f k8s/01-secrets/ -kubectl apply -f k8s/storage/storageclass.yaml +- [Architecture](ARCHITECTURE.md) +- [Infrastructure](INFRASTRUCTURE.md) +- [Full Stack Overview](STACK.md) +- [Deployment & Operations](DEPLOYMENT.md) +- [Observability](OBSERVABILITY.md) +- [Roadmap & Next Steps](ROADMAP.md) +- [Future Projects & Use Cases](FUTURE-PROJECTS.md) -kubectl apply -f k8s/postgres/ -kubectl apply -f k8s/redis/ -kubectl apply -f k8s/elastic/elasticsearch.yaml -kubectl apply -f k8s/elastic/kibana.yaml +## 🎯 Current Focus -kubectl apply -f k8s/gitea/ -kubectl apply -f k8s/jupyter/ -kubectl apply -f k8s/kafka/kafka.yaml -kubectl apply -f k8s/kafka/kafka-ui.yaml -kubectl apply -f k8s/neo4j/ +- Hadith sanad & matn extraction +- Narrator relationship modeling +- Knowledge graph construction +- Human-in-the-loop verification +- Explainable, sovereign AI -kubectl apply -f k8s/otlp/ -kubectl apply -f k8s/observability/fluent-bit.yaml -kubectl apply -f k8s/prometheus/ -kubectl apply -f k8s/grafana/ -``` +## 🧠 What each document gives you +### ARCHITECTURE -## DNS -A records: -- apps.betelgeusebytes.io → 95.217.89.53, 138.201.254.97 +- Logical system architecture -CNAMEs → apps.betelgeusebytes.io: -- gitea., kibana., grafana., prometheus., notebook., broker., neo4j., otlp. +- Data & control flow -(HA later) cp.k8s.betelgeusebytes.io → , 95.217.89.53, 138.201.254.97; then set control_plane_endpoint accordingly. +- Networking and security model + +- Design principles (CPU-first, sovereign, observable) + +- What the architecture enables long-term + +This is what you show to **architects and senior engineers.** + +### INFRASTRUCTURE + +- Hetzner setup (dedicated, CPU-only, SSD) + +- Node roles and responsibilities + +- Kubernetes topology + +- Cilium networking + +- Storage layout on disk + +- Namespaces and isolation strategy + +This is what you show to **ops / SRE / infra people.** + +### STACK + +- Exhaustive list of every deployed component + +- Grouped by domain: + + - Core platform + + - Databases & messaging + + - Knowledge & vectors + + - ML & AI + + - Automation & DevOps + + - Observability + + - Authentication + +For each: **what it does now + what it can be reused for** + +This is the **master mental model** of your platform. 
+ +### DEPLOYMENT + +- How the platform is deployed (kubectl + GitOps) + +- Deployment order + +- Operational rules + +- Backup strategy + +- Day-2 operations mindset + +This is your ***runbook starter.*** + +### ROADMAP + +- Clear technical phases: + + - Neo4j isnād schema + + - Authenticity scoring + + - Productization + + - Scaling (GPU, multi-project) + +This keeps the project ***directionally sane.*** + +### FUTURE-PROJECTS + +- Explicitly documents that this is **not just a Hadith stack** + +- Lists realistic reuse cases: + + - RAG + + - Knowledge graphs + + - Sovereign AI + + - Digital humanities + + - Research platforms + +This justifies the ***investment in infra quality.*** \ No newline at end of file diff --git a/README_old.md b/README_old.md new file mode 100644 index 0000000..d145404 --- /dev/null +++ b/README_old.md @@ -0,0 +1,43 @@ +# BetelgeuseBytes K8s — Full Stack (kubectl-only) + +**Nodes** +- Control-plane + worker: hetzner-1 (95.217.89.53) +- Worker: hetzner-2 (138.201.254.97) + +## Bring up the cluster +```bash +ansible -i ansible/inventories/prod/hosts.ini all -m ping +ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml +``` + +## Apply apps (edit secrets first) +```bash +kubectl apply -f k8s/00-namespaces.yaml +kubectl apply -f k8s/01-secrets/ +kubectl apply -f k8s/storage/storageclass.yaml + +kubectl apply -f k8s/postgres/ +kubectl apply -f k8s/redis/ +kubectl apply -f k8s/elastic/elasticsearch.yaml +kubectl apply -f k8s/elastic/kibana.yaml + +kubectl apply -f k8s/gitea/ +kubectl apply -f k8s/jupyter/ +kubectl apply -f k8s/kafka/kafka.yaml +kubectl apply -f k8s/kafka/kafka-ui.yaml +kubectl apply -f k8s/neo4j/ + +kubectl apply -f k8s/otlp/ +kubectl apply -f k8s/observability/fluent-bit.yaml +kubectl apply -f k8s/prometheus/ +kubectl apply -f k8s/grafana/ +``` + +## DNS +A records: +- apps.betelgeusebytes.io → 95.217.89.53, 138.201.254.97 + +CNAMEs → apps.betelgeusebytes.io: +- gitea., kibana., grafana., prometheus., notebook., broker., neo4j., otlp. + +(HA later) cp.k8s.betelgeusebytes.io → , 95.217.89.53, 138.201.254.97; then set control_plane_endpoint accordingly. diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..1b7e509 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,26 @@ +# Roadmap & Next Steps + +## Phase 1 – Knowledge Modeling + +* Design Neo4j isnād schema +* Identity resolution +* Relationship typing + +## Phase 2 – Authenticity Scoring + +* Chain continuity analysis +* Narrator reliability +* Graph‑based scoring +* LLM‑assisted reasoning + +## Phase 3 – Productization + +* Admin dashboards +* APIs +* Provenance visualization + +## Phase 4 – Scale & Extend + +* GPU nodes +* vLLM integration +* Multi‑project tenancy diff --git a/STACK.md b/STACK.md new file mode 100644 index 0000000..3111b6f --- /dev/null +++ b/STACK.md @@ -0,0 +1,153 @@ +# 🧠 BetelgeuseBytes – Full Stack Catalog + + +This document lists **every major component deployed in the cluster**, what it is used for today, and what it can be reused for. 
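+
+To cross-check this catalog against what is actually running, the following is usually enough (a quick sketch, assuming `kubectl` access with the cluster-admin kubeconfig):
+
+```bash
+# Every workload and its namespace — should line up with the tables below
+kubectl get deployments,statefulsets,daemonsets -A
+
+# Every public entry point managed by ingress-nginx
+kubectl get ingress -A
+```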
+ +--- + +## Core Platform + +| Component | Namespace | Purpose | Reuse | +| ------------- | ------------- | --------------- | --------------- | +| Kubernetes | all | Orchestration | Any platform | +| Cilium | kube-system | Networking | Secure clusters | +| NGINX Ingress | ingress-nginx | Traffic routing | API gateway | +| cert-manager | cert-manager | TLS automation | PKI | + +--- + +## Databases & Messaging + +| Component | URL / Access | Purpose | Reuse | +| ------------- | --------------- | --------------- | ---------------- | +| PostgreSQL | TCP via Ingress | Relational DB | App backends | +| Redis | internal | Cache | Queues | +| Kafka | kafka-ui UI | Event streaming | Streaming ETL | +| Elasticsearch | Kibana UI | Search + logs | Full‑text search | + +--- + +## Knowledge & Vector + +| Component | URL | Purpose | Reuse | +| --------- | ------------------------- | --------------- | --------------- | +| Neo4j | neo4j.betelgeusebytes.io | Knowledge graph | Graph analytics | +| Qdrant | vector.betelgeusebytes.io | Vector search | RAG | + +--- + +## ML & AI + +| Component | URL | Purpose | Reuse | +| ------------ | ----------------------------- | --------------- | ---------------- | +| Jupyter | notebook UI | Experiments | Research | +| Label Studio | label.betelgeusebytes.io | Annotation | Dataset creation | +| MLflow | mlflow.betelgeusebytes.io | Model tracking | MLOps | +| Ollama / LLM | llm.betelgeusebytes.io | LLM inference | Agents | +| Embeddings | embeddings.betelgeusebytes.io | Text embeddings | Semantic search | + +--- + +## Automation & DevOps + +| Component | URL | Purpose | Reuse | +| -------------- | ----------------------- | ------------------- | ----------- | +| Argo Workflows | argo.betelgeusebytes.io | Pipelines | ETL | +| Argo CD | argocd UI | GitOps | CI/CD | +| Gitea | gitea UI | Git hosting | SCM | +| n8n | automation UI | Workflow automation | Integration | + +--- + +## Observability (LGTM) + +| Component | Purpose | Reuse | +| ---------- | --------------- | ---------------------- | +| Grafana | Dashboards | Ops center | +| Prometheus | Metrics | Monitoring | +| Loki | Logs | Debugging | +| Tempo | Traces | Distributed tracing | +| Alloy | Telemetry agent | Standardized telemetry | + +--- + +## Authentication + +| Component | Purpose | Reuse | +| --------- | ---------- | ----- | +| Keycloak | OIDC / SSO | IAM | + +--- + +## Why This Stack Matters + +* Covers **data → ML → serving → observability** end‑to‑end +* Suitable for research **and** production +* Modular and future‑proof + + +# 📚 Stack Catalog — Services, URLs, Access & Usage + +This document lists **every deployed component**, how to access it, +what it is used for **now**, and what it enables **in the future**. 
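+
+A quick way to confirm that a public endpoint is reachable and serving a valid Let's Encrypt certificate (hostnames taken from the tables below; any 2xx/3xx/401 status means ingress and TLS are working):
+
+```bash
+curl -sS -o /dev/null -w "%{http_code}\n" https://grafana.betelgeusebytes.io
+
+# Inspect the certificate served by ingress-nginx for a given host
+openssl s_client -connect grafana.betelgeusebytes.io:443 -servername grafana.betelgeusebytes.io \
+  </dev/null 2>/dev/null | openssl x509 -noout -issuer -dates
+```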
+ +--- + +## 🌐 Public Services (Ingress / HTTPS) + +| Component | URL | Auth | What It Is | Current Usage | Future Usage | +|--------|-----|------|------------|---------------|--------------| +| LLM Inference | https://llm.betelgeusebytes.io | none / internal | CPU LLM server (Ollama / llama.cpp) | Extract sanad & matn as JSON | Agents, doc AI, RAG | +| Embeddings | https://embeddings.betelgeusebytes.io | none / internal | Text Embeddings Inference (HF) | Hadith & bio embeddings | Semantic search | +| Vector DB | https://vector.betelgeusebytes.io | none | Qdrant + UI | Similarity search | Recommendations | +| Graph DB | https://neo4j.betelgeusebytes.io | Basic Auth | Neo4j Browser | Isnād graph | Knowledge graphs | +| Orchestrator | https://hadith-api.betelgeusebytes.io | OIDC | FastAPI router | Core AI API | Any AI backend | +| Admin UI | https://hadith-admin.betelgeusebytes.io | OIDC | Next.js UI | Scholar review | Any internal tool | +| Labeling | https://label.betelgeusebytes.io | Local / OIDC | Label Studio | NER/RE annotation | Dataset curation | +| ML Tracking | https://mlflow.betelgeusebytes.io | OIDC | MLflow UI | Experiments & models | Governance | +| Object Storage | https://minio.betelgeusebytes.io | Access key | MinIO Console | Datasets & artifacts | Data lake | +| Pipelines | https://argo.betelgeusebytes.io | SA / OIDC | Argo Workflows UI | ML pipelines | ETL | +| Auth | https://auth.betelgeusebytes.io | Admin login | Keycloak | SSO & tokens | IAM | +| Observability | https://grafana.betelgeusebytes.io | Login | Grafana | Metrics/logs/traces | Ops center | + +--- + +## 🔐 Authentication & Access Summary + +| System | Auth Method | Who Uses It | +|-----|------------|-------------| +| Keycloak | Username / Password | Admins | +| Admin UI | OIDC (Keycloak) | Scholars | +| Orchestrator API | OIDC Bearer Token | Apps | +| MLflow | OIDC | ML engineers | +| Label Studio | Local / OIDC | Annotators | +| Neo4j | Basic Auth | Engineers | +| MinIO | Access / Secret key | Pipelines | +| Grafana | Login | Operators | + +--- + +## 🧠 Internal Cluster Services (ClusterIP) + +| Component | Namespace | Purpose | +|--------|-----------|--------| +| PostgreSQL | db | Relational storage | +| Redis | db | Cache / temp state | +| Kafka | broker | Event backbone | +| Prometheus | observability | Metrics | +| Loki | observability | Logs | +| Tempo | observability | Traces | +| Alloy | observability | Telemetry agent | + +--- + +## 🗂 Storage Responsibilities + +| Storage | Used By | Contains | +|------|--------|---------| +| MinIO | Pipelines, MLflow | Datasets, models | +| Neo4j PVC | Graph DB | Isnād graph | +| Qdrant PVC | Vector DB | Embeddings | +| PostgreSQL PVC | DB | Metadata | +| Observability PVCs | LGTM | Logs, metrics, traces | + diff --git a/betelgeusebytes.txt b/betelgeusebytes.txt new file mode 100644 index 0000000..341a1dc --- /dev/null +++ b/betelgeusebytes.txt @@ -0,0 +1,5958 @@ +=== ./ansible/inventories/prod/group_vars/all.yml === +cluster_name: prod +k8s_version: "v1.30.3" +control_plane_endpoint: "95.217.89.53:6443" # switch later to cp.k8s.betelgeusebytes.io:6443 + +pod_cidr: "10.244.0.0/16" +service_cidr: "10.96.0.0/12" +cilium_version: "1.15.7" + +local_path_dir: "/srv/k8s" +local_sc_name: "local-ssd-hetzner" + +stateful_node_label_key: "node" +stateful_node_label_val: "hetzner-2" + +=== ./ansible/inventories/prod/hosts.ini === +[k8s_control_plane] +hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 + +[k8s_workers] +hetzner-1 
ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 +hetzner-2 ansible_host=138.201.254.97 public_ip=138.201.254.97 wg_address=10.66.0.12 + +[k8s_nodes:children] +k8s_control_plane +k8s_workers + +# add tiny VPS control-planes here when ready +[new_control_planes] +# cp-a ansible_host= public_ip= wg_address=10.66.0.10 + +[all:vars] +ansible_user=root +ansible_password=3Lcd0504 +ansible_become=true + +=== ./ansible/playbooks/add-control-planes.yml === +- hosts: k8s_control_plane[0] + become: yes + roles: + - kubeadm_cp_discovery + +- hosts: new_control_planes + become: yes + roles: + - common + - wireguard + - containerd + - kubernetes + +- hosts: new_control_planes + become: yes + roles: + - kubeadm_join_cp + vars: + kubeadm_cp_join_cmd: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_cp_join_cmd | default(kubeadm_cp_join_cmd) }}" + +=== ./ansible/playbooks/site.yml === +- hosts: k8s_nodes + become: yes + # serial: 1 + roles: + # - ../roles/common + #- ../roles/wireguard + #- ../roles/containerd + #- ../roles/kubernetes + +- hosts: k8s_control_plane + become: yes + roles: + - ../roles/kubeadm_init + +# - hosts: k8s_workers +# become: yes +# roles: +# - ../roles/kubeadm_join + +- hosts: k8s_control_plane + become: yes + roles: + # - ../roles/cilium + # - ../roles/ingress + #- ../roles/cert_manager + +- hosts: k8s_nodes + become: yes + roles: + #- ../roles/storage_local_path + - ../roles/labels + +=== ./ansible/roles/cert_manager/tasks/main.yml === +- name: Install cert-manager + shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml + +- name: Wait for cert-manager pods to be ready + shell: kubectl wait --for=condition=ready --timeout=300s pod -l app.kubernetes.io/instance=cert-manager -n cert-manager + +- name: Wait for webhook endpoint to be ready + shell: | + for i in {1..30}; do + if kubectl get endpoints cert-manager-webhook -n cert-manager -o jsonpath='{.subsets[*].addresses[*].ip}' | grep -q .; then + echo "Webhook endpoint is ready" + exit 0 + fi + echo "Waiting for webhook endpoint... 
attempt $i/30" + sleep 2 + done + exit 1 + +- name: Test webhook connectivity + shell: kubectl run test-webhook --image=curlimages/curl:latest --rm -i --restart=Never -- curl -k https://cert-manager-webhook.cert-manager.svc:443/healthz + register: webhook_test + ignore_errors: yes + +- name: Display webhook test result + debug: + var: webhook_test + +- name: ClusterIssuer + copy: + dest: /root/cluster-issuer-prod.yaml + content: | + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: letsencrypt-prod + spec: + acme: +- name: ClusterIssuer + copy: + dest: /root/cluster-issuer-prod.yaml + content: | + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: letsencrypt-prod + spec: + acme: + email: admin@betelgeusebytes.io + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-prod-key + solvers: + - http01: + ingress: + class: nginx + +- name: Temporarily disable cert-manager webhook + shell: | + kubectl delete validatingwebhookconfiguration cert-manager-webhook || true + ignore_errors: yes + +- name: Apply ClusterIssuer + command: kubectl apply -f /root/cluster-issuer-prod.yaml + +- name: Reinstall cert-manager to restore webhook + shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml + +=== ./ansible/roles/cilium/tasks/main.yml === +- name: Install cilium CLI + shell: | + curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz + tar xzf cilium-linux-amd64.tar.gz -C /usr/local/bin + args: { creates: /usr/local/bin/cilium } + +- name: Deploy cilium + shell: | + cilium install --version {{ cilium_version }} --set kubeProxyReplacement=strict --set bpf.masquerade=true + +=== ./ansible/roles/common/tasks/main.yml === +- name: Disable swap + command: swapoff -a + when: ansible_swaptotal_mb|int > 0 + +- name: Ensure swap disabled on boot + replace: + path: /etc/fstab + regexp: '^([^#].*\sswap\s)' + replace: '# \1' + +- name: Kernel modules + copy: + dest: /etc/modules-load.d/containerd.conf + content: | + overlay + br_netfilter + +- name: Load modules + command: modprobe {{ item }} + loop: [overlay, br_netfilter] + +- name: Sysctl for k8s + copy: + dest: /etc/sysctl.d/99-kubernetes.conf + content: | + net.bridge.bridge-nf-call-iptables = 1 + net.bridge.bridge-nf-call-ip6tables = 1 + net.ipv4.ip_forward = 1 + vm.max_map_count = 262144 +- name: Apply sysctl + command: sysctl --system + +=== ./ansible/roles/containerd/tasks/main.yml === +- name: Install containerd + apt: + name: containerd + state: present + update_cache: yes + +- name: Ensure containerd config directory + file: + path: /etc/containerd + state: directory + mode: '0755' + +- name: Generate default config + shell: containerd config default > /etc/containerd/config.toml + args: { creates: /etc/containerd/config.toml } + +- name: Ensure SystemdCgroup=true + replace: + path: /etc/containerd/config.toml + regexp: 'SystemdCgroup = false' + replace: 'SystemdCgroup = true' + +- name: Restart containerd + service: + name: containerd + state: restarted + enabled: yes + +=== ./ansible/roles/ingress/tasks/main.yml === +- name: Deploy ingress-nginx (baremetal) + shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml + +=== ./ansible/roles/kubeadm_cp_discovery/tasks/main.yml === +- name: Upload certs and get certificate key + shell: kubeadm init phase upload-certs 
--upload-certs | tail -n 1 + register: cert_key + +- name: Compute CA cert hash + shell: | + openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}' + register: ca_hash + +- name: Create short-lived token + shell: kubeadm token create --ttl 30m + register: join_token + +- name: Determine control-plane endpoint + set_fact: + cp_endpoint: "{{ hostvars[inventory_hostname].control_plane_endpoint | default(ansible_host ~ ':6443') }}" + +- set_fact: + kubeadm_cp_join_cmd: >- + kubeadm join {{ cp_endpoint }} + --token {{ join_token.stdout }} + --discovery-token-ca-cert-hash sha256:{{ ca_hash.stdout }} + --control-plane + --certificate-key {{ cert_key.stdout }} + +=== ./ansible/roles/kubeadm_init/tasks/main.yml === +# - name: Write kubeadm config +# template: +# src: kubeadm-config.yaml.j2 +# dest: /etc/kubernetes/kubeadm-config.yaml + +# - name: Pre-pull images +# command: kubeadm config images pull + +# - name: Init control-plane +# command: kubeadm init --config=/etc/kubernetes/kubeadm-config.yaml +# args: { creates: /etc/kubernetes/admin.conf } + +# - name: Setup kubeconfig +# shell: | +# mkdir -p $HOME/.kube +# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +# chown $(id -u):$(id -g) $HOME/.kube/config + +- name: Save join command + shell: kubeadm token create --print-join-command + register: join_cmd + +- set_fact: + kubeadm_join_command_all: "{{ join_cmd.stdout }}" + +=== ./ansible/roles/kubeadm_join/tasks/main.yml === +- name: Join node to cluster + command: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_join_command_all }} --ignore-preflight-errors=FileAvailable--etc-kubernetes-kubelet.conf,FileAvailable--etc-kubernetes-pki-ca.crt,Port-10250" + +=== ./ansible/roles/kubeadm_join_cp/tasks/main.yml === +- name: Ensure join command provided + fail: + msg: "Set kubeadm_cp_join_cmd variable (string)" + when: kubeadm_cp_join_cmd is not defined + +- name: Join node as control-plane + command: "{{ kubeadm_cp_join_cmd }}" + args: + creates: /etc/kubernetes/kubelet.conf + +=== ./ansible/roles/kubernetes/tasks/main.yml === +- name: Install Kubernetes apt key + shell: curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg + args: { creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg } + +- name: Add Kubernetes repo + apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /" + state: present + +- name: Install kubeadm, kubelet, kubectl + apt: + name: [kubeadm, kubelet, kubectl] + state: present + update_cache: yes + +- name: Hold kube packages + command: apt-mark hold kubeadm kubelet kubectl + +=== ./ansible/roles/labels/tasks/main.yml === +- name: Label hetzner-2 for stateful + command: kubectl label node hetzner-2 {{ stateful_node_label_key }}={{ stateful_node_label_val }} --overwrite + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + run_once: true + +=== ./ansible/roles/storage_local_path/tasks/main.yml === +- name: Ensure local path dir + file: + path: "{{ local_path_dir }}" + state: directory + mode: '0777' + +- name: StorageClass local-ssd-hetzner + copy: + dest: /root/local-sc.yaml + content: | + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: {{ local_sc_name }} + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer + when: inventory_hostname in groups['k8s_control_plane'] + +- name: 
Apply SC + command: kubectl apply -f /root/local-sc.yaml + environment: + KUBECONFIG: /etc/kubernetes/admin.conf + when: inventory_hostname in groups['k8s_control_plane'] + +- name: Create local-path directory + file: + path: /mnt/local-ssd + state: directory + mode: '0755' + +- name: Create subdirectories for each PV + file: + path: "/mnt/local-ssd/{{ item }}" + state: directory + mode: '0755' + loop: + - postgres + - prometheus + - elasticsearch + - grafana + +- name: Copy PV manifest + template: + src: local-ssd-pv.yaml + dest: /tmp/local-ssd-pv.yaml + +- name: Apply PV + command: kubectl apply -f /tmp/local-ssd-pv.yaml + run_once: true + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + +- name: Apply SC + command: kubectl apply -f /tmp/local-ssd-sc.yaml + run_once: true + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + +=== ./ansible/roles/storage_local_path/templates/local-ssd-pv.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-postgres +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/postgres + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-prometheus +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-elasticsearch +spec: + capacity: + storage: 300Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +=== ./ansible/roles/wireguard/tasks/main.yml === +- name: Install wireguard + apt: + name: [wireguard, qrencode] + state: present + update_cache: yes + +- name: Ensure key dir + file: { path: /etc/wireguard/keys, state: directory, mode: '0700' } + +- name: Generate private key if missing + shell: "[ -f /etc/wireguard/keys/privatekey ] || (umask 077 && wg genkey > /etc/wireguard/keys/privatekey)" + args: { creates: /etc/wireguard/keys/privatekey } + +- name: Generate public key + shell: "wg pubkey < /etc/wireguard/keys/privatekey > /etc/wireguard/keys/publickey" + args: { creates: /etc/wireguard/keys/publickey } + +- name: Read pubkey + slurp: { src: /etc/wireguard/keys/publickey } + register: pubkey_raw + +- name: Read private key + slurp: { src: /etc/wireguard/keys/privatekey } + register: privkey_raw + +- set_fact: + wg_public_key: "{{ pubkey_raw.content | b64decode | trim }}" + wg_private_key: "{{ privkey_raw.content | b64decode | trim }}" + +- name: Gather facts from all hosts + setup: + delegate_to: "{{ item }}" + delegate_facts: true + loop: "{{ groups['k8s_nodes'] }}" + run_once: true + +- name: Pretty print hostvars + debug: + msg: "{{ hostvars['hetzner-1']['wg_public_key'] }}" + +- name: Render config + template: + src: wg0.conf.j2 + dest: /etc/wireguard/wg0.conf + mode: '0600' + +- name: Enable IP forward + sysctl: + name: net.ipv4.ip_forward + value: 
"1" + sysctl_set: yes + state: present + reload: yes + +- name: Enable wg-quick + service: + name: wg-quick@wg0 + enabled: yes + state: started + +- debug: + var: wg_show.stdout +=== ./ansible/roles/wireguard/vars/main.yml === +wg_interface: wg0 +wg_port: 51820 +wg_cidr: 10.66.0.0/24 +wg_nodes: + hetzner-1: { address: 10.66.0.11, public_ip: "95.217.89.53" } + hetzner-2: { address: 10.66.0.12, public_ip: "138.201.254.97" } + +=== ./DNS_RECORDS.txt === +apps.betelgeusebytes.io. 300 IN A 95.217.89.53 +apps.betelgeusebytes.io. 300 IN A 138.201.254.97 +gitea.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +kibana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +grafana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +prometheus.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +notebook.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +broker.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +neo4j.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. +otlp.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. + +=== ./k8s/00-namespaces.yaml === +apiVersion: v1 +kind: Namespace +metadata: { name: db } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: scm } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: ml } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: monitoring } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: elastic } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: broker } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: graph } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: observability } + +=== ./k8s/01-secrets/basic-auth.yaml === +# Replace each 'auth' line with a real htpasswd pair: +# htpasswd -nbBC 10 admin 'Str0ngP@ss' (copy 'admin:...' 
to value below) + +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-kibana, namespace: elastic } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-grafana, namespace: monitoring } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-prometheus, namespace: monitoring } +type: Opaque +stringData: { auth: "aadmin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-notebook, namespace: ml } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-broker, namespace: broker } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-neo4j, namespace: graph } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } + +=== ./k8s/argoflow/argo.yaml === +apiVersion: v1 +kind: Secret +metadata: + name: argo-artifacts + namespace: ml +type: Opaque +stringData: + accesskey: "minioadmin" # <-- change + secretkey: "minioadmin" # <-- change +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: workflow-controller-configmap + namespace: ml +data: + config: | + artifactRepository: + s3: + bucket: argo-artifacts + endpoint: minio.betelgeusebytes.io # no scheme here + insecure: false # https via Ingress + accessKeySecret: + name: argo-artifacts + key: accesskey + secretKeySecret: + name: argo-artifacts + key: secretkey + keyFormat: "{{workflow.namespace}}/{{workflow.name}}/{{pod.name}}" + +--- +# k8s/argo/workflows/ns-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: argo-server + namespace: ml +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: argo-namespaced + namespace: ml +rules: +- apiGroups: [""] + resources: ["pods","pods/log","secrets","configmaps","events","persistentvolumeclaims","serviceaccounts"] + verbs: ["get","list","watch","create","delete","patch","update"] +- apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get","list","watch","create","delete","patch","update"] +- apiGroups: ["argoproj.io"] + resources: ["workflows","workflowtemplates","cronworkflows","workfloweventbindings","sensors","eventsources","workflowtasksets","workflowartifactgctasks","workflowtaskresults"] + verbs: ["get","list","watch","create","delete","patch","update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: argo-namespaced-binding + namespace: ml +subjects: +- kind: ServiceAccount + name: argo-server + namespace: ml +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: argo-namespaced + +--- +# k8s/argo/workflows/controller.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: workflow-controller, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: workflow-controller } } + template: + metadata: { labels: { app: workflow-controller } } + spec: + serviceAccountName: argo-server + containers: + - name: controller + image: quay.io/argoproj/workflow-controller:latest + args: ["--namespaced"] + env: + - name: LEADER_ELECTION_IDENTITY + valueFrom: + fieldRef: + fieldPath: metadata.name + 
ports: [{ containerPort: 9090 }] + readinessProbe: + httpGet: { path: /metrics, port: 9090, scheme: HTTPS } + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: { path: /metrics, port: 9090, scheme: HTTPS } + initialDelaySeconds: 20 + periodSeconds: 20 + +--- +# k8s/argo/workflows/server.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: argo-server, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: argo-server } } + template: + metadata: { labels: { app: argo-server } } + spec: + serviceAccountName: argo-server + containers: + - name: server + image: quay.io/argoproj/argocli:latest + args: ["server","--auth-mode","server","--namespaced","--secure=false"] + ports: [{ containerPort: 2746 }] + readinessProbe: + httpGet: { path: /, port: 2746, scheme: HTTP } + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: { path: /, port: 2746, scheme: HTTP } + initialDelaySeconds: 20 + periodSeconds: 20 +--- +apiVersion: v1 +kind: Service +metadata: { name: argo-server, namespace: ml } +spec: { selector: { app: argo-server }, ports: [ { port: 80, targetPort: 2746 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: argo + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["argo.betelgeusebytes.io"], secretName: argo-tls }] + rules: + - host: argo.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: argo-server, port: { number: 80 } } } +=== ./k8s/automation/n8n.yaml === +apiVersion: v1 +kind: Namespace +metadata: + name: automation + labels: + name: automation +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: n8n-pv + labels: + app: n8n +spec: + capacity: + storage: 20Gi + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd + local: + path: /mnt/local-ssd/n8n + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: n8n-data + namespace: automation + labels: + app: n8n +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-ssd + resources: + requests: + storage: 20Gi + selector: + matchLabels: + app: n8n +--- +apiVersion: v1 +kind: Secret +metadata: + name: n8n-secrets + namespace: automation +type: Opaque +stringData: + # Generate a strong encryption key with: openssl rand -base64 32 + N8N_ENCRYPTION_KEY: "G/US0ePajEpWwRUjlchyOs6+6I/AT+0bisXmE2fugSU=" + # Optional: Database connection if using PostgreSQL + DB_TYPE: "postgresdb" + DB_POSTGRESDB_HOST: "pg.betelgeusebytes.io" + DB_POSTGRESDB_PORT: "5432" + DB_POSTGRESDB_DATABASE: "n8n" + DB_POSTGRESDB_USER: "app" + DB_POSTGRESDB_PASSWORD: "pa$$word" +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: n8n + namespace: automation +spec: + serviceName: n8n + replicas: 1 + selector: + matchLabels: + app: n8n + template: + metadata: + labels: + app: n8n + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + containers: + - name: n8n + image: n8nio/n8n:latest + ports: + - containerPort: 5678 + name: http + env: + - name: N8N_HOST + value: "n8n.betelgeusebytes.io" + - name: N8N_PORT + value: "5678" + - name: N8N_PROTOCOL + value: "https" + - name: WEBHOOK_URL + value: "https://n8n.betelgeusebytes.io/" + - name: GENERIC_TIMEZONE + value: "UTC" + - name: N8N_ENCRYPTION_KEY + 
valueFrom: + secretKeyRef: + name: n8n-secrets + key: N8N_ENCRYPTION_KEY + # Uncomment if using PostgreSQL + - name: DB_TYPE + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_TYPE + - name: DB_POSTGRESDB_HOST + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_HOST + - name: DB_POSTGRESDB_PORT + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_PORT + - name: DB_POSTGRESDB_DATABASE + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_DATABASE + - name: DB_POSTGRESDB_USER + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_USER + - name: DB_POSTGRESDB_PASSWORD + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_PASSWORD + volumeMounts: + - name: n8n-data + mountPath: /home/node/.n8n + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: /healthz + port: 5678 + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 5 + readinessProbe: + httpGet: + path: /healthz + port: 5678 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + volumes: + - name: n8n-data + persistentVolumeClaim: + claimName: n8n-data +--- +apiVersion: v1 +kind: Service +metadata: + name: n8n + namespace: automation + labels: + app: n8n +spec: + type: ClusterIP + ports: + - port: 5678 + targetPort: 5678 + protocol: TCP + name: http + selector: + app: n8n +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: n8n + namespace: automation + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + # nginx.ingress.kubernetes.io/proxy-body-size: "50m" + # nginx.ingress.kubernetes.io/proxy-read-timeout: "300" + # nginx.ingress.kubernetes.io/proxy-send-timeout: "300" + # Uncomment below if you want basic auth protection in addition to n8n's auth + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: n8n-basic-auth + # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' +spec: + ingressClassName: nginx + tls: + - hosts: + - n8n.betelgeusebytes.io + secretName: wildcard-betelgeusebytes-tls + rules: + - host: n8n.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: n8n + port: + number: 5678 +=== ./k8s/cert-manager/cluster-issuer.yaml === +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: { name: letsencrypt-prod } +spec: + acme: + email: angal.salah@gmail.com + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: { name: letsencrypt-prod-key } + solvers: + - http01: { ingress: { class: nginx } } + +=== ./k8s/elastic/elastic-pv.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-elasticsearch +spec: + capacity: + storage: 80Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +=== ./k8s/elastic/elasticsearch.yaml === +apiVersion: v1 +kind: Service +metadata: { name: elasticsearch, namespace: elastic } +spec: + ports: + - { name: http, port: 9200, targetPort: 9200 } + - { name: transport, port: 9300, targetPort: 9300 } + selector: { app: elasticsearch } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: elasticsearch, namespace: elastic } +spec: + serviceName: 
elasticsearch + replicas: 1 + selector: { matchLabels: { app: elasticsearch } } + template: + metadata: { labels: { app: elasticsearch } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: es + image: docker.elastic.co/elasticsearch/elasticsearch:8.14.0 + env: + - { name: discovery.type, value: single-node } + - { name: xpack.security.enabled, value: "false" } + - { name: ES_JAVA_OPTS, value: "-Xms2g -Xmx2g" } + ports: + - { containerPort: 9200 } + - { containerPort: 9300 } + volumeMounts: + - { name: data, mountPath: /usr/share/elasticsearch/data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 80Gi } } + +=== ./k8s/elastic/kibana.yaml === +apiVersion: v1 +kind: Service +metadata: { name: kibana, namespace: elastic } +spec: + ports: [{ port: 5601, targetPort: 5601 }] + selector: { app: kibana } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: kibana, namespace: elastic } +spec: + replicas: 1 + selector: { matchLabels: { app: kibana } } + template: + metadata: { labels: { app: kibana } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: kibana + image: docker.elastic.co/kibana/kibana:8.14.0 + env: + - { name: ELASTICSEARCH_HOSTS, value: "http://elasticsearch.elastic.svc.cluster.local:9200" } + ports: [{ containerPort: 5601 }] +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kibana + namespace: elastic + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-kibana + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["kibana.betelgeusebytes.io"], secretName: kibana-tls }] + rules: + - host: kibana.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: kibana, port: { number: 5601 } } } + +=== ./k8s/gitea/gitea-pv.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-gitea +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/gitea + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +=== ./k8s/gitea/gitea.yaml === +apiVersion: v1 +kind: Service +metadata: { name: gitea, namespace: scm } +spec: + ports: [{ port: 80, targetPort: 3000 }] + selector: { app: gitea } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: gitea, namespace: scm } +spec: + serviceName: gitea + replicas: 1 + selector: { matchLabels: { app: gitea } } + template: + metadata: { labels: { app: gitea } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: gitea + image: gitea/gitea:1.21.11 + env: + - { name: GITEA__server__ROOT_URL, value: "https://gitea.betelgeusebytes.io" } + - { name: GITEA__database__DB_TYPE, value: "postgres" } + - { name: GITEA__database__HOST, value: "postgres.db.svc.cluster.local:5432" } + - { name: GITEA__database__NAME, value: "gitea" } + - { name: GITEA__database__USER, value: "app" } + - { name: GITEA__database__PASSWD, value: "pa$$word" } + ports: [{ containerPort: 3000 }] + volumeMounts: + - { name: data, mountPath: /data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + 
storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: gitea + namespace: scm + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["gitea.betelgeusebytes.io"], secretName: gitea-tls }] + rules: + - host: gitea.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: gitea, port: { number: 80 } } } + +=== ./k8s/grafana/grafana.yaml === +apiVersion: v1 +kind: Service +metadata: { name: grafana, namespace: monitoring } +spec: + ports: [{ port: 80, targetPort: 3000 }] + selector: { app: grafana } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: grafana, namespace: monitoring } +spec: + replicas: 1 + selector: { matchLabels: { app: grafana } } + template: + metadata: { labels: { app: grafana } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: grafana + image: grafana/grafana:10.4.3 + env: + - { name: GF_SECURITY_ADMIN_USER, value: admin } + - { name: GF_SECURITY_ADMIN_PASSWORD, value: "ADMINclaude-GRAFANA" } + ports: [{ containerPort: 3000 }] +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana + namespace: monitoring + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth-grafana + nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["grafana.betelgeusebytes.io"], secretName: grafana-tls }] + rules: + - host: grafana.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: grafana, port: { number: 80 } } } + +=== ./k8s/ingress-patch/kustomization.yaml === +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: ingress-nginx + +# Create the tcp-services ConfigMap from *quoted* literals +configMapGenerator: + - name: tcp-services + literals: + - "5432=db/postgres:5432" + - "7687=graph/neo4j:7687" + +generatorOptions: + disableNameSuffixHash: true + +# Inline JSON6902 patches +patches: + # 1) Add controller arg for tcp-services + - target: + group: apps + version: v1 + kind: Deployment + name: ingress-nginx-controller + namespace: ingress-nginx + patch: |- + - op: add + path: /spec/template/spec/containers/0/args/- + value: --tcp-services-configmap=$(POD_NAMESPACE)/tcp-services + + # 2) Expose Service ports 5432 and 7687 (keeps 80/443) + - target: + version: v1 + kind: Service + name: ingress-nginx-controller + namespace: ingress-nginx + patch: |- + - op: add + path: /spec/ports/- + value: + name: tcp-5432 + port: 5432 + protocol: TCP + targetPort: 5432 + - op: add + path: /spec/ports/- + value: + name: tcp-7687 + port: 7687 + protocol: TCP + targetPort: 7687 + +=== ./k8s/jupyter/jupyter.yaml === +apiVersion: v1 +kind: Service +metadata: { name: notebook, namespace: ml } +spec: + selector: { app: jupyterlab } + ports: [{ port: 80, targetPort: 8888 }] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: jupyterlab, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: jupyterlab } } + template: + metadata: { labels: { app: jupyterlab } } + spec: + securityContext: + runAsUser: 1000 + fsGroup: 100 + nodeSelector: { node: hetzner-2 } + containers: + - name: jupyter + image: jupyter/base-notebook:latest + args: ["start-notebook.sh", "--NotebookApp.token=$(PASSWORD)"] + env: + - 
name: PASSWORD + valueFrom: { secretKeyRef: { name: jupyter-auth, key: PASSWORD } } + ports: [{ containerPort: 8888 }] + volumeMounts: + - { name: work, mountPath: /home/jovyan/work } + volumes: + - name: work + persistentVolumeClaim: { claimName: jupyter-pvc } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: jupyter-pvc, namespace: ml } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: jupyter-auth, namespace: ml } +type: Opaque +stringData: { PASSWORD: "notebook" } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: notebook + namespace: ml + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-notebook + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["notebook.betelgeusebytes.io"], secretName: notebook-tls }] + rules: + - host: notebook.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: notebook, port: { number: 80 } } } + +=== ./k8s/kafka/kafka-pv.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-kafka +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/kafka + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-data +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-data + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-log +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-log + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +=== ./k8s/kafka/kafka-ui.yaml === +apiVersion: v1 +kind: Service +metadata: { name: kafka-ui, namespace: broker } +spec: + ports: [{ port: 80, targetPort: 8080 }] + selector: { app: kafka-ui } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: kafka-ui, namespace: broker } +spec: + replicas: 1 + selector: { matchLabels: { app: kafka-ui } } + template: + metadata: { labels: { app: kafka-ui } } + spec: + containers: + - name: ui + image: provectuslabs/kafka-ui:latest + env: + - { name: KAFKA_CLUSTERS_0_NAME, value: "local" } + - { name: KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS, value: "kafka.broker.svc.cluster.local:9092" } + ports: [{ containerPort: 8080 }] +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kafka-ui + namespace: broker + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-broker + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx 
+ tls: [{ hosts: ["broker.betelgeusebytes.io"], secretName: broker-tls }] + rules: + - host: broker.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: kafka-ui, port: { number: 80 } } } + +=== ./k8s/kafka/kafka.yaml === +apiVersion: v1 +kind: Service +metadata: { name: kafka, namespace: broker } +spec: + ports: [{ name: kafka, port: 9092, targetPort: 9092 }] + selector: { app: kafka } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: kafka, namespace: broker } +spec: + serviceName: kafka + replicas: 1 + selector: { matchLabels: { app: kafka } } + template: + metadata: { labels: { app: kafka } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: kafka + image: apache/kafka:latest + env: + - { name: KAFKA_NODE_ID, value: "1" } + - { name: KAFKA_PROCESS_ROLES, value: "broker,controller" } + - { name: KAFKA_LISTENERS, value: "PLAINTEXT://:9092,CONTROLLER://:9093" } + - { name: KAFKA_ADVERTISED_LISTENERS, value: "PLAINTEXT://kafka.broker.svc.cluster.local:9092" } + - { name: KAFKA_CONTROLLER_LISTENER_NAMES, value: "CONTROLLER" } + - { name: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP, value: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" } + - { name: KAFKA_CONTROLLER_QUORUM_VOTERS, value: "1@localhost:9093" } + - { name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR, value: "1" } + - { name: KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR, value: "1" } + - { name: KAFKA_TRANSACTION_STATE_LOG_MIN_ISR, value: "1" } + - { name: KAFKA_LOG_DIRS, value: "/var/lib/kafka/data" } + - { name: CLUSTER_ID, value: "MkU3OEVBNTcwNTJENDM2Qk" } + ports: + - { containerPort: 9092 } + - { containerPort: 9093 } + volumeMounts: + - { name: data, mountPath: /var/lib/kafka/data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } + +=== ./k8s/label_studio/label.yaml === +# k8s/ai/label-studio/secret-pg.yaml +apiVersion: v1 +kind: Secret +metadata: { name: labelstudio-pg, namespace: ml } +type: Opaque +stringData: { POSTGRES_PASSWORD: "admin" } + +--- +# k8s/ai/label-studio/secret-minio.yaml +apiVersion: v1 +kind: Secret +metadata: { name: minio-label, namespace: ml } +type: Opaque +stringData: + accesskey: "minioadmin" + secretkey: "minioadmin" + +--- +# k8s/ai/label-studio/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: label-studio, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: label-studio } } + template: + metadata: { labels: { app: label-studio } } + spec: + containers: + - name: app + image: heartexlabs/label-studio:latest + env: + - { name: POSTGRE_NAME, value: "labelstudio" } + - { name: POSTGRE_USER, value: "admin" } + - name: POSTGRE_PASSWORD + valueFrom: { secretKeyRef: { name: labelstudio-pg, key: POSTGRES_PASSWORD } } + - { name: POSTGRE_HOST, value: "postgres.db.svc.cluster.local" } + - { name: POSTGRE_PORT, value: "5432" } + - { name: S3_ENDPOINT, value: "https://minio.betelgeusebytes.io" } + - name: AWS_ACCESS_KEY_ID + valueFrom: { secretKeyRef: { name: minio-label, key: accesskey } } + - name: AWS_SECRET_ACCESS_KEY + valueFrom: { secretKeyRef: { name: minio-label, key: secretkey } } + - name: ALLOWED_HOSTS + value: "label.betelgeusebytes.io" + - name: CSRF_TRUSTED_ORIGINS + value: "https://label.betelgeusebytes.io" + - name: CSRF_COOKIE_SECURE + value: "1" + - name: SESSION_COOKIE_SECURE + value: "1" + ports: [{ containerPort: 8080 }] +--- +apiVersion: v1 +kind: Service 
+metadata: { name: label-studio, namespace: ml } +spec: { selector: { app: label-studio }, ports: [ { port: 80, targetPort: 8080 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: label-studio + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["label.betelgeusebytes.io"], secretName: label-tls }] + rules: + - host: label.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: label-studio, port: { number: 80 } } } + +=== ./k8s/minio/minio.yaml === +apiVersion: v1 +kind: Namespace +metadata: { name: storage } +--- +# k8s/storage/minio/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: minio-root, namespace: storage } +type: Opaque +stringData: + MINIO_ROOT_USER: "minioadmin" + MINIO_ROOT_PASSWORD: "minioadmin" + +--- +# k8s/storage/minio/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: minio-data, namespace: storage } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } + +--- +# k8s/storage/minio/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: minio, namespace: storage } +spec: + replicas: 1 + selector: { matchLabels: { app: minio } } + template: + metadata: { labels: { app: minio } } + spec: + containers: + - name: minio + image: minio/minio:latest + args: ["server","/data","--console-address",":9001"] + envFrom: [{ secretRef: { name: minio-root } }] + ports: + - { containerPort: 9000 } # S3 + - { containerPort: 9001 } # Console + volumeMounts: + - { name: data, mountPath: /data } + volumes: + - name: data + persistentVolumeClaim: { claimName: minio-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: minio, namespace: storage } +spec: + selector: { app: minio } + ports: + - { name: s3, port: 9000, targetPort: 9000 } + - { name: console, port: 9001, targetPort: 9001 } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: minio + namespace: storage + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["minio.betelgeusebytes.io"], secretName: minio-tls }] + rules: + - host: minio.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: minio, port: { number: 9001 } } } +--- +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-minio +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/minio + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +=== ./k8s/mlflow/mlflow.yaml === +# k8s/mlops/mlflow/secret-pg.yaml +apiVersion: v1 +kind: Secret +metadata: { name: mlflow-pg, namespace: ml } +type: Opaque +stringData: { POSTGRES_PASSWORD: "pa$$word" } + +--- +# k8s/mlops/mlflow/secret-minio.yaml +apiVersion: v1 +kind: Secret +metadata: { name: mlflow-minio, namespace: ml } +type: Opaque +stringData: + accesskey: "minioadmin" + secretkey: "minioadmin" + +--- +# k8s/mlops/mlflow/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: mlflow, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: mlflow } } + template: + metadata: { labels: { app: mlflow } } + spec: + containers: + - name: mlflow + # image: ghcr.io/mlflow/mlflow:v3.6.0 + 
image: axxs/mlflow-pg + env: + - { name: MLFLOW_BACKEND_STORE_URI, + value: "postgresql://admin:admin@postgres.db.svc.cluster.local:5432/mlflow" } + - { name: POSTGRES_PASSWORD, valueFrom: { secretKeyRef: { name: mlflow-pg, key: POSTGRES_PASSWORD } } } + - { name: MLFLOW_S3_ENDPOINT_URL, value: "https://minio.betelgeusebytes.io" } + - { name: AWS_ACCESS_KEY_ID, valueFrom: { secretKeyRef: { name: mlflow-minio, key: accesskey } } } + - { name: AWS_SECRET_ACCESS_KEY, valueFrom: { secretKeyRef: { name: mlflow-minio, key: secretkey } } } + args: ["mlflow","server","--host","0.0.0.0","--port","5000","--artifacts-destination","s3://mlflow", "--allowed-hosts", "*.betelgeusebytes.io"] + ports: [{ containerPort: 5000 }] +--- +apiVersion: v1 +kind: Service +metadata: { name: mlflow, namespace: ml } +spec: { selector: { app: mlflow }, ports: [ { port: 80, targetPort: 5000 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: mlflow + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["mlflow.betelgeusebytes.io"], secretName: mlflow-tls }] + rules: + - host: mlflow.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: mlflow, port: { number: 80 } } } + + +=== ./k8s/neo4j/neo4j-pv.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-neo4j +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/neo4j + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +=== ./k8s/neo4j/neo4j.yaml === +apiVersion: v1 +kind: Service +metadata: { name: neo4j, namespace: graph } +spec: + selector: { app: neo4j } + ports: + - { name: http, port: 7474, targetPort: 7474 } + - { name: bolt, port: 7687, targetPort: 7687 } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: neo4j, namespace: graph } +spec: + serviceName: neo4j + replicas: 1 + selector: { matchLabels: { app: neo4j } } + template: + metadata: { labels: { app: neo4j } } + spec: + enableServiceLinks: false + nodeSelector: { node: hetzner-2 } + containers: + - name: neo4j + image: neo4j:5.20 + env: + - name: NEO4J_AUTH + valueFrom: { secretKeyRef: { name: neo4j-auth, key: NEO4J_AUTH } } + - name: NEO4J_dbms_ssl_policy_bolt_enabled + value: "true" + - name: NEO4J_dbms_ssl_policy_bolt_base__directory + value: "/certs/bolt" + - name: NEO4J_dbms_ssl_policy_bolt_private__key + value: "tls.key" + - name: NEO4J_dbms_ssl_policy_bolt_public__certificate + value: "tls.crt" + - name: NEO4J_dbms_connector_bolt_tls__level + value: "REQUIRED" + # Advertise public hostname so the Browser uses the external FQDN for Bolt + - name: NEO4J_dbms_connector_bolt_advertised__address + value: "neo4j.betelgeusebytes.io:7687" + # also set a default advertised address (recommended) + - name: NEO4J_dbms_default__advertised__address + value: "neo4j.betelgeusebytes.io" + ports: + - { containerPort: 7474 } + - { containerPort: 7687 } + volumeMounts: + - { name: data, mountPath: /data } + - { name: bolt-certs, mountPath: /certs/bolt } + volumes: + - name: bolt-certs + secret: + secretName: neo4j-tls + items: + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { 
requests: { storage: 20Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: neo4j-auth, namespace: graph } +type: Opaque +stringData: { NEO4J_AUTH: "neo4j/NEO4J-PASS" } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: neo4j-http + namespace: graph + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-neo4j + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["neo4j.betelgeusebytes.io"], secretName: neo4j-tls }] + rules: + - host: neo4j.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: neo4j, port: { number: 7474 } } } + +# create or update the tcp-services configmap +# kubectl -n ingress-nginx create configmap tcp-services \ +# --from-literal="7687=graph/neo4j:7687" \ +# -o yaml --dry-run=client | kubectl apply -f - + +# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ +# --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]' + +# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ +# --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]' + +# kubectl -n ingress-nginx patch deployment ingress-nginx-controller \ +# --type='json' -p='[ +# {"op":"add","path":"/spec/template/spec/containers/0/ports/-","value":{"name":"tcp-7687","containerPort":7687,"hostPort":7687,"protocol":"TCP"}} +# ]' +=== ./k8s/observability/fluent-bit.yaml === +apiVersion: v1 +kind: ServiceAccount +metadata: { name: fluent-bit, namespace: observability } +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: { name: fluent-bit-read } +rules: + - apiGroups: [""] + resources: ["pods", "namespaces"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: { name: fluent-bit-read } +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluent-bit-read +subjects: + - kind: ServiceAccount + name: fluent-bit + namespace: observability +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: fluent-bit, namespace: observability } +spec: + selector: { matchLabels: { app: fluent-bit } } + template: + metadata: { labels: { app: fluent-bit } } + spec: + serviceAccountName: fluent-bit + containers: + - name: fluent-bit + image: cr.fluentbit.io/fluent/fluent-bit:2.2.2 + volumeMounts: + - { name: varlog, mountPath: /var/log } + - { name: containers, mountPath: /var/lib/docker/containers, readOnly: true } + env: + - { name: FLUENT_ELASTICSEARCH_HOST, value: elasticsearch.elastic.svc.cluster.local } + - { name: FLUENT_ELASTICSEARCH_PORT, value: "9200" } + args: ["-i","tail","-p","path=/var/log/containers/*.log","-F","kubernetes","-o","es","-p","host=${FLUENT_ELASTICSEARCH_HOST}","-p","port=${FLUENT_ELASTICSEARCH_PORT}","-p","logstash_format=On","-p","logstash_prefix=k8s-logs"] + volumes: + - { name: varlog, hostPath: { path: /var/log } } + - { name: containers, hostPath: { path: /var/lib/docker/containers, type: DirectoryOrCreate } } + +=== ./k8s/observability-stack/00-namespace.yaml === +apiVersion: v1 +kind: Namespace +metadata: + name: observability + labels: + name: observability + monitoring: "true" + +=== 
./k8s/observability-stack/01-persistent-volumes.yaml === +--- +# Prometheus PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-data-pv +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +# Loki PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: loki-data-pv +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/loki + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +# Tempo PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: tempo-data-pv +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/tempo + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +# Grafana PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: grafana-data-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/grafana + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +=== ./k8s/observability-stack/02-persistent-volume-claims.yaml === +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: loki-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 100Gi + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tempo-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 10Gi + +=== ./k8s/observability-stack/03-prometheus-config.yaml === +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: observability +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'betelgeuse-k8s' + environment: 'production' + + # Alerting configuration (optional - can add alertmanager later) + alerting: + alertmanagers: + - static_configs: + - targets: [] + + # Rule files + rule_files: + - /etc/prometheus/rules/*.yml + + scrape_configs: + # Scrape Prometheus itself + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kubernetes API server + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + 
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + # Kubernetes nodes + - job_name: 'kubernetes-nodes' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + # Kubernetes nodes cadvisor + - job_name: 'kubernetes-cadvisor' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + # Kubernetes service endpoints + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) 
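+          # The next two rules rewrite the scrape path and target address from the
+          # service's prometheus.io/path and prometheus.io/port annotations.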
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # Kubernetes pods + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # kube-state-metrics + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.observability.svc.cluster.local:8080'] + + # node-exporter + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: node-exporter + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: instance + + # Grafana Loki + - job_name: 'loki' + static_configs: + - targets: ['loki.observability.svc.cluster.local:3100'] + + # Grafana Tempo + - job_name: 'tempo' + static_configs: + - targets: ['tempo.observability.svc.cluster.local:3200'] + + # Grafana + - job_name: 'grafana' + static_configs: + - targets: ['grafana.observability.svc.cluster.local:3000'] + +=== ./k8s/observability-stack/04-loki-config.yaml === +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: observability +data: + loki.yaml: | + auth_enabled: false + + server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: info + + common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + + schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + filesystem: + directory: /loki/chunks + + compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: false + + limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h # 7 days + retention_period: 168h # 7 days + max_query_length: 721h # 30 days for queries + max_query_parallelism: 32 + max_streams_per_user: 0 + max_global_streams_per_user: 0 + ingestion_rate_mb: 50 + ingestion_burst_size_mb: 100 + per_stream_rate_limit: 10MB + per_stream_rate_limit_burst: 20MB 
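+      # Split long range queries into 15m shards so they can be executed in parallel.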
+ split_queries_by_interval: 15m + + query_range: + align_queries_with_step: true + cache_results: true + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 500 + + frontend: + log_queries_longer_than: 5s + compress_responses: true + + query_scheduler: + max_outstanding_requests_per_tenant: 2048 + + ingester: + chunk_idle_period: 30m + chunk_block_size: 262144 + chunk_encoding: snappy + chunk_retain_period: 1m + max_chunk_age: 2h + wal: + enabled: true + dir: /loki/wal + flush_on_shutdown: true + replay_memory_ceiling: 1GB + + analytics: + reporting_enabled: false +=== ./k8s/observability-stack/05-tempo-config.yaml === +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-config + namespace: observability +data: + tempo.yaml: | + server: + http_listen_port: 3200 + log_level: info + + distributor: + receivers: + jaeger: + protocols: + thrift_http: + endpoint: 0.0.0.0:14268 + grpc: + endpoint: 0.0.0.0:14250 + zipkin: + endpoint: 0.0.0.0:9411 + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + + ingester: + max_block_duration: 5m + + compactor: + compaction: + block_retention: 168h # 7 days + + metrics_generator: + registry: + external_labels: + source: tempo + cluster: betelgeuse-k8s + storage: + path: /var/tempo/generator/wal + remote_write: + - url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write + send_exemplars: true + + storage: + trace: + backend: local + wal: + path: /var/tempo/wal + local: + path: /var/tempo/blocks + pool: + max_workers: 100 + queue_depth: 10000 + + # Single instance mode - no need for frontend/querier split + query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + + overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics] +=== ./k8s/observability-stack/06-alloy-config.yaml === +apiVersion: v1 +kind: ConfigMap +metadata: + name: alloy-config + namespace: observability +data: + config.alloy: | + // Logging configuration + logging { + level = "info" + format = "logfmt" + } + + // Discover Kubernetes pods for log collection + discovery.kubernetes "pods" { + role = "pod" + } + + // Discover Kubernetes nodes + discovery.kubernetes "nodes" { + role = "node" + } + + // Relabel pods for log collection + discovery.relabel "pod_logs" { + targets = discovery.kubernetes.pods.targets + + // Only scrape pods with logs + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + action = "keep" + regex = ".+" + } + + // Set the log path + rule { + source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"] + target_label = "__path__" + separator = "/" + replacement = "/var/log/pods/*$1/*.log" + } + + // Set namespace label + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + + // Set pod name label + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + + // Set container name label + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + + // Set node name label + rule { + source_labels = ["__meta_kubernetes_pod_node_name"] + target_label = "node" + } + + // Copy all pod labels + rule { + action = "labelmap" + regex = "__meta_kubernetes_pod_label_(.+)" + } + } + + // Read logs from discovered pods + loki.source.kubernetes "pod_logs" { + targets = discovery.relabel.pod_logs.output + forward_to = [loki.process.pod_logs.receiver] + } + + // Process and 
enrich logs + loki.process "pod_logs" { + forward_to = [loki.write.local.receiver] + + // Parse JSON logs + stage.json { + expressions = { + level = "level", + message = "message", + timestamp = "timestamp", + } + } + + // Extract log level + stage.labels { + values = { + level = "", + } + } + + // Add cluster label + stage.static_labels { + values = { + cluster = "betelgeuse-k8s", + } + } + } + + // Write logs to Loki + loki.write "local" { + endpoint { + url = "http://loki.observability.svc.cluster.local:3100/loki/api/v1/push" + } + } + + // OpenTelemetry receiver for traces + otelcol.receiver.otlp "default" { + grpc { + endpoint = "0.0.0.0:4317" + } + + http { + endpoint = "0.0.0.0:4318" + } + + output { + traces = [otelcol.exporter.otlp.tempo.input] + } + } + + // Export traces to Tempo + otelcol.exporter.otlp "tempo" { + client { + endpoint = "tempo.observability.svc.cluster.local:4317" + tls { + insecure = true + } + } + } + + // Scrape local metrics (Alloy's own metrics) + // Prometheus will scrape these via service discovery + prometheus.exporter.self "alloy" { + } +=== ./k8s/observability-stack/07-grafana-datasources.yaml === +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + namespace: observability +data: + datasources.yaml: | + apiVersion: 1 + datasources: + # Prometheus + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus.observability.svc.cluster.local:9090 + isDefault: true + editable: true + jsonData: + timeInterval: 15s + queryTimeout: 60s + httpMethod: POST + + # Loki + - name: Loki + type: loki + access: proxy + url: http://loki.observability.svc.cluster.local:3100 + editable: true + jsonData: + maxLines: 1000 + derivedFields: + - datasourceUid: tempo + matcherRegex: "traceID=(\\w+)" + name: TraceID + url: "$${__value.raw}" + + # Tempo + - name: Tempo + type: tempo + access: proxy + url: http://tempo.observability.svc.cluster.local:3200 + editable: true + uid: tempo + jsonData: + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: -1h + spanEndTimeShift: 1h + filterByTraceID: true + filterBySpanID: false + customQuery: false + tracesToMetrics: + datasourceUid: prometheus + spanStartTimeShift: -1h + spanEndTimeShift: 1h + serviceMap: + datasourceUid: prometheus + nodeGraph: + enabled: true + search: + hide: false + lokiSearch: + datasourceUid: loki + +=== ./k8s/observability-stack/08-rbac.yaml === +--- +# Prometheus ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: observability + +--- +# Prometheus ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] + +--- +# Prometheus ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: observability + +--- +# Alloy ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alloy + namespace: observability + +--- +# Alloy ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: alloy +rules: + - apiGroups: [""] + resources: + - nodes + - 
nodes/proxy + - services + - endpoints + - pods + - pods/log + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] + +--- +# Alloy ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alloy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alloy +subjects: + - kind: ServiceAccount + name: alloy + namespace: observability + +--- +# kube-state-metrics ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: observability + +--- +# kube-state-metrics ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: + - apiGroups: [""] + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + - apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + - volumeattachments + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + - ingresses + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +--- +# kube-state-metrics ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: observability +=== ./k8s/observability-stack/10-prometheus.yaml === +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: observability + labels: + app: prometheus +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + spec: + serviceAccountName: prometheus + nodeSelector: + kubernetes.io/hostname: hetzner-2 + containers: + - name: prometheus + image: prom/prometheus:v2.54.1 + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=7d' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-admin-api' + ports: + - name: http + containerPort: 9090 + protocol: TCP + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + 
resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 4Gi + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + - name: prometheus-data + mountPath: /prometheus + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-data + persistentVolumeClaim: + claimName: prometheus-data + +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: observability + labels: + app: prometheus +spec: + type: ClusterIP + ports: + - port: 9090 + targetPort: http + protocol: TCP + name: http + selector: + app: prometheus + +=== ./k8s/observability-stack/11-loki.yaml === +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: loki + namespace: observability + labels: + app: loki +spec: + serviceName: loki + replicas: 1 + selector: + matchLabels: + app: loki + template: + metadata: + labels: + app: loki + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3100" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + containers: + - name: loki + image: grafana/loki:3.2.1 + args: + - '-config.file=/etc/loki/loki.yaml' + - '-target=all' + ports: + - name: http + containerPort: 3100 + protocol: TCP + - name: grpc + containerPort: 9096 + protocol: TCP + livenessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 45 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 45 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 2Gi + volumeMounts: + - name: loki-config + mountPath: /etc/loki + - name: loki-data + mountPath: /loki + volumes: + - name: loki-config + configMap: + name: loki-config + - name: loki-data + persistentVolumeClaim: + claimName: loki-data + +--- +apiVersion: v1 +kind: Service +metadata: + name: loki + namespace: observability + labels: + app: loki +spec: + type: ClusterIP + ports: + - port: 3100 + targetPort: http + protocol: TCP + name: http + - port: 9096 + targetPort: grpc + protocol: TCP + name: grpc + selector: + app: loki + +=== ./k8s/observability-stack/12-tempo.yaml === +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: tempo + namespace: observability + labels: + app: tempo +spec: + serviceName: tempo + replicas: 1 + selector: + matchLabels: + app: tempo + template: + metadata: + labels: + app: tempo + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3200" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + containers: + - name: tempo + image: grafana/tempo:2.6.1 + args: + - '-config.file=/etc/tempo/tempo.yaml' + ports: + - name: http + containerPort: 3200 + protocol: TCP + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + - name: otlp-http + containerPort: 4318 + protocol: TCP + - name: jaeger-grpc + containerPort: 14250 + protocol: TCP + - name: jaeger-http + containerPort: 14268 + protocol: TCP + - name: zipkin + containerPort: 9411 + protocol: TCP + livenessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 
2Gi + volumeMounts: + - name: tempo-config + mountPath: /etc/tempo + - name: tempo-data + mountPath: /var/tempo + volumes: + - name: tempo-config + configMap: + name: tempo-config + - name: tempo-data + persistentVolumeClaim: + claimName: tempo-data + +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo + namespace: observability + labels: + app: tempo +spec: + type: ClusterIP + ports: + - port: 3200 + targetPort: http + protocol: TCP + name: http + - port: 4317 + targetPort: otlp-grpc + protocol: TCP + name: otlp-grpc + - port: 4318 + targetPort: otlp-http + protocol: TCP + name: otlp-http + - port: 14250 + targetPort: jaeger-grpc + protocol: TCP + name: jaeger-grpc + - port: 14268 + targetPort: jaeger-http + protocol: TCP + name: jaeger-http + - port: 9411 + targetPort: zipkin + protocol: TCP + name: zipkin + selector: + app: tempo +=== ./k8s/observability-stack/13-grafana.yaml === +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: grafana + namespace: observability + labels: + app: grafana +spec: + serviceName: grafana + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + securityContext: + fsGroup: 472 + runAsGroup: 472 + runAsUser: 472 + containers: + - name: grafana + image: grafana/grafana:11.4.0 + ports: + - name: http + containerPort: 3000 + protocol: TCP + env: + - name: GF_SECURITY_ADMIN_USER + value: admin + - name: GF_SECURITY_ADMIN_PASSWORD + value: admin # Change this in production! + - name: GF_INSTALL_PLUGINS + value: "" + - name: GF_FEATURE_TOGGLES_ENABLE + value: "traceqlEditor,correlations" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "false" + - name: GF_ANALYTICS_REPORTING_ENABLED + value: "false" + - name: GF_ANALYTICS_CHECK_FOR_UPDATES + value: "false" + livenessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + volumeMounts: + - name: grafana-data + mountPath: /var/lib/grafana + - name: grafana-datasources + mountPath: /etc/grafana/provisioning/datasources + volumes: + - name: grafana-data + persistentVolumeClaim: + claimName: grafana-data + - name: grafana-datasources + configMap: + name: grafana-datasources + +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: observability + labels: + app: grafana +spec: + type: ClusterIP + ports: + - port: 3000 + targetPort: http + protocol: TCP + name: http + selector: + app: grafana + +=== ./k8s/observability-stack/14-alloy.yaml === +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: alloy + namespace: observability + labels: + app: alloy +spec: + selector: + matchLabels: + app: alloy + template: + metadata: + labels: + app: alloy + spec: + serviceAccountName: alloy + hostNetwork: true + hostPID: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: alloy + image: grafana/alloy:v1.5.1 + args: + - run + - /etc/alloy/config.alloy + - --storage.path=/var/lib/alloy + - --server.http.listen-addr=0.0.0.0:12345 + ports: + - name: http-metrics + containerPort: 12345 + protocol: TCP + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + - name: otlp-http + containerPort: 4318 + protocol: TCP + env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + 
securityContext: + privileged: true + runAsUser: 0 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/alloy + - name: varlog + mountPath: /var/log + readOnly: true + - name: varlibdockercontainers + mountPath: /var/lib/docker/containers + readOnly: true + - name: etcmachineid + mountPath: /etc/machine-id + readOnly: true + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: config + configMap: + name: alloy-config + - name: varlog + hostPath: + path: /var/log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: etcmachineid + hostPath: + path: /etc/machine-id + +--- +apiVersion: v1 +kind: Service +metadata: + name: alloy + namespace: observability + labels: + app: alloy +spec: + type: ClusterIP + ports: + - port: 12345 + targetPort: http-metrics + protocol: TCP + name: http-metrics + - port: 4317 + targetPort: otlp-grpc + protocol: TCP + name: otlp-grpc + - port: 4318 + targetPort: otlp-http + protocol: TCP + name: otlp-http + selector: + app: alloy + +=== ./k8s/observability-stack/15-kube-state-metrics.yaml === +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: observability + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 + ports: + - name: http-metrics + containerPort: 8080 + - name: telemetry + containerPort: 8081 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: observability + labels: + app: kube-state-metrics + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + type: ClusterIP + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + - name: telemetry + port: 8081 + targetPort: telemetry + selector: + app: kube-state-metrics + +=== ./k8s/observability-stack/16-node-exporter.yaml === +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: observability + labels: + app: node-exporter +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" + spec: + hostNetwork: true + hostPID: true + containers: + - name: node-exporter + image: prom/node-exporter:v1.8.2 + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) + ports: + - name: metrics + containerPort: 9100 + protocol: TCP + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + mountPropagation: HostToContainer + readOnly: true + tolerations: + - effect: 
NoSchedule + operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / + +--- +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: observability + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" +spec: + type: ClusterIP + clusterIP: None + ports: + - name: metrics + port: 9100 + targetPort: metrics + selector: + app: node-exporter + +=== ./k8s/observability-stack/20-grafana-ingress.yaml === +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - grafana.betelgeusebytes.io + secretName: grafana-tls + rules: + - host: grafana.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + number: 3000 + +=== ./k8s/observability-stack/21-optional-ingresses.yaml === +--- +# Optional: Prometheus Ingress (for direct access to Prometheus UI) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # Optional: Add basic auth for security + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth + # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' +spec: + ingressClassName: nginx + tls: + - hosts: + - prometheus.betelgeusebytes.io + secretName: prometheus-tls + rules: + - host: prometheus.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 + +--- +# Optional: Loki Ingress (for direct API access) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: loki-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - loki.betelgeusebytes.io + secretName: loki-tls + rules: + - host: loki.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: loki + port: + number: 3100 + +--- +# Optional: Tempo Ingress (for direct API access) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tempo-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - tempo.betelgeusebytes.io + secretName: tempo-tls + rules: + - host: tempo.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: tempo + port: + number: 3200 + +=== ./k8s/observability-stack/demo-app.yaml === +--- +# Example instrumented application to test the observability stack +# This is a simple Python Flask app with OpenTelemetry instrumentation + +apiVersion: v1 +kind: ConfigMap +metadata: + name: demo-app + namespace: observability +data: + app.py: | + from flask 
import Flask, jsonify + import logging + import json + import time + import random + + # OpenTelemetry imports + from opentelemetry import trace, metrics + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.sdk.resources import Resource + from prometheus_flask_exporter import PrometheusMetrics + + # Configure structured logging + logging.basicConfig( + level=logging.INFO, + format='%(message)s' + ) + + class JSONFormatter(logging.Formatter): + def format(self, record): + log_obj = { + 'timestamp': self.formatTime(record, self.datefmt), + 'level': record.levelname, + 'message': record.getMessage(), + 'logger': record.name, + } + if hasattr(record, 'trace_id'): + log_obj['trace_id'] = record.trace_id + log_obj['span_id'] = record.span_id + return json.dumps(log_obj) + + handler = logging.StreamHandler() + handler.setFormatter(JSONFormatter()) + logger = logging.getLogger(__name__) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "demo-app"}) + + # Tracing + trace_provider = TracerProvider(resource=resource) + trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) + ) + trace.set_tracer_provider(trace_provider) + tracer = trace.get_tracer(__name__) + + # Create Flask app + app = Flask(__name__) + + # Prometheus metrics + metrics = PrometheusMetrics(app) + + # Auto-instrument Flask + FlaskInstrumentor().instrument_app(app) + + # Sample data + ITEMS = ["apple", "banana", "orange", "grape", "mango"] + + @app.route('/') + def index(): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Index page accessed", extra={ + 'trace_id': trace_id, + 'endpoint': '/' + }) + + return jsonify({ + 'service': 'demo-app', + 'status': 'healthy', + 'trace_id': trace_id + }) + + @app.route('/items') + def get_items(): + with tracer.start_as_current_span("fetch_items") as span: + # Simulate database query + time.sleep(random.uniform(0.01, 0.1)) + + span.set_attribute("items.count", len(ITEMS)) + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Items fetched", extra={ + 'trace_id': trace_id, + 'count': len(ITEMS) + }) + + return jsonify({ + 'items': ITEMS, + 'count': len(ITEMS), + 'trace_id': trace_id + }) + + @app.route('/item/') + def get_item(item_id): + with tracer.start_as_current_span("fetch_item") as span: + span.set_attribute("item.id", item_id) + trace_id = format(span.get_span_context().trace_id, '032x') + + # Simulate processing + time.sleep(random.uniform(0.01, 0.05)) + + if item_id < 0 or item_id >= len(ITEMS): + logger.warning("Item not found", extra={ + 'trace_id': trace_id, + 'item_id': item_id + }) + return jsonify({'error': 'Item not found', 'trace_id': trace_id}), 404 + + item = ITEMS[item_id] + logger.info("Item fetched", extra={ + 'trace_id': trace_id, + 'item_id': item_id, + 'item': item + }) + + return jsonify({ + 'id': item_id, + 'name': item, + 'trace_id': 
trace_id + }) + + @app.route('/slow') + def slow_endpoint(): + with tracer.start_as_current_span("slow_operation") as span: + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Slow operation started", extra={'trace_id': trace_id}) + + # Simulate slow operation + time.sleep(random.uniform(1, 3)) + + logger.info("Slow operation completed", extra={'trace_id': trace_id}) + + return jsonify({ + 'message': 'Operation completed', + 'trace_id': trace_id + }) + + @app.route('/error') + def error_endpoint(): + with tracer.start_as_current_span("error_operation") as span: + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.error("Intentional error triggered", extra={'trace_id': trace_id}) + span.set_attribute("error", True) + + return jsonify({ + 'error': 'This is an intentional error', + 'trace_id': trace_id + }), 500 + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: demo-app + namespace: observability + labels: + app: demo-app +spec: + replicas: 1 + selector: + matchLabels: + app: demo-app + template: + metadata: + labels: + app: demo-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + containers: + - name: demo-app + image: python:3.11-slim + command: + - /bin/bash + - -c + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc \ + prometheus-flask-exporter && \ + python /app/app.py + ports: + - name: http + containerPort: 8080 + volumeMounts: + - name: app-code + mountPath: /app + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: app-code + configMap: + name: demo-app + +--- +apiVersion: v1 +kind: Service +metadata: + name: demo-app + namespace: observability + labels: + app: demo-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app: demo-app + +=== ./k8s/otlp/otel-collector.yaml === +apiVersion: v1 +kind: Service +metadata: { name: otel-collector, namespace: observability } +spec: + selector: { app: otel-collector } + ports: + - { name: otlp-http, port: 4318, targetPort: 4318 } + - { name: otlp-grpc, port: 4317, targetPort: 4317 } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: otel-collector, namespace: observability } +spec: + replicas: 2 + selector: { matchLabels: { app: otel-collector } } + template: + metadata: { labels: { app: otel-collector } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.102.0 + args: ["--config=/etc/otel/config.yaml"] + ports: + - { containerPort: 4318 } + - { containerPort: 4317 } + volumeMounts: + - { name: cfg, mountPath: /etc/otel } + volumes: + - { name: cfg, configMap: { name: otel-config } } +--- +apiVersion: v1 +kind: ConfigMap +metadata: { name: otel-config, namespace: observability } +data: + config.yaml: | + receivers: + otlp: + protocols: { http: {}, grpc: {} } + processors: { batch: {} } + exporters: + logging: {} + elasticsearch: + endpoints: ["http://elasticsearch.elastic.svc.cluster.local:9200"] + logs_index: "k8s-logs" + service: + pipelines: + logs: { receivers: [otlp], processors: [batch], exporters: [elasticsearch, logging] } + traces: { 
receivers: [otlp], processors: [batch], exporters: [logging] } + metrics: { receivers: [otlp], processors: [batch], exporters: [logging] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: otlp + namespace: observability + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["otlp.betelgeusebytes.io"], secretName: otlp-tls }] + rules: + - host: otlp.betelgeusebytes.io + http: + paths: + - path: /v1/traces + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + - path: /v1/metrics + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + - path: /v1/logs + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + +=== ./k8s/postgres/pg.yaml === +# k8s/postgres/pg-init-sql-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: pg-init-sql + namespace: db +data: + 00_extensions.sql: | + \connect gitea + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS postgis_topology; + CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + DO $$ BEGIN + CREATE EXTENSION IF NOT EXISTS plpython3u; + EXCEPTION WHEN undefined_file THEN + RAISE NOTICE 'plpython3u not available in this image'; + END $$; + 01_tune.sql: | + ALTER SYSTEM SET shared_buffers = '1GB'; + ALTER SYSTEM SET work_mem = '32MB'; + ALTER SYSTEM SET maintenance_work_mem = '512MB'; + ALTER SYSTEM SET max_connections = 200; + SELECT pg_reload_conf(); +--- +# k8s/postgres/pg-conf.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: pg-conf + namespace: db +data: + pg_hba.conf: | + # Local connections + local all all trust + host all all 127.0.0.1/32 trust + host all all ::1/128 trust + # TLS-only access from ANY external IP (harden as needed) + hostssl all all 0.0.0.0/0 md5 + hostssl all all ::/0 md5 +--- +# k8s/postgres/pg-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: pg18-secret + namespace: db +type: Opaque +stringData: + POSTGRES_PASSWORD: "pa$$word" +--- +# k8s/postgres/pg-certificate.yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: pg-tls + namespace: db +spec: + secretName: pg-tls + dnsNames: + - pg.betelgeusebytes.io + issuerRef: + kind: ClusterIssuer + name: letsencrypt-prod +--- +# k8s/postgres/postgres-svc.yaml +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: db +spec: + selector: + app: postgres + ports: + - name: postgres + port: 5432 + targetPort: 5432 +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-hl + namespace: db +spec: + clusterIP: None + selector: + app: postgres + ports: + - name: postgres + port: 5432 + targetPort: 5432 +--- +# k8s/postgres/postgres.yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: db +spec: + serviceName: postgres-hl + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + securityContext: + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + fsGroupChangePolicy: "Always" + initContainers: + - name: install-certs + image: busybox:1.36 + command: + - sh + - -c + - | + cp /in/tls.crt /out/server.crt + cp /in/tls.key /out/server.key + chown 999:999 /out/* || true + 
chmod 600 /out/server.key + securityContext: + runAsUser: 0 + volumeMounts: + - { name: pg-tls, mountPath: /in, readOnly: true } + - { name: pg-certs, mountPath: /out } + containers: + - name: postgres + image: axxs/postgres:18-postgis-vector + imagePullPolicy: IfNotPresent + args: + - -c + - ssl=on + - -c + - ssl_cert_file=/certs/server.crt + - -c + - ssl_key_file=/certs/server.key + - -c + - hba_file=/etc/postgresql-custom/pg_hba.conf + env: + - name: POSTGRES_USER + value: "app" + - name: POSTGRES_DB + value: "gitea" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: pg18-secret + key: POSTGRES_PASSWORD + - name: TZ + value: "Europe/Paris" + ports: + - name: postgres + containerPort: 5432 + volumeMounts: + - { name: data, mountPath: /var/lib/postgresql } # PG18 expects parent, creates /var/lib/postgresql/18/main + - { name: init, mountPath: /docker-entrypoint-initdb.d, readOnly: true } + - { name: pg-certs, mountPath: /certs } + - { name: pg-conf, mountPath: /etc/postgresql-custom } + readinessProbe: + exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] } + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] } + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + requests: { cpu: "250m", memory: "512Mi" } + limits: { cpu: "1", memory: "2Gi" } + volumes: + - name: init + configMap: + name: pg-init-sql + defaultMode: 0444 + - name: pg-tls + secret: + secretName: pg-tls + - name: pg-certs + emptyDir: {} + - name: pg-conf + configMap: + name: pg-conf + defaultMode: 0444 + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: + requests: + storage: 80Gi + + +# kubectl -n ingress-nginx create configmap tcp-services \ +# --from-literal="5432=db/postgres:5432" \ +# -o yaml --dry-run=client | kubectl apply -f - +# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ +# --type='json' -p='[ +# {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"} +# ]' +# # controller must listen on hostPort:5432 (we already patched earlier) +=== ./k8s/postgres/postgres-ha.yaml === +--- +apiVersion: v1 +kind: Namespace +metadata: + name: db +--- +# Password secret (replace with your own or generate one) +apiVersion: v1 +kind: Secret +metadata: + name: pg18-secret + namespace: db +type: Opaque +stringData: + POSTGRES_PASSWORD: "pa$$word" +--- +# Init SQL: keeps your original name and keeps enabling PostGIS + vector +apiVersion: v1 +kind: ConfigMap +metadata: + name: pg-init-sql + namespace: db +data: + 00_extensions.sql: | + -- enable common extensions in the default DB and template1 so future DBs inherit them + \connect gitea + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS vector; + CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false); + CREATE EXTENSION IF NOT EXISTS tablefunc; + -- postpone pg_stat_statements CREATE to postStart (needs preload) + CREATE EXTENSION IF NOT EXISTS postgis_topology; + CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE 
EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + + -- PL/Python (available in your image) + DO $$ BEGIN + CREATE EXTENSION IF NOT EXISTS plpython3u; + EXCEPTION WHEN undefined_file THEN + RAISE NOTICE 'plpython3u not available in this image'; + END $$; + + -- Also on template1 for new DBs (heavier, but intentional) + \connect template1 + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + + -- Arabic-friendly ICU collation, non-deterministic for case/diacritics + DO $$ + BEGIN + PERFORM 1 FROM pg_collation WHERE collname='arabic'; + IF NOT FOUND THEN + CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false); + END IF; + END$$; + + 01_tune.sql: | + -- Enable pg_stat_statements on next server start + DO $$ + DECLARE + cur text := current_setting('shared_preload_libraries', true); + BEGIN + IF cur IS NULL OR position('pg_stat_statements' in cur) = 0 THEN + PERFORM pg_catalog.pg_reload_conf(); -- harmless even if no changes yet + EXECUTE $$ALTER SYSTEM SET shared_preload_libraries = + $$ || quote_literal(coalesce(NULLIF(cur,'' ) || ',pg_stat_statements', 'pg_stat_statements')); + END IF; + END$$; + + -- Optional tuning (adjust to your limits) + ALTER SYSTEM SET shared_buffers = '1GB'; + ALTER SYSTEM SET work_mem = '32MB'; + ALTER SYSTEM SET maintenance_work_mem = '512MB'; + ALTER SYSTEM SET max_connections = 200; + + -- Reload applies some settings immediately; others need restart (OK after init completes) + SELECT pg_reload_conf(); + ALTER SYSTEM SET pg_stat_statements.max = 10000; + ALTER SYSTEM SET pg_stat_statements.track = 'all'; + ALTER SYSTEM SET pg_stat_statements.save = on; + pg_hba.conf: | + # Allow loopback + local all all trust + host all all 127.0.0.1/32 trust + host all all ::1/128 trust + # Allow TLS connections from your IP(s) only + hostssl all all YOUR_PUBLIC_IP/32 md5 + # (Optional) Add more CIDRs or a private network range here: + # hostssl all all 10.0.0.0/8 md5 +--- +# Headless service required by StatefulSet for stable network IDs +apiVersion: v1 +kind: Service +metadata: + name: postgres-hl + namespace: db +spec: + clusterIP: None + selector: + app: postgres + ports: + - name: postgres + port: 5432 + targetPort: 5432 +--- +# Regular ClusterIP service for clients (keeps your original name) +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: db +spec: + selector: + app: postgres + ports: + - name: postgres + port: 5432 + targetPort: 5432 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: db +spec: + serviceName: postgres-hl + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + securityContext: + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + fsGroupChangePolicy: "Always" + initContainers: + # Copy cert-manager certs to a writable path with correct perms for Postgres + - name: install-certs + image: busybox:1.36 + command: + - sh + - -c + - | + cp /in/tls.crt /out/server.crt + cp /in/tls.key /out/server.key + cp /in/ca.crt /out/ca.crt || true + chown 999:999 /out/* || true + chmod 600 /out/server.key + securityContext: + runAsUser: 0 + volumeMounts: + - { name: pg-tls, mountPath: /in, readOnly: true } + - { name: pg-certs, mountPath: /out } + containers: + - 
name: postgres + image: axxs/postgres:18-postgis-vector + imagePullPolicy: IfNotPresent + args: + - -c + - ssl=on + - -c + - ssl_cert_file=/certs/server.crt + - -c + - ssl_key_file=/certs/server.key + - -c + - ssl_ca_file=/certs/ca.crt + - -c + - hba_file=/etc/postgresql-custom/pg_hba.conf + lifecycle: + postStart: + exec: + command: + - /bin/sh + - -c + - | + set -e + # Wait until server accepts connections + for i in $(seq 1 30); do + pg_isready -h 127.0.0.1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" && break + sleep 1 + done + psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" + env: + - name: POSTGRES_USER + value: "app" + - name: POSTGRES_DB + value: "gitea" # matches your \connect gitea + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: pg18-secret + key: POSTGRES_PASSWORD + - name: TZ + value: "Europe/Paris" + ports: + - name: postgres + containerPort: 5432 + volumeMounts: + # ✅ PG 18 requires this parent path; it will create /var/lib/postgresql/18/main + - name: data + mountPath: /var/lib/postgresql + # your init scripts ConfigMap + - name: init + mountPath: /docker-entrypoint-initdb.d + readOnly: true + - name: pg-certs + mountPath: /certs + # pg_hba.conf + - name: pg-conf + mountPath: /etc/postgresql-custom + readinessProbe: + exec: + command: + - /bin/sh + - -c + - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: + command: + - /bin/sh + - -c + - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1 + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1" + memory: "2Gi" + volumes: + - name: init + configMap: + name: pg-init-sql + defaultMode: 0444 + - name: pg-tls + secret: + secretName: pg-tls + - name: pg-certs + emptyDir: {} + - name: pg-conf + configMap: + name: pg-conf + defaultMode: 0444 + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + # storageClassName: # optionally pin this + +=== ./k8s/postgres/postgres.yaml === +apiVersion: v1 +kind: Service +metadata: { name: postgres, namespace: db } +spec: + ports: [{ port: 5432, targetPort: 5432 }] + selector: { app: postgres } +--- +apiVersion: v1 +kind: ConfigMap +metadata: { name: pg-init-sql, namespace: db } +data: + 00_extensions.sql: | + -- enable common extensions in the default DB and template1 so future DBs inherit them + \connect gitea + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS vector; + CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false); + CREATE EXTENSION IF NOT EXISTS tablefunc; + CREATE EXTENSION IF NOT EXISTS pg_stat_statements; + + CREATE EXTENSION IF NOT EXISTS postgis_topology; + CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + -- PL/Python (optional; requires image with plpython3u, postgis image has it) + DO $$ BEGIN + CREATE EXTENSION IF NOT EXISTS plpython3u; + EXCEPTION WHEN undefined_file THEN + RAISE NOTICE 'plpython3u not available in this image'; + END $$; + + -- Also on template1 for 
new DBs: + \connect template1 + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + + -- Arabic-friendly ICU collation (PostgreSQL >= 13) + -- Non-deterministic collation helps proper case/diacritics comparisons + DO $$ + BEGIN + PERFORM 1 FROM pg_collation WHERE collname='arabic'; + IF NOT FOUND THEN + CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false); + END IF; + END$$; + + -- Example: ensure gitea DB uses UTF8; Arabic text search often needs unaccent + custom dictionaries. + -- You can create additional DBs with: CREATE DATABASE mydb TEMPLATE template1 ENCODING 'UTF8'; + + 01_tune.sql: | + -- small safe defaults; adjust later + ALTER SYSTEM SET shared_buffers = '1GB'; + ALTER SYSTEM SET work_mem = '32MB'; + ALTER SYSTEM SET maintenance_work_mem = '512MB'; + ALTER SYSTEM SET max_connections = 200; + SELECT pg_reload_conf(); +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: postgres, namespace: db } +spec: + serviceName: postgres + replicas: 1 + selector: { matchLabels: { app: postgres } } + template: + metadata: { labels: { app: postgres } } + spec: + nodeSelector: + node: hetzner-2 + securityContext: + fsGroup: 999 # Debian postgres user/group in postgis image + fsGroupChangePolicy: OnRootMismatch + initContainers: + - name: fix-perms + image: busybox:1.36 + command: ["sh","-c","chown -R 999:999 /var/lib/postgresql/data || true"] + securityContext: { runAsUser: 0 } + volumeMounts: [{ name: data, mountPath: /var/lib/postgresql/data }] + containers: + - name: postgres + image: postgres:16-3.4 + env: + - name: POSTGRES_PASSWORD + valueFrom: { secretKeyRef: { name: postgres-auth, key: POSTGRES_PASSWORD } } + - { name: POSTGRES_USER, value: gitea } + - { name: POSTGRES_DB, value: gitea } + - name: POSTGRES_INITDB_ARGS + value: "--encoding=UTF8 --locale=C.UTF-8" + ports: [{ containerPort: 5432 }] + volumeMounts: + - { name: data, mountPath: /var/lib/postgresql/data } + - { name: init, mountPath: /docker-entrypoint-initdb.d } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 80Gi } } +--- +# Mount the init scripts +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: db +spec: + template: + spec: + volumes: + - name: init + configMap: + name: pg-init-sql + defaultMode: 0444 + +=== ./k8s/postgres/secret.yaml === +apiVersion: v1 +kind: Secret +metadata: { name: postgres-auth, namespace: db } +type: Opaque +stringData: + POSTGRES_PASSWORD: "PG-ADM1N" + GITEA_DB_PASSWORD: "G1TEA" + +=== ./k8s/prometheus/prometheus-config.yaml === +apiVersion: v1 +kind: ConfigMap +metadata: { name: prometheus-config, namespace: monitoring } +data: + prometheus.yml: | + global: { scrape_interval: 15s } + scrape_configs: + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: [ { role: pod } ] + relabel_configs: + - action: keep + source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + regex: 'true' + +=== ./k8s/prometheus/prometheus.yaml === +apiVersion: v1 +kind: Service +metadata: { name: prometheus, namespace: monitoring } +spec: + ports: [{ port: 9090, targetPort: 9090 }] + selector: { app: prometheus } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: 
prometheus, namespace: monitoring } +spec: + serviceName: prometheus + replicas: 1 + selector: { matchLabels: { app: prometheus } } + template: + metadata: { labels: { app: prometheus } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: prometheus + image: prom/prometheus:v2.53.0 + args: ["--config.file=/etc/prometheus/prometheus.yml","--storage.tsdb.path=/prometheus"] + ports: [{ containerPort: 9090 }] + volumeMounts: + - { name: data, mountPath: /prometheus } + - { name: config, mountPath: /etc/prometheus } + volumes: + - { name: config, configMap: { name: prometheus-config } } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + namespace: monitoring + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth-prometheus + nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["prometheus.betelgeusebytes.io"], secretName: prometheus-tls }] + rules: + - host: prometheus.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: prometheus, port: { number: 9090 } } } + +=== ./k8s/redis/redis-pv.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-redis +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/redis + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +=== ./k8s/redis/redis.yaml === +apiVersion: v1 +kind: Service +metadata: { name: redis, namespace: db } +spec: + ports: [{ port: 6379, targetPort: 6379 }] + selector: { app: redis } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: redis, namespace: db } +spec: + serviceName: redis + replicas: 1 + selector: { matchLabels: { app: redis } } + template: + metadata: { labels: { app: redis } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: redis + image: redis:7 + args: ["--requirepass", "$(REDIS_PASSWORD)"] + env: + - name: REDIS_PASSWORD + valueFrom: { secretKeyRef: { name: redis-auth, key: REDIS_PASSWORD } } + ports: [{ containerPort: 6379 }] + volumeMounts: + - { name: data, mountPath: /data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 10Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: redis-auth, namespace: db } +type: Opaque +stringData: { REDIS_PASSWORD: "RED1S" } + +=== ./k8s/sso/sso.yaml === +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-auth +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/auth + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +# k8s/auth/keycloak/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: keycloak-admin, namespace: db } +type: Opaque +stringData: { KEYCLOAK_ADMIN: "admin", KEYCLOAK_ADMIN_PASSWORD: "admin" } + +--- +# 
k8s/auth/keycloak/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: keycloak-data, namespace: db } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 10Gi } } + +--- +# k8s/auth/keycloak/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: keycloak, namespace: db } +spec: + replicas: 1 + selector: { matchLabels: { app: keycloak } } + template: + metadata: { labels: { app: keycloak } } + spec: + # Ensure the PV is owned by the Keycloak UID/GID + securityContext: + fsGroup: 1000 + initContainers: + - name: fix-permissions + image: busybox + command: ['sh', '-c', 'chown -R 1000:1000 /opt/keycloak/data && chmod -R 755 /opt/keycloak/data'] + volumeMounts: + - name: data + mountPath: /opt/keycloak/data + containers: + - name: keycloak + image: quay.io/keycloak/keycloak:latest + args: ["start","--http-enabled=true","--proxy-headers=xforwarded","--hostname-strict=false"] + env: + - { name: KEYCLOAK_ADMIN, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN } } } + - { name: KEYCLOAK_ADMIN_PASSWORD, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN_PASSWORD } } } + ports: [{ containerPort: 8080 }] + volumeMounts: [{ name: data, mountPath: /opt/keycloak/data }] + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + volumes: + - name: data + persistentVolumeClaim: { claimName: keycloak-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: keycloak, namespace: db } +spec: { selector: { app: keycloak }, ports: [ { port: 80, targetPort: 8080 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: keycloak + namespace: db + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["auth.betelgeusebytes.io"], secretName: keycloak-tls }] + rules: + - host: auth.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: keycloak, port: { number: 80 } } } + +=== ./k8s/storage/persistent-volumes.yaml === +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-postgres +spec: + capacity: + storage: 80Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/postgres + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-elasticsearch +spec: + capacity: + storage: 300Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-gitea +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/gitea + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-jupyter +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: 
/mnt/local-ssd/jupyter + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-kafka +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/kafka + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-data +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-data + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-log +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-log + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-prometheus +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +=== ./k8s/storage/storageclass.yaml === +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: local-ssd-hetzner +provisioner: kubernetes.io/no-provisioner +volumeBindingMode: WaitForFirstConsumer + +=== ./k8s/tei/tei.yaml === +# k8s/ai/tei/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: tei, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: tei } } + template: + metadata: { labels: { app: tei } } + spec: + containers: + - name: tei + image: ghcr.io/huggingface/text-embeddings-inference:cpu-latest + env: [{ name: MODEL_ID, value: "mixedbread-ai/mxbai-embed-large-v1" }] + ports: [{ containerPort: 80 }] +--- +apiVersion: v1 +kind: Service +metadata: { name: tei, namespace: ml } +spec: { selector: { app: tei }, ports: [ { port: 80, targetPort: 80 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tei + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["embeddings.betelgeusebytes.io"], secretName: tei-tls }] + rules: + - host: embeddings.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: tei, port: { number: 80 } } } + +=== ./k8s/trading/ib-gateway.yaml === +apiVersion: v1 +kind: Namespace +metadata: + name: trading + labels: + name: trading + environment: production +--- +# OPTIONAL: Use this if you want to persist IB Gateway settings/logs +# across pod restarts. For most use cases, this is NOT needed since +# IB Gateway is mostly stateless and credentials are in Secrets. 
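+# (Note: local PersistentVolumes are not provisioned dynamically; the host path
+# referenced below, /mnt/local-ssd/ib-gateway, must already exist on the target node.)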
+# +# Only create this PV/PVC if you need to persist: +# - TWS session data +# - Custom workspace layouts +# - Historical API usage logs + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: ib-gateway-data + labels: + type: local + app: ib-gateway +spec: + capacity: + storage: 5Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/ib-gateway # Adjust to your local SSD path + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ib-gateway-data + namespace: trading +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: local-storage + selector: + matchLabels: + app: ib-gateway + +# To use this PVC, add to Deployment volumeMounts: +# - name: data +# mountPath: /root/Jts +# And to volumes: +# - name: data +# persistentVolumeClaim: +# claimName: ib-gateway-data +--- +apiVersion: v1 +kind: Secret +metadata: + name: ib-credentials + namespace: trading +type: Opaque +stringData: + # IMPORTANT: Replace these with your actual IB credentials + # For paper trading, use your paper trading account + username: "saladin85" + password: "3Lcd@05041985" + # Trading mode: "paper" or "live" + trading-mode: "paper" + + # IB Gateway config (jts.ini equivalent) + # This enables headless mode and configures ports + ibgateway.conf: | + [IBGateway] + TradingMode=paper + ApiOnly=true + ReadOnlyApi=false + TrustedIPs=127.0.0.1 + + [IBGatewayAPI] + ApiPortNumber=4002 + + [Logon] + UseRemoteSettings=no + Locale=en + ColorPaletteName=dark + + [Display] + ShowSplashScreen=no +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ib-gateway-config + namespace: trading +data: + # Startup script to configure IB Gateway for headless operation + startup.sh: | + #!/bin/bash + set -e + + echo "Starting IB Gateway in headless mode..." + echo "Trading Mode: ${TRADING_MODE}" + echo "Port: ${TWS_PORT}" + + # Configure based on trading mode + if [ "${TRADING_MODE}" == "live" ]; then + export TWS_PORT=4001 + echo "⚠️ LIVE TRADING MODE - USE WITH CAUTION ⚠️" + else + export TWS_PORT=4002 + echo "📝 Paper Trading Mode (Safe)" + fi + # IMPORTANT: use the env vars provided by the Deployment + export IB_USERNAME="${TWS_USERID}" + export IB_PASSWORD="${TWS_PASSWORD}" + + # Start IB Gateway + exec /opt/ibgateway/ibgateway-latest-standalone-linux-x64.sh \ + --tws-path=/root/Jts \ + --tws-settings-path=/root \ + --user="${IB_USERNAME}" \ + --pw="${IB_PASSWORD}" \ + --mode="${TRADING_MODE}" \ + --port="${TWS_PORT}" + + # Health check script + healthcheck.sh: | + #!/bin/bash + # Check if TWS API port is listening + # PORT=${TWS_PORT:-4002} + # nc -z localhost $PORT + # exit $? 
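+    # NOTE: the replacement check below shells out to `python`; if the gateway image
+    # only ships `python3` (or no Python at all), this probe will always fail.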
+ #!/bin/sh + # Pure-python TCP check (no nc required) + PORT="${TWS_PORT:-4002}" + python - <<'PY' + import os, socket, sys + port = int(os.environ.get("TWS_PORT", os.environ.get("PORT", "4002"))) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(2) + try: + s.connect(("127.0.0.1", port)) + sys.exit(0) + except Exception: + sys.exit(1) + finally: + s.close() + PY +--- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: ib-gateway +# namespace: trading +# labels: +# app: ib-gateway +# component: trading-infrastructure +# spec: +# replicas: 1 # IB Gateway should only have 1 instance per account +# strategy: +# type: Recreate # Avoid multiple simultaneous logins +# selector: +# matchLabels: +# app: ib-gateway +# template: +# metadata: +# labels: +# app: ib-gateway +# annotations: +# prometheus.io/scrape: "false" # No metrics endpoint by default +# spec: +# # Pin to hetzner-2 (matches your existing pattern) +# nodeSelector: +# kubernetes.io/hostname: hetzner-2 + +# # Security context +# securityContext: +# runAsNonRoot: false # IB Gateway requires root for VNC (even if unused) +# fsGroup: 1000 + +# containers: +# - name: ib-gateway +# # Using community-maintained IB Gateway image +# # Alternative: waytrade/ib-gateway:latest +# image: ghcr.io/gnzsnz/ib-gateway:stable +# imagePullPolicy: IfNotPresent + +# env: +# - name: TWS_USERID +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: username +# - name: TWS_PASSWORD +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: password +# - name: TRADING_MODE +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: trading-mode +# - name: TWS_PORT +# value: "4002" # Default to paper trading +# - name: READ_ONLY_API +# value: "no" + +# # Ports +# ports: +# - name: paper-trading +# containerPort: 4002 +# protocol: TCP +# - name: live-trading +# containerPort: 4001 +# protocol: TCP +# - name: vnc +# containerPort: 5900 +# protocol: TCP # VNC (not exposed externally) + +# # Resource limits +# resources: +# requests: +# memory: "1Gi" +# cpu: "500m" +# limits: +# memory: "2Gi" +# cpu: "1000m" + +# # Liveness probe (check if API port is responsive) +# startupProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 60 # Wait 60s before first check +# periodSeconds: 10 # Check every 10s +# timeoutSeconds: 5 +# failureThreshold: 18 # 60s + (10s * 18) = 240s total startup time + +# livenessProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 0 # IB Gateway takes time to start +# periodSeconds: 60 +# timeoutSeconds: 5 +# failureThreshold: 3 + +# # Readiness probe +# readinessProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 0 +# periodSeconds: 10 +# timeoutSeconds: 5 +# failureThreshold: 2 + +# # Volume mounts for config +# volumeMounts: +# - name: ib-config +# mountPath: /root/Jts/jts.ini +# subPath: ibgateway.conf +# - name: startup-script +# mountPath: /startup.sh +# subPath: startup.sh +# - name: data +# mountPath: /root/Jts + +# # Logging to stdout (Fluent Bit will collect) +# # IB Gateway logs go to /root/Jts/log by default +# lifecycle: +# postStart: +# exec: +# command: +# - /bin/sh +# - -c +# - | +# mkdir -p /root/Jts/log +# ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true + +# volumes: +# - name: ib-config +# secret: +# secretName: ib-credentials +# defaultMode: 0644 +# - name: startup-script +# configMap: +# name: ib-gateway-config +# defaultMode: 0755 +# - name: data +# persistentVolumeClaim: +# claimName: ib-gateway-data + +# # Restart policy +# restartPolicy: Always + 
+# # DNS policy for internal cluster resolution +# dnsPolicy: ClusterFirst +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway + component: trading-infrastructure +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ib-gateway + template: + metadata: + labels: + app: ib-gateway + annotations: + prometheus.io/scrape: "false" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + + securityContext: + runAsNonRoot: false + fsGroup: 1000 + + # Seed writable jts.ini into the PVC once + initContainers: + - name: seed-jts-config + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /data + if [ ! -f /data/jts.ini ]; then + echo "Seeding jts.ini into PVC" + cp /config/ibgateway.conf /data/jts.ini + chmod 644 /data/jts.ini + else + echo "jts.ini already exists in PVC" + fi + volumeMounts: + - name: ib-config + mountPath: /config + readOnly: true + - name: data + mountPath: /data + + containers: + # ------------------------------------------------------------------ + # IB Gateway + # ------------------------------------------------------------------ + - name: ib-gateway + image: ghcr.io/gnzsnz/ib-gateway:stable + imagePullPolicy: IfNotPresent + + env: + - name: TWS_USERID + valueFrom: + secretKeyRef: + name: ib-credentials + key: username + - name: TWS_PASSWORD + valueFrom: + secretKeyRef: + name: ib-credentials + key: password + - name: TRADING_MODE + valueFrom: + secretKeyRef: + name: ib-credentials + key: trading-mode + - name: TWS_PORT + value: "4002" + - name: READ_ONLY_API + value: "no" + + ports: + - name: ib-api-local + containerPort: 4002 + protocol: TCP + - name: live-trading + containerPort: 4001 + protocol: TCP + - name: vnc + containerPort: 5900 + protocol: TCP + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + + # IMPORTANT: Probes should check the local IB port (4002) + startupProbe: + tcpSocket: + port: 4002 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + + livenessProbe: + tcpSocket: + port: 4002 + periodSeconds: 60 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + tcpSocket: + port: 4002 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 2 + + volumeMounts: + - name: data + mountPath: /root/Jts + + lifecycle: + postStart: + exec: + command: + - sh + - -c + - | + mkdir -p /root/Jts/log + ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true + + # ------------------------------------------------------------------ + # Sidecar TCP proxy: accepts cluster traffic, forwards to localhost:4002 + # ------------------------------------------------------------------ + - name: ib-api-proxy + image: alpine/socat:1.8.0.0 + imagePullPolicy: IfNotPresent + args: + - "-d" + - "-d" + - "TCP-LISTEN:4003,fork,reuseaddr" + - "TCP:127.0.0.1:4002" + ports: + - name: ib-api + containerPort: 4003 + protocol: TCP + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "128Mi" + cpu: "100m" + # basic probe: is proxy listening + readinessProbe: + tcpSocket: + port: 4003 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + + volumes: + - name: ib-config + secret: + secretName: ib-credentials + defaultMode: 0644 + + - name: data + persistentVolumeClaim: + claimName: ib-gateway-data + + restartPolicy: Always + dnsPolicy: ClusterFirst + + +--- +# apiVersion: v1 +# kind: Service +# metadata: +# name: ib-gateway +# namespace: trading +# labels: +# app: 
ib-gateway +# spec: +# type: ClusterIP # Internal-only, not exposed publicly +# clusterIP: None # Headless service (optional, remove if you want a stable ClusterIP) +# selector: +# app: ib-gateway +# ports: +# - name: paper-trading +# port: 4002 +# targetPort: 4002 +# protocol: TCP +# - name: live-trading +# port: 4001 +# targetPort: 4001 +# protocol: TCP +# sessionAffinity: ClientIP # Stick to same pod (important for stateful TWS sessions) +# sessionAffinityConfig: +# clientIP: +# timeoutSeconds: 3600 # 1 hour session stickiness + +apiVersion: v1 +kind: Service +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway +spec: + type: ClusterIP + selector: + app: ib-gateway + ports: + - name: paper-trading + port: 4002 + targetPort: 4003 # <-- proxy sidecar, not the gateway directly + protocol: TCP + - name: live-trading + port: 4001 + targetPort: 4001 + protocol: TCP + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 + +=== ./k8s/trading/ib-gateway2.yaml === +apiVersion: v1 +kind: Namespace +metadata: + name: trading + labels: + name: trading + environment: production +--- +apiVersion: v1 +kind: Secret +metadata: + name: ib-credentials + namespace: trading +type: Opaque +stringData: + # Rotate your creds (you pasted them earlier). + username: "saladin85" + password: "3Lcd@05041985" + trading-mode: "paper" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway + component: trading-infrastructure +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ib-gateway + template: + metadata: + labels: + app: ib-gateway + annotations: + prometheus.io/scrape: "false" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + + # Keep your original security context + securityContext: + runAsNonRoot: false + fsGroup: 1000 + + containers: + - name: ib-gateway + image: ghcr.io/gnzsnz/ib-gateway:stable + imagePullPolicy: IfNotPresent + + # IMPORTANT: use env vars this image expects + env: + - name: TWS_USERID + valueFrom: + secretKeyRef: + name: ib-credentials + key: username + - name: TWS_PASSWORD + valueFrom: + secretKeyRef: + name: ib-credentials + key: password + - name: TRADING_MODE + valueFrom: + secretKeyRef: + name: ib-credentials + key: trading-mode + - name: READ_ONLY_API + value: "no" + + # These two match what your log shows the image uses + - name: API_PORT + value: "4002" + - name: SOCAT_PORT + value: "4004" + + # optional but nice + - name: TIME_ZONE + value: "Etc/UTC" + - name: TWOFA_TIMEOUT_ACTION + value: "exit" + + ports: + # IB API ports (inside container / localhost use) + - name: api-paper + containerPort: 4002 + protocol: TCP + - name: api-live + containerPort: 4001 + protocol: TCP + + # socat relay port for non-localhost clients (what we expose via Service) + - name: api-socat + containerPort: 4004 + protocol: TCP + + # optional UI/VNC + - name: vnc + containerPort: 5900 + protocol: TCP + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + + # Probe the socat port (represents remote connectivity) + startupProbe: + tcpSocket: + port: 4004 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + + readinessProbe: + tcpSocket: + port: 4004 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 2 + + livenessProbe: + tcpSocket: + port: 4004 + periodSeconds: 60 + timeoutSeconds: 5 + failureThreshold: 3 + + restartPolicy: Always + dnsPolicy: 
ClusterFirst +--- +apiVersion: v1 +kind: Service +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway +spec: + type: ClusterIP + selector: + app: ib-gateway + ports: + # Clients connect to 4002, but we forward to SOCAT_PORT=4004 + - name: paper-trading + port: 4002 + targetPort: 4004 + protocol: TCP + + # If you truly need live, you should relay live via another socat port too. + # For now keep it direct (or remove it entirely for safety). + - name: live-trading + port: 4001 + targetPort: 4001 + protocol: TCP + + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 + +=== ./k8s/vector/qdrant.yaml === +# k8s/vec/qdrant/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: qdrant-data, namespace: db} +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } + +--- +# k8s/vec/qdrant/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: qdrant, namespace: db} +spec: + replicas: 1 + selector: { matchLabels: { app: qdrant } } + template: + metadata: { labels: { app: qdrant } } + spec: + containers: + - name: qdrant + image: qdrant/qdrant:latest + ports: + - { containerPort: 6333 } # HTTP + Web UI + - { containerPort: 6334 } # gRPC + volumeMounts: + - { name: data, mountPath: /qdrant/storage } + volumes: + - name: data + persistentVolumeClaim: { claimName: qdrant-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: qdrant, namespace: db} +spec: + selector: { app: qdrant } + ports: + - { name: http, port: 80, targetPort: 6333 } + - { name: grpc, port: 6334, targetPort: 6334 } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: qdrant + namespace: db + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["vector.betelgeusebytes.io"], secretName: qdrant-tls }] + rules: + - host: vector.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: qdrant, port: { number: 80 } } } +--- +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-qdrant +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/qdrant + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +=== ./k8s/vllm/vllm.yaml === +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-vllm +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/vllm + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +# k8s/ai/vllm/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: vllm-auth, namespace: ml } +type: Opaque +stringData: { API_KEY: "replace_me" } + +--- +# k8s/ai/ollama/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: ollama, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: ollama } } + template: + metadata: { labels: { app: ollama } } + spec: + securityContext: + runAsUser: 0 # needed so the init can write into /root/.ollama + initContainers: + - name: warm-models + image: ollama/ollama:latest + command: ["/bin/sh","-c"] + args: + - | + ollama serve & # start a 
temp daemon + sleep 2 + # pull one or more small, quantized models for CPU + ollama pull qwen2.5:3b-instruct-q4_K_M || true + ollama pull llama3.2:3b-instruct-q4_K_M || true + pkill ollama || true + volumeMounts: + - { name: data, mountPath: /root/.ollama } + containers: + - name: ollama + image: ollama/ollama:latest + env: + - { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser + ports: + - { containerPort: 11434 } + volumeMounts: + - { name: data, mountPath: /root/.ollama } + resources: + requests: { cpu: "2", memory: "4Gi" } + limits: { cpu: "4", memory: "8Gi" } + volumes: + - name: data + persistentVolumeClaim: { claimName: ollama-data } + +--- +# k8s/ai/ollama/svc-ing.yaml +apiVersion: v1 +kind: Service +metadata: { name: ollama, namespace: ml } +spec: + selector: { app: ollama } + ports: [ { name: http, port: 80, targetPort: 11434 } ] + +# --- +# # old k8s/ai/vllm/deploy.yaml +# apiVersion: apps/v1 +# kind: Deployment +# metadata: { name: vllm, namespace: ml } +# spec: +# replicas: 1 +# selector: { matchLabels: { app: vllm } } +# template: +# metadata: { labels: { app: vllm } } +# spec: +# containers: +# - name: vllm +# image: vllm/vllm-openai:latest +# args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"] +# env: +# - name: VLLM_API_KEY +# valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } } +# ports: [{ containerPort: 8000 }] +# resources: +# limits: +# nvidia.com/gpu: 1 +# requests: +# nvidia.com/gpu: 1 +# volumeMounts: +# - { name: cache, mountPath: /root/.cache/huggingface } +# volumes: +# - name: cache +# persistentVolumeClaim: { claimName: vllm-cache-pvc } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: ollama-data, namespace: ml } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +# --- +#old k8s/ai/vllm/svc-ing.yaml +# apiVersion: v1 +# kind: Service +# metadata: { name: vllm, namespace: ml } +# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vllm + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }] + rules: + - host: llm.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: vllm, port: { number: 80 } } } + diff --git a/combine.sh b/combine.sh new file mode 100644 index 0000000..32a410f --- /dev/null +++ b/combine.sh @@ -0,0 +1,5 @@ +find . 
-type f -name "*.txt" -o -name "*.py" -o -name "*.yml" -o -name "*.yaml" -o -name "*.YAML" -o -name "*.ini" | while read file; do + echo "=== $file ===" >> betelgeusebytes.txt + cat "$file" >> betelgeusebytes.txt + echo "" >> betelgeusebytes.txt +done \ No newline at end of file diff --git a/k8s/observability-stack/04-loki-config.yaml b/k8s/observability-stack/04-loki-config.yaml index 3885de8..e9c435f 100644 --- a/k8s/observability-stack/04-loki-config.yaml +++ b/k8s/observability-stack/04-loki-config.yaml @@ -43,12 +43,9 @@ data: compactor: working_directory: /loki/compactor compaction_interval: 10m - retention_enabled: true - retention_delete_delay: 2h - retention_delete_worker_count: 150 + retention_enabled: false limits_config: - enforce_metric_name: false reject_old_samples: true reject_old_samples_max_age: 168h # 7 days retention_period: 168h # 7 days @@ -91,4 +88,4 @@ data: replay_memory_ceiling: 1GB analytics: - reporting_enabled: false + reporting_enabled: false \ No newline at end of file diff --git a/k8s/observability-stack/05-tempo-config.yaml b/k8s/observability-stack/05-tempo-config.yaml index 4f606ce..6eb5fb4 100644 --- a/k8s/observability-stack/05-tempo-config.yaml +++ b/k8s/observability-stack/05-tempo-config.yaml @@ -39,7 +39,7 @@ data: source: tempo cluster: betelgeuse-k8s storage: - path: /tmp/tempo/generator/wal + path: /var/tempo/generator/wal remote_write: - url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write send_exemplars: true @@ -48,17 +48,14 @@ data: trace: backend: local wal: - path: /tmp/tempo/wal + path: /var/tempo/wal local: - path: /tmp/tempo/blocks + path: /var/tempo/blocks pool: max_workers: 100 queue_depth: 10000 - querier: - frontend_worker: - frontend_address: tempo.observability.svc.cluster.local:9095 - + # Single instance mode - no need for frontend/querier split query_frontend: search: duration_slo: 5s @@ -69,4 +66,4 @@ data: overrides: defaults: metrics_generator: - processors: [service-graphs, span-metrics] + processors: [service-graphs, span-metrics] \ No newline at end of file diff --git a/k8s/observability-stack/06-alloy-config.yaml b/k8s/observability-stack/06-alloy-config.yaml index 2f159de..3aa3082 100644 --- a/k8s/observability-stack/06-alloy-config.yaml +++ b/k8s/observability-stack/06-alloy-config.yaml @@ -124,7 +124,6 @@ data: output { traces = [otelcol.exporter.otlp.tempo.input] - metrics = [otelcol.exporter.prometheus.metrics.input] } } @@ -138,22 +137,7 @@ data: } } - // Export OTLP metrics to Prometheus - otelcol.exporter.prometheus "metrics" { - forward_to = [prometheus.remote_write.local.receiver] - } - - // Remote write to Prometheus - prometheus.remote_write "local" { - endpoint { - url = "http://prometheus.observability.svc.cluster.local:9090/api/v1/write" - } - } - // Scrape local metrics (Alloy's own metrics) - prometheus.scrape "alloy" { - targets = [{ - __address__ = "localhost:12345", - }] - forward_to = [prometheus.remote_write.local.receiver] - } + // Prometheus will scrape these via service discovery + prometheus.exporter.self "alloy" { + } \ No newline at end of file diff --git a/k8s/observability-stack/08-rbac.yaml b/k8s/observability-stack/08-rbac.yaml index dca5627..49d3ba1 100644 --- a/k8s/observability-stack/08-rbac.yaml +++ b/k8s/observability-stack/08-rbac.yaml @@ -66,6 +66,7 @@ rules: - services - endpoints - pods + - pods/log verbs: ["get", "list", "watch"] - apiGroups: - extensions @@ -175,4 +176,4 @@ roleRef: subjects: - kind: ServiceAccount name: kube-state-metrics - namespace: 
observability + namespace: observability \ No newline at end of file diff --git a/k8s/observability-stack/12-tempo.yaml b/k8s/observability-stack/12-tempo.yaml index 3fff6d6..827ca66 100644 --- a/k8s/observability-stack/12-tempo.yaml +++ b/k8s/observability-stack/12-tempo.yaml @@ -21,6 +21,11 @@ spec: spec: nodeSelector: kubernetes.io/hostname: hetzner-2 + securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 containers: - name: tempo image: grafana/tempo:2.6.1 @@ -70,7 +75,7 @@ spec: - name: tempo-config mountPath: /etc/tempo - name: tempo-data - mountPath: /tmp/tempo + mountPath: /var/tempo volumes: - name: tempo-config configMap: @@ -115,4 +120,4 @@ spec: protocol: TCP name: zipkin selector: - app: tempo + app: tempo \ No newline at end of file diff --git a/k8s/observability-stack/README.md b/k8s/observability-stack/README_old.md similarity index 100% rename from k8s/observability-stack/README.md rename to k8s/observability-stack/README_old.md diff --git a/k8s/observability-stack/me.md b/k8s/observability-stack/me.md new file mode 100644 index 0000000..c4ecec2 --- /dev/null +++ b/k8s/observability-stack/me.md @@ -0,0 +1,388 @@ +# 🧠 BetelgeuseBytes — Full AI Infrastructure Stack +## Authoritative README, Architecture & Onboarding Guide + +This repository documents the **entire self-hosted AI infrastructure stack** running on a Kubernetes cluster hosted on **Hetzner dedicated servers**. + +The stack currently powers an **Islamic Hadith Scholar AI**, but it is intentionally designed as a **general-purpose, sovereign AI, MLOps, and data platform** that can support many future projects. + +This document is the **single source of truth** for: +- architecture (logical & physical) +- infrastructure configuration +- networking & DNS +- every deployed component +- why each component exists +- how to build new systems on top of the platform + +--- + +## 1. Mission & Design Philosophy + +### Current Mission +Build an AI system that can: + +- Parse classical Islamic texts +- Extract **Sanad** (chains of narrators) and **Matn** (hadith text) +- Identify narrators and their relationships: + - teacher / student + - familial lineage +- Construct a **verifiable knowledge graph** +- Support **human scholarly review** +- Provide **transparent and explainable reasoning** +- Operate **fully on-prem**, CPU-first, without SaaS or GPU dependency + +### Core Principles +- **Sovereignty** — no external cloud lock-in +- **Explainability** — graph + provenance, not black boxes +- **Human-in-the-loop** — scholars remain in control +- **Observability-first** — everything is measurable and traceable +- **Composable** — every part can be reused or replaced + +--- + +## 2. Physical Infrastructure (Hetzner) + +### Nodes +- **Provider:** Hetzner +- **Type:** Dedicated servers +- **Architecture:** x86_64 +- **GPU:** None (CPU-only by design) +- **Storage:** Local NVMe / SSD + +### Node Roles (Logical) +| Node Type | Responsibilities | +|---------|------------------| +| Control / Worker | Kubernetes control plane + workloads | +| Storage-heavy | Databases, MinIO, observability data | +| Compute-heavy | LLM inference, embeddings, pipelines | + +> The cluster is intentionally **single-region and on-prem-like**, optimized for predictability and data locality. + +--- + +## 3. 
Kubernetes Infrastructure Configuration + +### Kubernetes +- Runtime for **all services** +- Namespaced isolation +- Explicit PersistentVolumeClaims +- Declarative configuration (GitOps) + +### Namespaces (Conceptual) +| Namespace | Purpose | +|--------|--------| +| `ai` | LLMs, embeddings, labeling | +| `vec` | Vector database | +| `graph` | Knowledge graph | +| `db` | Relational databases | +| `storage` | Object storage | +| `mlops` | MLflow | +| `ml` | Argo Workflows | +| `auth` | Keycloak | +| `observability` | LGTM stack | +| `hadith` | Custom apps (orchestrator, UI) | + +--- + +## 4. Networking & DNS + +### Ingress +- **NGINX Ingress Controller** +- HTTPS termination at ingress +- Internal services communicate via ClusterIP + +### TLS +- **cert-manager** +- Let’s Encrypt +- Automatic renewal + +### Public Endpoints + +| URL | Service | +|----|--------| +| https://llm.betelgeusebytes.io | LLM inference (Ollama / llama.cpp) | +| https://embeddings.betelgeusebytes.io | Text Embeddings Inference | +| https://vector.betelgeusebytes.io | Qdrant + UI | +| https://neo4j.betelgeusebytes.io | Neo4j Browser | +| https://hadith-api.betelgeusebytes.io | FastAPI Orchestrator | +| https://hadith-admin.betelgeusebytes.io | Admin / Curation UI | +| https://label.betelgeusebytes.io | Label Studio | +| https://mlflow.betelgeusebytes.io | MLflow | +| https://minio.betelgeusebytes.io | MinIO Console | +| https://argo.betelgeusebytes.io | Argo Workflows | +| https://auth.betelgeusebytes.io | Keycloak | +| https://grafana.betelgeusebytes.io | Grafana | + +--- + +## 5. Full Logical Architecture + +```mermaid +flowchart LR + User --> AdminUI --> Orchestrator + + Orchestrator --> LLM + Orchestrator --> TEI --> Qdrant + Orchestrator --> Neo4j + Orchestrator --> PostgreSQL + Orchestrator --> Redis + + LabelStudio --> MinIO + MinIO --> ArgoWF --> MLflow + MLflow --> Models --> Orchestrator + + Kafka --> ArgoWF + + Alloy --> Prometheus --> Grafana + Alloy --> Loki --> Grafana + Alloy --> Tempo --> Grafana +``` +6. AI & Reasoning Layer +Ollama / llama.cpp (CPU LLM) +Current usage + +JSON-structured extraction + +Sanad / matn reasoning + +Deterministic outputs + +No GPU dependency + +Future usage + +Offline assistants + +Document intelligence + +Agent frameworks + +Replaceable by vLLM when GPUs are added + +Text Embeddings Inference (TEI) +Current usage + +Embeddings for hadith texts and biographies + +Future usage + +RAG systems + +Semantic search + +Deduplication + +Similarity clustering + +Qdrant (Vector Database) +Current usage + +Stores embeddings + +Similarity search + +Future usage + +Recommendation systems + +Agent memory + +Multimodal retrieval + +Includes Web UI. + +7. Knowledge & Data Layer +Neo4j (Graph Database) +Current usage + +Isnād chains + +Narrator relationships + +Future usage + +Knowledge graphs + +Trust networks + +Provenance systems + +PostgreSQL +Current usage + +App data + +MLflow backend + +Label Studio DB + +Future usage + +Feature stores + +Metadata catalogs + +Transactional apps + +Redis +Current usage + +Caching + +Temporary state + +Future usage + +Job queues + +Rate limiting + +Sessions + +Kafka +Current usage + +Optional async backbone + +Future usage + +Streaming ingestion + +Event-driven ML + +Audit pipelines + +MinIO (S3) +Current usage + +Datasets + +Model artifacts + +Pipeline outputs + +Future usage + +Data lake + +Backups + +Feature storage + +8. 
MLOps & Human-in-the-Loop +Label Studio +Current usage + +Human annotation of narrators & relations + +Future usage + +Any labeling task (text, image, audio) + +MLflow +Current usage + +Experiment tracking + +Model registry + +Future usage + +Governance + +Model promotion + +Auditing + +Argo Workflows +Current usage + +ETL & training pipelines + +Future usage + +Batch inference + +Scheduled automation + +Data engineering + +9. Authentication & Security +Keycloak +Current usage + +SSO for Admin UI, MLflow, Label Studio + +Future usage + +API authentication + +Multi-tenant access + +Organization-wide IAM + +10. Observability Stack (LGTM) +Components +Grafana + +Prometheus + +Loki + +Tempo + +Grafana Alloy + +kube-state-metrics + +node-exporter + +Capabilities +Metrics, logs, traces + +Automatic correlation + +OTLP-native + +Local SSD persistence + +11. Design Rules for All Custom Services +All services must: + +be stateless + +use env vars & Kubernetes Secrets + +authenticate via Keycloak + +emit: + +Prometheus metrics + +OTLP traces + +structured JSON logs + +be deployable via kubectl & Argo CD + +12. Future Use Cases (Beyond Hadith) +This platform can support: + +General Knowledge Graph AI + +Legal / scholarly document analysis + +Enterprise RAG systems + +Research data platforms + +Explainable AI systems + +Internal search engines + +Agent-based systems + +Provenance & trust scoring engines + +Digital humanities projects + +Offline sovereign AI deployments \ No newline at end of file diff --git a/k8s/observability-stack/test-loki-logs.sh b/k8s/observability-stack/test-loki-logs.sh new file mode 100644 index 0000000..48c719b --- /dev/null +++ b/k8s/observability-stack/test-loki-logs.sh @@ -0,0 +1,158 @@ +#!/bin/bash + +set -e + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE} Loki Log Collection Test${NC}" +echo -e "${BLUE}========================================${NC}" +echo "" + +PASS=0 +FAIL=0 + +# Test 1: Check Alloy DaemonSet +echo -e "${YELLOW}Test 1: Checking Alloy DaemonSet...${NC}" +if kubectl get pods -n observability -l app=alloy --no-headers 2>/dev/null | grep -q "Running"; then + ALLOY_COUNT=$(kubectl get pods -n observability -l app=alloy --no-headers | grep -c "Running") + echo -e "${GREEN}✓ Alloy is running ($ALLOY_COUNT pod(s))${NC}" + PASS=$((PASS+1)) +else + echo -e "${RED}✗ Alloy is not running${NC}" + FAIL=$((FAIL+1)) +fi +echo "" + +# Test 2: Check Loki pod +echo -e "${YELLOW}Test 2: Checking Loki pod...${NC}" +if kubectl get pods -n observability -l app=loki --no-headers 2>/dev/null | grep -q "Running"; then + echo -e "${GREEN}✓ Loki is running${NC}" + PASS=$((PASS+1)) +else + echo -e "${RED}✗ Loki is not running${NC}" + FAIL=$((FAIL+1)) +fi +echo "" + +# Test 3: Test Loki readiness endpoint +echo -e "${YELLOW}Test 3: Testing Loki readiness endpoint...${NC}" +READY=$(kubectl run test-loki-ready-$RANDOM --rm -i --restart=Never --image=curlimages/curl:latest -- \ + curl -s -m 5 http://loki.observability.svc.cluster.local:3100/ready 2>/dev/null || echo "failed") + +if [ "$READY" = "ready" ]; then + echo -e "${GREEN}✓ Loki is ready${NC}" + PASS=$((PASS+1)) +else + echo -e "${RED}✗ Loki is not ready (response: $READY)${NC}" + FAIL=$((FAIL+1)) +fi +echo "" + +# Test 4: Check Alloy can connect to Loki +echo -e "${YELLOW}Test 4: Checking Alloy → Loki connectivity...${NC}" +ALLOY_ERRORS=$(kubectl logs -n observability -l app=alloy --tail=50 2>/dev/null 
| grep -i "error.*loki" | wc -l) +if [ "$ALLOY_ERRORS" -eq 0 ]; then + echo -e "${GREEN}✓ No Alloy → Loki connection errors${NC}" + PASS=$((PASS+1)) +else + echo -e "${RED}✗ Found $ALLOY_ERRORS error(s) in Alloy logs${NC}" + kubectl logs -n observability -l app=alloy --tail=20 | grep -i error + FAIL=$((FAIL+1)) +fi +echo "" + +# Test 5: Create test pod and verify logs +echo -e "${YELLOW}Test 5: Creating test pod and verifying log collection...${NC}" + +# Clean up any existing test pod +kubectl delete pod test-logger-verify --ignore-not-found 2>/dev/null + +# Create test pod +echo " Creating test pod that logs every second..." +kubectl run test-logger-verify --image=busybox --restart=Never -- sh -c \ + 'for i in 1 2 3 4 5 6 7 8 9 10; do echo "LOKI-TEST-LOG: Message number $i at $(date)"; sleep 1; done' \ + >/dev/null 2>&1 + +# Wait for pod to start and generate logs +echo " Waiting 15 seconds for logs to be collected..." +sleep 15 + +# Query Loki API for test logs +echo " Querying Loki for test logs..." +START_TIME=$(date -u -d '2 minutes ago' +%s)000000000 +END_TIME=$(date -u +%s)000000000 + +QUERY_RESULT=$(kubectl run test-loki-query-$RANDOM --rm -i --restart=Never --image=curlimages/curl:latest -- \ + curl -s -m 10 "http://loki.observability.svc.cluster.local:3100/loki/api/v1/query_range" \ + --data-urlencode 'query={pod="test-logger-verify"}' \ + --data-urlencode "start=$START_TIME" \ + --data-urlencode "end=$END_TIME" 2>/dev/null || echo "failed") + +if echo "$QUERY_RESULT" | grep -q "LOKI-TEST-LOG"; then + LOG_COUNT=$(echo "$QUERY_RESULT" | grep -o "LOKI-TEST-LOG" | wc -l) + echo -e "${GREEN}✓ Found $LOG_COUNT test log messages in Loki${NC}" + PASS=$((PASS+1)) +else + echo -e "${RED}✗ Test logs not found in Loki${NC}" + echo " Response: ${QUERY_RESULT:0:200}" + FAIL=$((FAIL+1)) +fi + +# Clean up test pod +kubectl delete pod test-logger-verify --ignore-not-found >/dev/null 2>&1 + +echo "" + +# Test 6: Check observability namespace logs +echo -e "${YELLOW}Test 6: Checking for observability namespace logs...${NC}" + +OBS_QUERY=$(kubectl run test-loki-obs-$RANDOM --rm -i --restart=Never --image=curlimages/curl:latest -- \ + curl -s -m 10 "http://loki.observability.svc.cluster.local:3100/loki/api/v1/query_range" \ + --data-urlencode 'query={namespace="observability"}' \ + --data-urlencode "start=$START_TIME" \ + --data-urlencode "end=$END_TIME" \ + --data-urlencode "limit=10" 2>/dev/null || echo "failed") + +if echo "$OBS_QUERY" | grep -q '"values":\[\['; then + echo -e "${GREEN}✓ Observability namespace logs found in Loki${NC}" + PASS=$((PASS+1)) +else + echo -e "${RED}✗ No logs found for observability namespace${NC}" + FAIL=$((FAIL+1)) +fi + +echo "" +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE} Test Results${NC}" +echo -e "${BLUE}========================================${NC}" +echo "" + +TOTAL=$((PASS+FAIL)) +echo -e "Passed: ${GREEN}$PASS${NC} / $TOTAL" +echo -e "Failed: ${RED}$FAIL${NC} / $TOTAL" +echo "" + +if [ $FAIL -eq 0 ]; then + echo -e "${GREEN}✓✓✓ All tests passed! Logs are flowing to Loki! ✓✓✓${NC}" + echo "" + echo "Next steps:" + echo " 1. Open Grafana: https://grafana.betelgeusebytes.io" + echo " 2. Go to Explore → Loki" + echo " 3. Query: {namespace=\"observability\"}" + echo "" +else + echo -e "${RED}✗✗✗ Some tests failed. Check the output above for details. 
✗✗✗${NC}" + echo "" + echo "Troubleshooting:" + echo " - Check Alloy logs: kubectl logs -n observability -l app=alloy" + echo " - Check Loki logs: kubectl logs -n observability loki-0" + echo " - Verify services: kubectl get svc -n observability" + echo " - See full guide: VERIFY-LOKI-LOGS.md" + echo "" + exit 1 +fi \ No newline at end of file