From dfdd36db3ff8ab3f1a23d6e0d0234284c566e910 Mon Sep 17 00:00:00 2001 From: salahangal Date: Sun, 25 Jan 2026 21:15:43 +0100 Subject: [PATCH] adding betelgeusebytes.io deops part --- .DS_Store | Bin 0 -> 6148 bytes CLAUDE.md | 177 ++++++ DNS_RECORDS.txt | 10 + README.md | 43 ++ ansible/inventories/prod/group_vars/all.yml | 13 + ansible/inventories/prod/hosts.ini | 19 + ansible/playbooks/add-control-planes.yml | 19 + ansible/playbooks/site.yml | 31 + ansible/roles/cert_manager/tasks/main.yml | 66 ++ ansible/roles/cilium/tasks/main.yml | 9 + ansible/roles/common/tasks/main.yml | 31 + ansible/roles/containerd/tasks/main.yml | 27 + ansible/roles/ingress/tasks/main.yml | 2 + .../roles/kubeadm_cp_discovery/tasks/main.yml | 24 + ansible/roles/kubeadm_init/tasks/main.yml | 24 + .../templates/kubeadm-config.yaml.j2 | 14 + ansible/roles/kubeadm_join/tasks/main.yml | 2 + ansible/roles/kubeadm_join_cp/tasks/main.yml | 9 + ansible/roles/kubernetes/tasks/main.yml | 17 + ansible/roles/labels/tasks/main.yml | 4 + .../roles/storage_local_path/tasks/main.yml | 55 ++ .../templates/local-ssd-pv.yaml | 65 ++ ansible/roles/wireguard/tasks/main.yml | 62 ++ ansible/roles/wireguard/templates/wg0.conf.j2 | 12 + ansible/roles/wireguard/vars/main.yml | 6 + k8s/.DS_Store | Bin 0 -> 10244 bytes k8s/00-namespaces.yaml | 31 + k8s/01-secrets/basic-auth.yaml | 38 ++ k8s/argoflow/argo.yaml | 146 +++++ k8s/automation/n8n.yaml | 217 +++++++ k8s/cert-manager/cluster-issuer.yaml | 10 + k8s/elastic/elastic-pv.yaml | 21 + k8s/elastic/elasticsearch.yaml | 38 ++ k8s/elastic/kibana.yaml | 44 ++ k8s/gitea/gitea-pv.yaml | 21 + k8s/gitea/gitea.yaml | 54 ++ k8s/grafana/grafana.yaml | 45 ++ k8s/ingress-patch/kustomization.yaml | 49 ++ k8s/jupyter/jupyter.yaml | 68 +++ k8s/kafka/kafka-pv.yaml | 65 ++ k8s/kafka/kafka-ui.yaml | 44 ++ k8s/kafka/kafka.yaml | 45 ++ k8s/label_studio/label.yaml | 74 +++ k8s/minio/minio.yaml | 96 +++ k8s/mlflow/mlflow.yaml | 64 ++ k8s/neo4j/neo4j-pv.yaml | 21 + k8s/neo4j/neo4j.yaml | 107 ++++ k8s/observability-stack/00-namespace.yaml | 7 + .../01-persistent-volumes.yaml | 95 +++ .../02-persistent-volume-claims.yaml | 55 ++ .../03-prometheus-config.yaml | 169 ++++++ k8s/observability-stack/04-loki-config.yaml | 94 +++ k8s/observability-stack/05-tempo-config.yaml | 72 +++ k8s/observability-stack/06-alloy-config.yaml | 159 +++++ .../07-grafana-datasources.yaml | 62 ++ k8s/observability-stack/08-rbac.yaml | 178 ++++++ k8s/observability-stack/10-prometheus.yaml | 90 +++ k8s/observability-stack/11-loki.yaml | 96 +++ k8s/observability-stack/12-tempo.yaml | 118 ++++ k8s/observability-stack/13-grafana.yaml | 97 +++ k8s/observability-stack/14-alloy.yaml | 107 ++++ .../15-kube-state-metrics.yaml | 71 +++ k8s/observability-stack/16-node-exporter.yaml | 85 +++ .../20-grafana-ingress.yaml | 26 + .../21-optional-ingresses.yaml | 90 +++ .../DEPLOYMENT-CHECKLIST.md | 359 +++++++++++ k8s/observability-stack/DNS-SETUP.md | 146 +++++ k8s/observability-stack/MONITORING-GUIDE.md | 572 ++++++++++++++++++ k8s/observability-stack/QUICKREF.md | 398 ++++++++++++ k8s/observability-stack/README.md | 385 ++++++++++++ k8s/observability-stack/cleanup.sh | 62 ++ k8s/observability-stack/demo-app.yaml | 253 ++++++++ k8s/observability-stack/deploy.sh | 114 ++++ .../remove-old-monitoring.sh | 319 ++++++++++ k8s/observability-stack/status.sh | 115 ++++ k8s/observability/fluent-bit.yaml | 46 ++ k8s/otlp/otel-collector.yaml | 73 +++ k8s/postgres/.DS_Store | Bin 0 -> 6148 bytes k8s/postgres/pg.yaml | 217 +++++++ k8s/postgres/postgres-ha.yaml | 275 
+++++++++ k8s/postgres/postgres.yaml | 122 ++++ k8s/postgres/secret.yaml | 7 + k8s/prometheus/prometheus-config.yaml | 13 + k8s/prometheus/prometheus.yaml | 55 ++ k8s/redis/redis-pv.yaml | 21 + k8s/redis/redis.yaml | 40 ++ k8s/scripts/cleanup.sh | 319 ++++++++++ k8s/sso/sso.yaml | 98 +++ k8s/storage/persistent-volumes.yaml | 175 ++++++ k8s/storage/storageclass.yaml | 6 + k8s/tei/tei.yaml | 37 ++ k8s/trading/ib-gateway.yaml | 541 +++++++++++++++++ k8s/trading/ib-gateway2.yaml | 169 ++++++ k8s/vector/qdrant.yaml | 80 +++ k8s/vllm/vllm.yaml | 142 +++++ 95 files changed, 8869 insertions(+) create mode 100644 .DS_Store create mode 100644 CLAUDE.md create mode 100644 DNS_RECORDS.txt create mode 100644 README.md create mode 100644 ansible/inventories/prod/group_vars/all.yml create mode 100644 ansible/inventories/prod/hosts.ini create mode 100644 ansible/playbooks/add-control-planes.yml create mode 100644 ansible/playbooks/site.yml create mode 100644 ansible/roles/cert_manager/tasks/main.yml create mode 100644 ansible/roles/cilium/tasks/main.yml create mode 100644 ansible/roles/common/tasks/main.yml create mode 100644 ansible/roles/containerd/tasks/main.yml create mode 100644 ansible/roles/ingress/tasks/main.yml create mode 100644 ansible/roles/kubeadm_cp_discovery/tasks/main.yml create mode 100644 ansible/roles/kubeadm_init/tasks/main.yml create mode 100644 ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 create mode 100644 ansible/roles/kubeadm_join/tasks/main.yml create mode 100644 ansible/roles/kubeadm_join_cp/tasks/main.yml create mode 100644 ansible/roles/kubernetes/tasks/main.yml create mode 100644 ansible/roles/labels/tasks/main.yml create mode 100644 ansible/roles/storage_local_path/tasks/main.yml create mode 100644 ansible/roles/storage_local_path/templates/local-ssd-pv.yaml create mode 100644 ansible/roles/wireguard/tasks/main.yml create mode 100644 ansible/roles/wireguard/templates/wg0.conf.j2 create mode 100644 ansible/roles/wireguard/vars/main.yml create mode 100644 k8s/.DS_Store create mode 100644 k8s/00-namespaces.yaml create mode 100644 k8s/01-secrets/basic-auth.yaml create mode 100644 k8s/argoflow/argo.yaml create mode 100644 k8s/automation/n8n.yaml create mode 100644 k8s/cert-manager/cluster-issuer.yaml create mode 100644 k8s/elastic/elastic-pv.yaml create mode 100644 k8s/elastic/elasticsearch.yaml create mode 100644 k8s/elastic/kibana.yaml create mode 100644 k8s/gitea/gitea-pv.yaml create mode 100644 k8s/gitea/gitea.yaml create mode 100644 k8s/grafana/grafana.yaml create mode 100644 k8s/ingress-patch/kustomization.yaml create mode 100644 k8s/jupyter/jupyter.yaml create mode 100644 k8s/kafka/kafka-pv.yaml create mode 100644 k8s/kafka/kafka-ui.yaml create mode 100644 k8s/kafka/kafka.yaml create mode 100644 k8s/label_studio/label.yaml create mode 100644 k8s/minio/minio.yaml create mode 100644 k8s/mlflow/mlflow.yaml create mode 100644 k8s/neo4j/neo4j-pv.yaml create mode 100644 k8s/neo4j/neo4j.yaml create mode 100644 k8s/observability-stack/00-namespace.yaml create mode 100644 k8s/observability-stack/01-persistent-volumes.yaml create mode 100644 k8s/observability-stack/02-persistent-volume-claims.yaml create mode 100644 k8s/observability-stack/03-prometheus-config.yaml create mode 100644 k8s/observability-stack/04-loki-config.yaml create mode 100644 k8s/observability-stack/05-tempo-config.yaml create mode 100644 k8s/observability-stack/06-alloy-config.yaml create mode 100644 k8s/observability-stack/07-grafana-datasources.yaml create mode 100644 
k8s/observability-stack/08-rbac.yaml create mode 100644 k8s/observability-stack/10-prometheus.yaml create mode 100644 k8s/observability-stack/11-loki.yaml create mode 100644 k8s/observability-stack/12-tempo.yaml create mode 100644 k8s/observability-stack/13-grafana.yaml create mode 100644 k8s/observability-stack/14-alloy.yaml create mode 100644 k8s/observability-stack/15-kube-state-metrics.yaml create mode 100644 k8s/observability-stack/16-node-exporter.yaml create mode 100644 k8s/observability-stack/20-grafana-ingress.yaml create mode 100644 k8s/observability-stack/21-optional-ingresses.yaml create mode 100644 k8s/observability-stack/DEPLOYMENT-CHECKLIST.md create mode 100644 k8s/observability-stack/DNS-SETUP.md create mode 100644 k8s/observability-stack/MONITORING-GUIDE.md create mode 100644 k8s/observability-stack/QUICKREF.md create mode 100644 k8s/observability-stack/README.md create mode 100755 k8s/observability-stack/cleanup.sh create mode 100644 k8s/observability-stack/demo-app.yaml create mode 100755 k8s/observability-stack/deploy.sh create mode 100755 k8s/observability-stack/remove-old-monitoring.sh create mode 100755 k8s/observability-stack/status.sh create mode 100644 k8s/observability/fluent-bit.yaml create mode 100644 k8s/otlp/otel-collector.yaml create mode 100644 k8s/postgres/.DS_Store create mode 100644 k8s/postgres/pg.yaml create mode 100644 k8s/postgres/postgres-ha.yaml create mode 100644 k8s/postgres/postgres.yaml create mode 100644 k8s/postgres/secret.yaml create mode 100644 k8s/prometheus/prometheus-config.yaml create mode 100644 k8s/prometheus/prometheus.yaml create mode 100644 k8s/redis/redis-pv.yaml create mode 100644 k8s/redis/redis.yaml create mode 100755 k8s/scripts/cleanup.sh create mode 100644 k8s/sso/sso.yaml create mode 100644 k8s/storage/persistent-volumes.yaml create mode 100644 k8s/storage/storageclass.yaml create mode 100644 k8s/tei/tei.yaml create mode 100644 k8s/trading/ib-gateway.yaml create mode 100644 k8s/trading/ib-gateway2.yaml create mode 100644 k8s/vector/qdrant.yaml create mode 100644 k8s/vllm/vllm.yaml diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..59c966f3636a4479b77915742bf1b5f1e542408d GIT binary patch literal 6148 zcmeHK%}*0S6n_I%S`b;FRlt~R=*0wrKoB9uP)dmsMF>R%0qbr%)D7F2W_S4xN&f&n z=-tGtSC4w}=*6oy3TWdv88|yEAVW07!bHFaTfxKopC> zNF!D!h_H)tOzL!xFcLY24KC0rm*;Kwn0MF_G7vKGzcC=rZUb1rzy}8oonLOFMO+JM zEPcPEpau|w3AhjA@DMUE34Kt4f;#bkBHql{4ZKoCTJ{s3!xQ0!5dVM-D6Stu)_Y1z*Nyl_467#&Sm2-rm5_ z;9##gFnD8lzt`Lwy*bir4v&oP@5iEv!~xplY1_;Xx43c)*5O0&i)yg)_o+R1O~ z{gBVq{d_Zw>W*TKO&6LkwivB#?XB_FL?UtNaz|&^mCl^eZd>{Fq9;~5!Cmg9-3_{2 z^qfMrY~>wRTny-Tkac!x&S(2VL_!W%P+3bjacEc$6|Z-ptyXPlLrn`clql z@$(*c-C6ECfy3F-ijO=`1++B9);P|%i5$DNIf>sIHSqb ztg^%~@AcrTJ`D@_67IrFcnxpi1AKz7@Ev}V24awI(nE&GZ8Apgk}2|-KC&8){<&WhM0ykqi^jN<{BKKXdniL`t8Kw|TBq8Ar!I$d}e6*3Sq zaE2KW_k)Q=U`k^}p}ab<5_gfXM+6E%U3@eMG&QC)RurNIg{V+O6-x9k22tVIZ)(4k z#)?7}4n)q31S_46r6N*s2No%4Glhbf(pXW5Jt*OafTSU; KLI(aR1HS<&VFoGy literal 0 HcmV?d00001 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..3a7aaa9 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,177 @@ +# CLAUDE.md - BetelgeuseBytes Full Stack + +## Project Overview + +Kubernetes cluster deployment for BetelgeuseBytes using Ansible for infrastructure automation and kubectl for application deployment. This is a complete data science/ML platform with integrated observability, databases, and ML tools. 
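
Since the cluster is bootstrapped with Ansible and then managed purely with kubectl, a quick spot-check after bootstrap can confirm it matches the layout described below. This is a minimal sketch, assuming a working kubeconfig on the control plane; the `k8s-app=cilium` label is the default for Cilium agent pods and may differ for other install methods.

```bash
# Illustrative spot-check of the two-node topology and the CNI after bootstrap
kubectl get nodes -o wide                                      # expect hetzner-1 and hetzner-2 on v1.30.x
kubectl -n kube-system get pods -l k8s-app=cilium              # Cilium agent pods should be Running
kubectl get pods -A --field-selector=status.phase!=Running     # anything not yet healthy cluster-wide
```
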
+ +**Infrastructure:** +- 2-node Kubernetes cluster on Hetzner Cloud +- Control plane + worker: hetzner-1 (95.217.89.53) +- Worker node: hetzner-2 (138.201.254.97) +- Kubernetes v1.30.3 with Cilium CNI + +## Directory Structure + +``` +. +├── ansible/ # Infrastructure-as-Code for cluster setup +│ ├── inventories/prod/ # Hetzner nodes inventory & group vars +│ │ ├── hosts.ini # Node definitions +│ │ └── group_vars/all.yml # Global K8s config (versions, CIDRs) +│ ├── playbooks/ +│ │ ├── site.yml # Main cluster bootstrap playbook +│ │ └── add-control-planes.yml # HA control plane expansion +│ └── roles/ # 16 reusable Ansible roles +│ ├── common/ # Swap disable, kernel modules, sysctl +│ ├── containerd/ # Container runtime +│ ├── kubernetes/ # kubeadm, kubelet, kubectl +│ ├── kubeadm_init/ # Primary control plane init +│ ├── kubeadm_join/ # Worker node join +│ ├── cilium/ # CNI plugin +│ ├── ingress/ # NGINX Ingress Controller +│ ├── cert_manager/ # Let's Encrypt integration +│ ├── labels/ # Node labeling +│ └── storage_local_path/ # Local storage provisioning +└── k8s/ # Kubernetes manifests + ├── 00-namespaces.yaml # 8 namespaces + ├── 01-secrets/ # Basic auth secrets + ├── storage/ # StorageClass, PersistentVolumes + ├── postgres/ # PostgreSQL 16 with extensions + ├── redis/ # Redis 7 cache + ├── elastic/ # Elasticsearch 8.14 + Kibana + ├── gitea/ # Git repository service + ├── jupyter/ # JupyterLab notebook + ├── kafka/ # Apache Kafka broker + ├── neo4j/ # Neo4j graph database + ├── prometheus/ # Prometheus monitoring + ├── grafana/ # Grafana dashboards + ├── minio/ # S3-compatible object storage + ├── mlflow/ # ML lifecycle tracking + ├── vllm/ # LLM inference (Ollama) + ├── label_studio/ # Data annotation platform + ├── argoflow/ # Argo Workflows + ├── otlp/ # OpenTelemetry collector + └── observability/ # Fluent-Bit log aggregation +``` + +## Build & Deployment Commands + +### Phase 1: Cluster Infrastructure + +```bash +# Validate connectivity +ansible -i ansible/inventories/prod/hosts.ini all -m ping + +# Bootstrap Kubernetes cluster +ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml +``` + +### Phase 2: Kubernetes Applications (order matters) + +```bash +# 1. Namespaces & storage +kubectl apply -f k8s/00-namespaces.yaml +kubectl apply -f k8s/storage/storageclass.yaml + +# 2. Secrets & auth +kubectl apply -f k8s/01-secrets/ + +# 3. Infrastructure (databases, cache, search) +kubectl apply -f k8s/postgres/ +kubectl apply -f k8s/redis/ +kubectl apply -f k8s/elastic/elasticsearch.yaml +kubectl apply -f k8s/elastic/kibana.yaml + +# 4. Application layer +kubectl apply -f k8s/gitea/ +kubectl apply -f k8s/jupyter/ +kubectl apply -f k8s/kafka/kafka.yaml +kubectl apply -f k8s/kafka/kafka-ui.yaml +kubectl apply -f k8s/neo4j/ + +# 5. 
Observability & telemetry
+kubectl apply -f k8s/otlp/
+kubectl apply -f k8s/observability/fluent-bit.yaml
+kubectl apply -f k8s/prometheus/
+kubectl apply -f k8s/grafana/
+```
+
+## Namespace Organization
+
+| Namespace | Purpose | Services |
+|-----------|---------|----------|
+| `db` | Databases & cache | PostgreSQL, Redis |
+| `scm` | Source control | Gitea |
+| `ml` | Machine Learning | JupyterLab, MLflow, Argo, Label Studio, Ollama |
+| `elastic` | Search & logging | Elasticsearch, Kibana |
+| `broker` | Message brokers | Kafka |
+| `graph` | Graph databases | Neo4j |
+| `monitoring` | Observability | Prometheus, Grafana |
+| `observability` | Telemetry | OpenTelemetry, Fluent-Bit |
+| `storage` | Object storage | MinIO |
+
+## Key Configuration
+
+**Kubernetes:**
+- Pod CIDR: 10.244.0.0/16
+- Service CIDR: 10.96.0.0/12
+- CNI: Cilium v1.15.7
+
+**Storage:**
+- StorageClass: `local-ssd-hetzner` (local volumes)
+- All stateful workloads pinned to hetzner-2
+- Local path: `/mnt/local-ssd/{service-name}`
+
+**Networking:**
+- Internal DNS: `service.namespace.svc.cluster.local`
+- External: `{service}.betelgeusebytes.io` via NGINX Ingress
+- TLS: Let's Encrypt via cert-manager
+
+## DNS Records
+
+A records point to both nodes:
+- `apps.betelgeusebytes.io` → 95.217.89.53, 138.201.254.97
+
+CNAMEs to `apps.betelgeusebytes.io`:
+- gitea, kibana, grafana, prometheus, notebook, broker, neo4j, otlp, label, llm, mlflow, minio
+
+## Secrets Location
+
+- `k8s/01-secrets/basic-auth.yaml` - HTTP basic auth for protected services
+- Service-specific secrets inline in respective manifests (e.g., postgres-auth, redis-auth)
+
+## Manifest Conventions
+
+1. Compact YAML style: `metadata: { name: xyz, namespace: ns }`
+2. StatefulSets for persistent services (databases, brokers)
+3. Deployments for stateless services (web UIs, workers)
+4. DaemonSets for node-level agents (Fluent-Bit)
+5. Service port=80 for ingress routing, backend maps to container port
+6. Ingress with TLS + basic auth annotations where needed
+
+## Common Operations
+
+```bash
+# Check cluster status
+kubectl get nodes
+kubectl get pods -A
+
+# View logs for a service
+kubectl logs -n <namespace> -l app=<app>
+
+# Scale a deployment
+kubectl scale -n <namespace> deployment/<name> --replicas=N
+
+# Apply changes to a specific service
+kubectl apply -f k8s/<service>/
+
+# Delete and recreate a service
+kubectl delete -f k8s/<service>/ && kubectl apply -f k8s/<service>/
+```
+
+## Notes
+
+- This is a development/test setup; passwords are hardcoded in manifests
+- Elasticsearch security is disabled for development
+- GPU support for vLLM is commented out (requires nvidia.com/gpu resources)
+- Neo4j Bolt protocol (7687) requires manual ingress-nginx TCP patch
diff --git a/DNS_RECORDS.txt b/DNS_RECORDS.txt
new file mode 100644
index 0000000..4bfbcae
--- /dev/null
+++ b/DNS_RECORDS.txt
@@ -0,0 +1,10 @@
+apps.betelgeusebytes.io. 300 IN A 95.217.89.53
+apps.betelgeusebytes.io. 300 IN A 138.201.254.97
+gitea.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+kibana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+grafana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+prometheus.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+notebook.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+broker.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+neo4j.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+otlp.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
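
A minimal check that these records resolve as intended once they propagate (assumes `dig` is available; answers should match the A records above):

```bash
# Verify the apex A records and one of the CNAMEs (illustrative)
dig +short apps.betelgeusebytes.io A        # expect 95.217.89.53 and 138.201.254.97
dig +short gitea.betelgeusebytes.io CNAME   # expect apps.betelgeusebytes.io.
```
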
diff --git a/README.md b/README.md new file mode 100644 index 0000000..d145404 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# BetelgeuseBytes K8s — Full Stack (kubectl-only) + +**Nodes** +- Control-plane + worker: hetzner-1 (95.217.89.53) +- Worker: hetzner-2 (138.201.254.97) + +## Bring up the cluster +```bash +ansible -i ansible/inventories/prod/hosts.ini all -m ping +ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml +``` + +## Apply apps (edit secrets first) +```bash +kubectl apply -f k8s/00-namespaces.yaml +kubectl apply -f k8s/01-secrets/ +kubectl apply -f k8s/storage/storageclass.yaml + +kubectl apply -f k8s/postgres/ +kubectl apply -f k8s/redis/ +kubectl apply -f k8s/elastic/elasticsearch.yaml +kubectl apply -f k8s/elastic/kibana.yaml + +kubectl apply -f k8s/gitea/ +kubectl apply -f k8s/jupyter/ +kubectl apply -f k8s/kafka/kafka.yaml +kubectl apply -f k8s/kafka/kafka-ui.yaml +kubectl apply -f k8s/neo4j/ + +kubectl apply -f k8s/otlp/ +kubectl apply -f k8s/observability/fluent-bit.yaml +kubectl apply -f k8s/prometheus/ +kubectl apply -f k8s/grafana/ +``` + +## DNS +A records: +- apps.betelgeusebytes.io → 95.217.89.53, 138.201.254.97 + +CNAMEs → apps.betelgeusebytes.io: +- gitea., kibana., grafana., prometheus., notebook., broker., neo4j., otlp. + +(HA later) cp.k8s.betelgeusebytes.io → , 95.217.89.53, 138.201.254.97; then set control_plane_endpoint accordingly. diff --git a/ansible/inventories/prod/group_vars/all.yml b/ansible/inventories/prod/group_vars/all.yml new file mode 100644 index 0000000..eb233d1 --- /dev/null +++ b/ansible/inventories/prod/group_vars/all.yml @@ -0,0 +1,13 @@ +cluster_name: prod +k8s_version: "v1.30.3" +control_plane_endpoint: "95.217.89.53:6443" # switch later to cp.k8s.betelgeusebytes.io:6443 + +pod_cidr: "10.244.0.0/16" +service_cidr: "10.96.0.0/12" +cilium_version: "1.15.7" + +local_path_dir: "/srv/k8s" +local_sc_name: "local-ssd-hetzner" + +stateful_node_label_key: "node" +stateful_node_label_val: "hetzner-2" diff --git a/ansible/inventories/prod/hosts.ini b/ansible/inventories/prod/hosts.ini new file mode 100644 index 0000000..c4813a9 --- /dev/null +++ b/ansible/inventories/prod/hosts.ini @@ -0,0 +1,19 @@ +[k8s_control_plane] +hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 + +[k8s_workers] +hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 +hetzner-2 ansible_host=138.201.254.97 public_ip=138.201.254.97 wg_address=10.66.0.12 + +[k8s_nodes:children] +k8s_control_plane +k8s_workers + +# add tiny VPS control-planes here when ready +[new_control_planes] +# cp-a ansible_host= public_ip= wg_address=10.66.0.10 + +[all:vars] +ansible_user=root +ansible_password=3Lcd0504 +ansible_become=true diff --git a/ansible/playbooks/add-control-planes.yml b/ansible/playbooks/add-control-planes.yml new file mode 100644 index 0000000..e3671c0 --- /dev/null +++ b/ansible/playbooks/add-control-planes.yml @@ -0,0 +1,19 @@ +- hosts: k8s_control_plane[0] + become: yes + roles: + - kubeadm_cp_discovery + +- hosts: new_control_planes + become: yes + roles: + - common + - wireguard + - containerd + - kubernetes + +- hosts: new_control_planes + become: yes + roles: + - kubeadm_join_cp + vars: + kubeadm_cp_join_cmd: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_cp_join_cmd | default(kubeadm_cp_join_cmd) }}" diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml new file mode 100644 index 0000000..c3e447d --- /dev/null +++ b/ansible/playbooks/site.yml @@ -0,0 +1,31 @@ 
+- hosts: k8s_nodes
+  become: yes
+  # serial: 1
+  roles:
+    # - ../roles/common
+    #- ../roles/wireguard
+    #- ../roles/containerd
+    #- ../roles/kubernetes
+
+- hosts: k8s_control_plane
+  become: yes
+  roles:
+    - ../roles/kubeadm_init
+
+# - hosts: k8s_workers
+#   become: yes
+#   roles:
+#     - ../roles/kubeadm_join
+
+- hosts: k8s_control_plane
+  become: yes
+  roles:
+    # - ../roles/cilium
+    # - ../roles/ingress
+    #- ../roles/cert_manager
+
+- hosts: k8s_nodes
+  become: yes
+  roles:
+    #- ../roles/storage_local_path
+    - ../roles/labels
diff --git a/ansible/roles/cert_manager/tasks/main.yml b/ansible/roles/cert_manager/tasks/main.yml
new file mode 100644
index 0000000..607e854
--- /dev/null
+++ b/ansible/roles/cert_manager/tasks/main.yml
@@ -0,0 +1,66 @@
+- name: Install cert-manager
+  shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
+
+- name: Wait for cert-manager pods to be ready
+  shell: kubectl wait --for=condition=ready --timeout=300s pod -l app.kubernetes.io/instance=cert-manager -n cert-manager
+
+- name: Wait for webhook endpoint to be ready
+  shell: |
+    for i in {1..30}; do
+      if kubectl get endpoints cert-manager-webhook -n cert-manager -o jsonpath='{.subsets[*].addresses[*].ip}' | grep -q .; then
+        echo "Webhook endpoint is ready"
+        exit 0
+      fi
+      echo "Waiting for webhook endpoint... attempt $i/30"
+      sleep 2
+    done
+    exit 1
+
+- name: Test webhook connectivity
+  shell: kubectl run test-webhook --image=curlimages/curl:latest --rm -i --restart=Never -- curl -k https://cert-manager-webhook.cert-manager.svc:443/healthz
+  register: webhook_test
+  ignore_errors: yes
+
+- name: Display webhook test result
+  debug:
+    var: webhook_test
+
+- name: ClusterIssuer
+  copy:
+    dest: /root/cluster-issuer-prod.yaml
+    content: |
+      apiVersion: cert-manager.io/v1
+      kind: ClusterIssuer
+      metadata:
+        name: letsencrypt-prod
+      spec:
+        acme:
+          email: admin@betelgeusebytes.io
+          server: https://acme-v02.api.letsencrypt.org/directory
+          privateKeySecretRef:
+            name: letsencrypt-prod-key
+          solvers:
+            - http01:
+                ingress:
+                  class: nginx
+
+- name: Temporarily disable cert-manager webhook
+  shell: |
+    kubectl delete validatingwebhookconfiguration cert-manager-webhook || true
+  ignore_errors: yes
+
+- name: Apply ClusterIssuer
+  command: kubectl apply -f /root/cluster-issuer-prod.yaml
+
+- name: Reinstall cert-manager to restore webhook
+  shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml
new file mode 100644
index 0000000..4acd7d1
--- /dev/null
+++ b/ansible/roles/cilium/tasks/main.yml
@@ -0,0 +1,9 @@
+- name: Install cilium CLI
+  shell: |
+    curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz
+    tar xzf cilium-linux-amd64.tar.gz -C /usr/local/bin
+  args: { creates: /usr/local/bin/cilium }
+
+- name: Deploy cilium
+  shell: |
+    cilium install --version {{ cilium_version }} --set kubeProxyReplacement=strict --set bpf.masquerade=true
diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml
new file mode 100644
index 0000000..0819493
--- /dev/null
+++ b/ansible/roles/common/tasks/main.yml
@@ -0,0 +1,31 @@
+- name: Disable swap +
command: swapoff -a + when: ansible_swaptotal_mb|int > 0 + +- name: Ensure swap disabled on boot + replace: + path: /etc/fstab + regexp: '^([^#].*\sswap\s)' + replace: '# \1' + +- name: Kernel modules + copy: + dest: /etc/modules-load.d/containerd.conf + content: | + overlay + br_netfilter + +- name: Load modules + command: modprobe {{ item }} + loop: [overlay, br_netfilter] + +- name: Sysctl for k8s + copy: + dest: /etc/sysctl.d/99-kubernetes.conf + content: | + net.bridge.bridge-nf-call-iptables = 1 + net.bridge.bridge-nf-call-ip6tables = 1 + net.ipv4.ip_forward = 1 + vm.max_map_count = 262144 +- name: Apply sysctl + command: sysctl --system diff --git a/ansible/roles/containerd/tasks/main.yml b/ansible/roles/containerd/tasks/main.yml new file mode 100644 index 0000000..02752e4 --- /dev/null +++ b/ansible/roles/containerd/tasks/main.yml @@ -0,0 +1,27 @@ +- name: Install containerd + apt: + name: containerd + state: present + update_cache: yes + +- name: Ensure containerd config directory + file: + path: /etc/containerd + state: directory + mode: '0755' + +- name: Generate default config + shell: containerd config default > /etc/containerd/config.toml + args: { creates: /etc/containerd/config.toml } + +- name: Ensure SystemdCgroup=true + replace: + path: /etc/containerd/config.toml + regexp: 'SystemdCgroup = false' + replace: 'SystemdCgroup = true' + +- name: Restart containerd + service: + name: containerd + state: restarted + enabled: yes diff --git a/ansible/roles/ingress/tasks/main.yml b/ansible/roles/ingress/tasks/main.yml new file mode 100644 index 0000000..ec2c44d --- /dev/null +++ b/ansible/roles/ingress/tasks/main.yml @@ -0,0 +1,2 @@ +- name: Deploy ingress-nginx (baremetal) + shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml diff --git a/ansible/roles/kubeadm_cp_discovery/tasks/main.yml b/ansible/roles/kubeadm_cp_discovery/tasks/main.yml new file mode 100644 index 0000000..bdfa3c7 --- /dev/null +++ b/ansible/roles/kubeadm_cp_discovery/tasks/main.yml @@ -0,0 +1,24 @@ +- name: Upload certs and get certificate key + shell: kubeadm init phase upload-certs --upload-certs | tail -n 1 + register: cert_key + +- name: Compute CA cert hash + shell: | + openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}' + register: ca_hash + +- name: Create short-lived token + shell: kubeadm token create --ttl 30m + register: join_token + +- name: Determine control-plane endpoint + set_fact: + cp_endpoint: "{{ hostvars[inventory_hostname].control_plane_endpoint | default(ansible_host ~ ':6443') }}" + +- set_fact: + kubeadm_cp_join_cmd: >- + kubeadm join {{ cp_endpoint }} + --token {{ join_token.stdout }} + --discovery-token-ca-cert-hash sha256:{{ ca_hash.stdout }} + --control-plane + --certificate-key {{ cert_key.stdout }} diff --git a/ansible/roles/kubeadm_init/tasks/main.yml b/ansible/roles/kubeadm_init/tasks/main.yml new file mode 100644 index 0000000..aadccc4 --- /dev/null +++ b/ansible/roles/kubeadm_init/tasks/main.yml @@ -0,0 +1,24 @@ +# - name: Write kubeadm config +# template: +# src: kubeadm-config.yaml.j2 +# dest: /etc/kubernetes/kubeadm-config.yaml + +# - name: Pre-pull images +# command: kubeadm config images pull + +# - name: Init control-plane +# command: kubeadm init --config=/etc/kubernetes/kubeadm-config.yaml +# args: { creates: /etc/kubernetes/admin.conf } + +# - name: Setup kubeconfig +# shell: | +# mkdir -p 
$HOME/.kube +# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +# chown $(id -u):$(id -g) $HOME/.kube/config + +- name: Save join command + shell: kubeadm token create --print-join-command + register: join_cmd + +- set_fact: + kubeadm_join_command_all: "{{ join_cmd.stdout }}" diff --git a/ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 b/ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 new file mode 100644 index 0000000..014fed2 --- /dev/null +++ b/ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 @@ -0,0 +1,14 @@ +apiVersion: kubeadm.k8s.io/v1beta3 +kind: ClusterConfiguration +kubernetesVersion: {{ k8s_version }} +clusterName: {{ cluster_name }} +controlPlaneEndpoint: "{{ control_plane_endpoint }}" +networking: + podSubnet: "{{ pod_cidr }}" + serviceSubnet: "{{ service_cidr }}" +--- +apiVersion: kubeadm.k8s.io/v1beta3 +kind: InitConfiguration +nodeRegistration: + kubeletExtraArgs: + node-ip: "{{ hostvars[inventory_hostname].wg_address | default(hostvars[inventory_hostname].public_ip) }}" diff --git a/ansible/roles/kubeadm_join/tasks/main.yml b/ansible/roles/kubeadm_join/tasks/main.yml new file mode 100644 index 0000000..5a6101c --- /dev/null +++ b/ansible/roles/kubeadm_join/tasks/main.yml @@ -0,0 +1,2 @@ +- name: Join node to cluster + command: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_join_command_all }} --ignore-preflight-errors=FileAvailable--etc-kubernetes-kubelet.conf,FileAvailable--etc-kubernetes-pki-ca.crt,Port-10250" diff --git a/ansible/roles/kubeadm_join_cp/tasks/main.yml b/ansible/roles/kubeadm_join_cp/tasks/main.yml new file mode 100644 index 0000000..b5e98cc --- /dev/null +++ b/ansible/roles/kubeadm_join_cp/tasks/main.yml @@ -0,0 +1,9 @@ +- name: Ensure join command provided + fail: + msg: "Set kubeadm_cp_join_cmd variable (string)" + when: kubeadm_cp_join_cmd is not defined + +- name: Join node as control-plane + command: "{{ kubeadm_cp_join_cmd }}" + args: + creates: /etc/kubernetes/kubelet.conf diff --git a/ansible/roles/kubernetes/tasks/main.yml b/ansible/roles/kubernetes/tasks/main.yml new file mode 100644 index 0000000..b1f80a1 --- /dev/null +++ b/ansible/roles/kubernetes/tasks/main.yml @@ -0,0 +1,17 @@ +- name: Install Kubernetes apt key + shell: curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg + args: { creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg } + +- name: Add Kubernetes repo + apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /" + state: present + +- name: Install kubeadm, kubelet, kubectl + apt: + name: [kubeadm, kubelet, kubectl] + state: present + update_cache: yes + +- name: Hold kube packages + command: apt-mark hold kubeadm kubelet kubectl diff --git a/ansible/roles/labels/tasks/main.yml b/ansible/roles/labels/tasks/main.yml new file mode 100644 index 0000000..a49379d --- /dev/null +++ b/ansible/roles/labels/tasks/main.yml @@ -0,0 +1,4 @@ +- name: Label hetzner-2 for stateful + command: kubectl label node hetzner-2 {{ stateful_node_label_key }}={{ stateful_node_label_val }} --overwrite + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + run_once: true diff --git a/ansible/roles/storage_local_path/tasks/main.yml b/ansible/roles/storage_local_path/tasks/main.yml new file mode 100644 index 0000000..17715e5 --- /dev/null +++ b/ansible/roles/storage_local_path/tasks/main.yml @@ -0,0 +1,55 @@ +- name: Ensure local path dir + file: + path: "{{ 
local_path_dir }}" + state: directory + mode: '0777' + +- name: StorageClass local-ssd-hetzner + copy: + dest: /root/local-sc.yaml + content: | + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: {{ local_sc_name }} + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer + when: inventory_hostname in groups['k8s_control_plane'] + +- name: Apply SC + command: kubectl apply -f /root/local-sc.yaml + environment: + KUBECONFIG: /etc/kubernetes/admin.conf + when: inventory_hostname in groups['k8s_control_plane'] + +- name: Create local-path directory + file: + path: /mnt/local-ssd + state: directory + mode: '0755' + +- name: Create subdirectories for each PV + file: + path: "/mnt/local-ssd/{{ item }}" + state: directory + mode: '0755' + loop: + - postgres + - prometheus + - elasticsearch + - grafana + +- name: Copy PV manifest + template: + src: local-ssd-pv.yaml + dest: /tmp/local-ssd-pv.yaml + +- name: Apply PV + command: kubectl apply -f /tmp/local-ssd-pv.yaml + run_once: true + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + +- name: Apply SC + command: kubectl apply -f /tmp/local-ssd-sc.yaml + run_once: true + delegate_to: "{{ groups['k8s_control_plane'][0] }}" diff --git a/ansible/roles/storage_local_path/templates/local-ssd-pv.yaml b/ansible/roles/storage_local_path/templates/local-ssd-pv.yaml new file mode 100644 index 0000000..3065708 --- /dev/null +++ b/ansible/roles/storage_local_path/templates/local-ssd-pv.yaml @@ -0,0 +1,65 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-postgres +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/postgres + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-prometheus +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-elasticsearch +spec: + capacity: + storage: 300Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/ansible/roles/wireguard/tasks/main.yml b/ansible/roles/wireguard/tasks/main.yml new file mode 100644 index 0000000..2d1a734 --- /dev/null +++ b/ansible/roles/wireguard/tasks/main.yml @@ -0,0 +1,62 @@ +- name: Install wireguard + apt: + name: [wireguard, qrencode] + state: present + update_cache: yes + +- name: Ensure key dir + file: { path: /etc/wireguard/keys, state: directory, mode: '0700' } + +- name: Generate private key if missing + shell: "[ -f /etc/wireguard/keys/privatekey ] || (umask 077 && wg genkey > /etc/wireguard/keys/privatekey)" + args: { creates: /etc/wireguard/keys/privatekey } + +- name: Generate public key + shell: "wg pubkey < /etc/wireguard/keys/privatekey > /etc/wireguard/keys/publickey" + args: { creates: /etc/wireguard/keys/publickey } + +- 
name: Read pubkey + slurp: { src: /etc/wireguard/keys/publickey } + register: pubkey_raw + +- name: Read private key + slurp: { src: /etc/wireguard/keys/privatekey } + register: privkey_raw + +- set_fact: + wg_public_key: "{{ pubkey_raw.content | b64decode | trim }}" + wg_private_key: "{{ privkey_raw.content | b64decode | trim }}" + +- name: Gather facts from all hosts + setup: + delegate_to: "{{ item }}" + delegate_facts: true + loop: "{{ groups['k8s_nodes'] }}" + run_once: true + +- name: Pretty print hostvars + debug: + msg: "{{ hostvars['hetzner-1']['wg_public_key'] }}" + +- name: Render config + template: + src: wg0.conf.j2 + dest: /etc/wireguard/wg0.conf + mode: '0600' + +- name: Enable IP forward + sysctl: + name: net.ipv4.ip_forward + value: "1" + sysctl_set: yes + state: present + reload: yes + +- name: Enable wg-quick + service: + name: wg-quick@wg0 + enabled: yes + state: started + +- debug: + var: wg_show.stdout \ No newline at end of file diff --git a/ansible/roles/wireguard/templates/wg0.conf.j2 b/ansible/roles/wireguard/templates/wg0.conf.j2 new file mode 100644 index 0000000..a47c73f --- /dev/null +++ b/ansible/roles/wireguard/templates/wg0.conf.j2 @@ -0,0 +1,12 @@ +[Interface] +Address = {{ wg_nodes[inventory_hostname].address }}/24 +ListenPort = {{ wg_port }} +PrivateKey = {{ wg_private_key }} + +{% for h in groups['k8s_nodes'] if h != inventory_hostname %} +[Peer] +PublicKey = {{ hostvars[h].wg_public_key }} +AllowedIPs = {{ wg_nodes[h].address }}/32 +Endpoint = {{ wg_nodes[h].public_ip }}:{{ wg_port }} +PersistentKeepalive = 25 +{% endfor %} diff --git a/ansible/roles/wireguard/vars/main.yml b/ansible/roles/wireguard/vars/main.yml new file mode 100644 index 0000000..f908d00 --- /dev/null +++ b/ansible/roles/wireguard/vars/main.yml @@ -0,0 +1,6 @@ +wg_interface: wg0 +wg_port: 51820 +wg_cidr: 10.66.0.0/24 +wg_nodes: + hetzner-1: { address: 10.66.0.11, public_ip: "95.217.89.53" } + hetzner-2: { address: 10.66.0.12, public_ip: "138.201.254.97" } diff --git a/k8s/.DS_Store b/k8s/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..df3557e815cc147582005d2f554da31480355f3b GIT binary patch literal 10244 zcmeHMYitx%6h3GAnuP&|Ql7HuLU{!X-G%ZN+_pT03euK70LwmxcIb4b?#yl}4W!2S ziWm((68}hi{zN4jMT4ThOngOCNhFaNG!g&!i$9nEo_p^s*_pP&4>cHPHgnJ1d(S!d zp6{MB_uN?k!1`=D1`q)NnQlf^g^E!w*3a${m*lq0C5j{ukbqv$z=l403a$Ww0D%C3 z0D%C30D(IZ0ot?KC65<0C<6on1OfzZBEa4c5xNs?Pkq(=r zQjnq20oy0@mQO||%^Bzd3l4%oWxtRAG==d*&lm~Fl3t6xG9-xV^kjNvqzuU%E09rW zHX4oM(A=05v$2b{c;ztcn_C(27{qJyaZ*0+r+nNcq*EtO4H7=it3J*$;>?iVgJLdT zZmG-1J%O~ET!MX7vB?JZWyK+zy1kM}>*&F%3=R=ieYARPRz7Q`?|tKxKo6f2m0rDk zonQ9g$xumhbUHaYSxB>$@XFx{-+W34eP+uDPKjRRo|%^tr!l%m@YOB!h!WDMkr&j$ zA@BIWe64Wa6NoEG^JTt};}+;N3R|-QuO#<`Ldhv(OLBjANLvbeyjrA7K0mxoEy;en z&yBJ($v#83ooMttN=3ybrDbxNJYL@4)NS=QISD6g*C#T2S*_W!jAZ^fk=&ze{hew> zgJn7i-PAH%Ku>GLvaeM)(^h}Il{M2gmu(Uqa!3w!sxt=%m#tV{9jT2*k5)$pW6@q2jFux?TT_;i?KSOAb#f|Uq_RfB(drC?F;m+l zJC(5vW0z&?j&7OlZH{)>$t&`7wRPB5#%bWnI#o4mYr8elrRTJbq>)sWv(a|;F*Wuc zEn^d{ibsXnp;Hs5SI%3sWZ9~F;+wWLJ}^8TdJSV(RmiMyuhKxp~XjT@7t3zr+Zng;^p`u2Kg;YD#+(X_WL~E6m zvYHOH7>T52h^95lI$7NwYRw7DMPr3>uN-PiS-rhvj4j&g!y9GwbNN9w@Q#G-?CjIb z{FLg$TV=V!?oMR1o&DsSd5Ime%omp0naHNJ?pPU)#{F}`;dZ8~E}hoWWb~{VGt0wG z;TBo_j$~Bb%;^*jv=$;Lc>K$EtJb1N@0&9ID7OxRhm{VORjGAr8&--m&0e zI1VS_T{sIL!&h(~F2XPHJ6wf7;2K=VB2=&(XX89vf;AY$7_P;2xE?p)c5KEL+=EH% zK?6;+(7~g442SSxd<>t#XYm9+hcDr)_!_=}Z{jI@A3wk|_!)kIU*gyJ4PL}w@iP8~ zSMVBM#~adQX_hoknlCMoYNV*NOj<5&mbOT{q;@GObxGaA0x2Je`@NP)1605LKy=?b 
z(y7q{v2x4SZTBfV?)X6bVIm8?DHT(vO^?i9TvNMxeceq5750(3u}3panG5nRH`KuP@=kEAyrE$m_+sN>dH!0q8Jv{Xsl9E zizq@vTy{k@Rne`iS*s{&F~zFr+YqKIxRv?_Qo|xo{<~w~Tetu}!DYAtSFsewql%Td z2$y0l`TSQzg{mN&!7wW z_(}#vfWR#fC=&e<_Wgg;*#H0ELQBvcAP^w%Uq%3nH#Tjgt&yiiqZuMmV}$MlbhC@4 zPeX#TM^/dev/null || true +kubectl get pods -n prometheus 2>/dev/null || true +kubectl get pods -n grafana 2>/dev/null || true +kubectl get pods -A | grep -E "(prometheus|grafana|loki|tempo|fluent-bit|vector)" + +# Check for Helm releases +helm list -A | grep -E "(prometheus|grafana|loki|tempo)" +``` + +- [ ] If existing monitoring is found, remove it first: +```bash +./remove-old-monitoring.sh +``` + +**OR** run the deployment script which will prompt you: +```bash +./deploy.sh # Will ask if you want to clean up first +``` + +### Prerequisites +- [ ] Kubernetes cluster is running +- [ ] NGINX Ingress Controller is installed +- [ ] cert-manager is installed with Let's Encrypt ClusterIssuer +- [ ] DNS record `grafana.betelgeusebytes.io` points to cluster IP +- [ ] Node is labeled `kubernetes.io/hostname=hetzner-2` +- [ ] kubectl is configured and working + +### Verify Prerequisites +```bash +# Check cluster +kubectl cluster-info + +# Check NGINX Ingress +kubectl get pods -n ingress-nginx + +# Check cert-manager +kubectl get pods -n cert-manager + +# Check node label +kubectl get nodes --show-labels | grep hetzner-2 + +# Check DNS (from external machine) +dig grafana.betelgeusebytes.io +``` + +## Deployment Steps + +### Step 1: Prepare Storage +- [ ] SSH into hetzner-2 node +- [ ] Create directories: +```bash +sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana} +``` +- [ ] Set correct permissions: +```bash +sudo chown -R 65534:65534 /mnt/local-ssd/prometheus +sudo chown -R 10001:10001 /mnt/local-ssd/loki +sudo chown -R root:root /mnt/local-ssd/tempo +sudo chown -R 472:472 /mnt/local-ssd/grafana +``` +- [ ] Verify permissions: +```bash +ls -la /mnt/local-ssd/ +``` + +### Step 2: Review Configuration +- [ ] Review `03-prometheus-config.yaml` - verify scrape targets +- [ ] Review `04-loki-config.yaml` - verify retention (7 days) +- [ ] Review `05-tempo-config.yaml` - verify retention (7 days) +- [ ] Review `06-alloy-config.yaml` - verify endpoints +- [ ] Review `20-grafana-ingress.yaml` - verify domain name + +### Step 3: Deploy the Stack +- [ ] Navigate to observability-stack directory +```bash +cd /path/to/observability-stack +``` +- [ ] Make scripts executable (already done): +```bash +chmod +x *.sh +``` +- [ ] Run deployment script: +```bash +./deploy.sh +``` +OR deploy manually: +```bash +kubectl apply -f 00-namespace.yaml +kubectl apply -f 01-persistent-volumes.yaml +kubectl apply -f 02-persistent-volume-claims.yaml +kubectl apply -f 03-prometheus-config.yaml +kubectl apply -f 04-loki-config.yaml +kubectl apply -f 05-tempo-config.yaml +kubectl apply -f 06-alloy-config.yaml +kubectl apply -f 07-grafana-datasources.yaml +kubectl apply -f 08-rbac.yaml +kubectl apply -f 10-prometheus.yaml +kubectl apply -f 11-loki.yaml +kubectl apply -f 12-tempo.yaml +kubectl apply -f 13-grafana.yaml +kubectl apply -f 14-alloy.yaml +kubectl apply -f 15-kube-state-metrics.yaml +kubectl apply -f 16-node-exporter.yaml +kubectl apply -f 20-grafana-ingress.yaml +``` + +### Step 4: Verify Deployment +- [ ] Run status check: +```bash +./status.sh +``` +- [ ] Check all PersistentVolumes are Bound: +```bash +kubectl get pv +``` +- [ ] Check all PersistentVolumeClaims are Bound: +```bash +kubectl get pvc -n observability +``` +- [ ] Check all pods are Running: 
+```bash +kubectl get pods -n observability +``` +Expected pods: + - [x] prometheus-0 + - [x] loki-0 + - [x] tempo-0 + - [x] grafana-0 + - [x] alloy-xxxxx (one per node) + - [x] kube-state-metrics-xxxxx + - [x] node-exporter-xxxxx (one per node) + +- [ ] Check services are created: +```bash +kubectl get svc -n observability +``` +- [ ] Check ingress is created: +```bash +kubectl get ingress -n observability +``` +- [ ] Verify TLS certificate is issued: +```bash +kubectl get certificate -n observability +kubectl describe certificate grafana-tls -n observability +``` + +### Step 5: Test Connectivity +- [ ] Test Prometheus endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://prometheus.observability.svc.cluster.local:9090/-/healthy +``` +- [ ] Test Loki endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://loki.observability.svc.cluster.local:3100/ready +``` +- [ ] Test Tempo endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://tempo.observability.svc.cluster.local:3200/ready +``` +- [ ] Test Grafana endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://grafana.observability.svc.cluster.local:3000/api/health +``` + +## Post-Deployment Configuration + +### Step 6: Access Grafana +- [ ] Open browser to: https://grafana.betelgeusebytes.io +- [ ] Login with default credentials: + - Username: `admin` + - Password: `admin` +- [ ] **CRITICAL**: Change admin password immediately +- [ ] Verify datasources are configured: + - Go to Configuration → Data Sources + - Should see: Prometheus (default), Loki, Tempo + - Click "Test" on each datasource + +### Step 7: Verify Data Collection +- [ ] Check Prometheus has targets: + - In Grafana, Explore → Prometheus + - Query: `up` + - Should see multiple targets with value=1 +- [ ] Check Loki is receiving logs: + - In Grafana, Explore → Loki + - Query: `{namespace="observability"}` + - Should see logs from observability stack +- [ ] Check kube-state-metrics: + - In Grafana, Explore → Prometheus + - Query: `kube_pod_status_phase` + - Should see pod status metrics + +### Step 8: Import Dashboards (Optional) +- [ ] Import Kubernetes cluster dashboard: + - Dashboards → Import → ID: 315 +- [ ] Import Node Exporter dashboard: + - Dashboards → Import → ID: 1860 +- [ ] Import Loki dashboard: + - Dashboards → Import → ID: 13639 + +### Step 9: Test with Demo App (Optional) +- [ ] Deploy demo application: +```bash +kubectl apply -f demo-app.yaml +``` +- [ ] Wait for pod to be ready: +```bash +kubectl wait --for=condition=ready pod -l app=demo-app -n observability --timeout=300s +``` +- [ ] Test the endpoints: +```bash +kubectl port-forward -n observability svc/demo-app 8080:8080 +# In another terminal: +curl http://localhost:8080/ +curl http://localhost:8080/items +curl http://localhost:8080/slow +curl http://localhost:8080/error +``` +- [ ] Verify in Grafana: + - Logs: `{app="demo-app"}` + - Metrics: `flask_http_request_total` + - Traces: Search for "demo-app" service in Tempo + +## Monitoring and Maintenance + +### Daily Checks +- [ ] Check pod status: `kubectl get pods -n observability` +- [ ] Check resource usage: `kubectl top pods -n observability` +- [ ] Check disk usage on hetzner-2: `df -h /mnt/local-ssd/` + +### Weekly Checks +- [ ] Review Grafana for any alerts or anomalies +- [ ] Verify TLS certificate is valid +- [ ] Check logs for any errors: +```bash 
+kubectl logs -n observability -l app=prometheus --tail=100 +kubectl logs -n observability -l app=loki --tail=100 +kubectl logs -n observability -l app=tempo --tail=100 +kubectl logs -n observability -l app=grafana --tail=100 +``` + +### Monthly Checks +- [ ] Review retention policies (7 days is appropriate) +- [ ] Check storage growth trends +- [ ] Review and update dashboards +- [ ] Backup Grafana dashboards and configs + +## Troubleshooting Guide + +### Pod Won't Start +1. Check events: `kubectl describe pod -n observability` +2. Check logs: `kubectl logs -n observability` +3. Check storage: `kubectl get pv` and `kubectl get pvc -n observability` +4. Verify node has space: SSH to hetzner-2 and run `df -h` + +### No Logs Appearing +1. Check Alloy pods: `kubectl get pods -n observability -l app=alloy` +2. Check Alloy logs: `kubectl logs -n observability -l app=alloy` +3. Check Loki is running: `kubectl get pods -n observability -l app=loki` +4. Test Loki endpoint from Alloy pod + +### No Metrics Appearing +1. Check Prometheus targets: Port-forward and visit http://localhost:9090/targets +2. Check service discovery: Look for "kubernetes-*" targets +3. Verify RBAC: `kubectl get clusterrolebinding prometheus` +4. Check kube-state-metrics: `kubectl get pods -n observability -l app=kube-state-metrics` + +### Grafana Can't Connect to Datasources +1. Test from Grafana pod: +```bash +kubectl exec -it grafana-0 -n observability -- wget -O- http://prometheus.observability.svc.cluster.local:9090/-/healthy +``` +2. Check datasource configuration in Grafana UI +3. Verify services exist: `kubectl get svc -n observability` + +### High Resource Usage +1. Check actual usage: `kubectl top pods -n observability` +2. Check node capacity: `kubectl top nodes` +3. Consider reducing retention periods +4. Review and adjust resource limits + +## Rollback Procedure + +If something goes wrong: + +1. Remove the deployment: +```bash +./cleanup.sh +``` + +2. Fix the issue in configuration files + +3. Redeploy: +```bash +./deploy.sh +``` + +## Success Criteria + +All checked items below indicate successful deployment: + +- [x] All pods are in Running state +- [x] All PVCs are Bound +- [x] Grafana is accessible at https://grafana.betelgeusebytes.io +- [x] All three datasources (Prometheus, Loki, Tempo) test successfully +- [x] Prometheus shows targets as "up" +- [x] Loki shows logs from observability namespace +- [x] TLS certificate is valid and auto-renewing +- [x] Admin password has been changed +- [x] Resource usage is within acceptable limits + +## Documentation References + +- **README.md**: Comprehensive documentation +- **QUICKREF.md**: Quick reference for common operations +- **demo-app.yaml**: Example instrumented application +- **deploy.sh**: Automated deployment script +- **cleanup.sh**: Removal script +- **status.sh**: Status checking script + +## Next Steps After Deployment + +1. Import useful dashboards from Grafana.com +2. Configure alerts (requires Alertmanager - not included) +3. Instrument your applications to send logs/metrics/traces +4. Create custom dashboards for your specific needs +5. Set up backup procedures for Grafana dashboards +6. 
Document your team's observability practices + +## Notes + +- Default retention: 7 days for all components +- Default resources are optimized for single-node cluster +- Scale up resources if monitoring high-traffic applications +- Always backup before making configuration changes +- Test changes in a non-production environment first + +--- + +**Deployment Date**: _______________ +**Deployed By**: _______________ +**Grafana Version**: 11.4.0 +**Stack Version**: January 2025 diff --git a/k8s/observability-stack/DNS-SETUP.md b/k8s/observability-stack/DNS-SETUP.md new file mode 100644 index 0000000..97fa4fa --- /dev/null +++ b/k8s/observability-stack/DNS-SETUP.md @@ -0,0 +1,146 @@ +# DNS Configuration Guide + +## Required DNS Records + +### Minimum Setup (Recommended) + +Only **one** DNS record is required for basic operation: + +``` +grafana.betelgeusebytes.io A/CNAME +``` + +This gives you access to the complete observability stack through Grafana's unified interface. + +## Optional DNS Records + +If you want direct access to individual components, add these DNS records: + +``` +prometheus.betelgeusebytes.io A/CNAME +loki.betelgeusebytes.io A/CNAME +tempo.betelgeusebytes.io A/CNAME +``` + +Then deploy the optional ingresses: +```bash +kubectl apply -f 21-optional-ingresses.yaml +``` + +## DNS Record Types + +**Option 1: A Record (Direct IP)** +``` +Type: A +Name: grafana.betelgeusebytes.io +Value: 1.2.3.4 (your cluster's public IP) +TTL: 300 +``` + +**Option 2: CNAME (Alias to another domain)** +``` +Type: CNAME +Name: grafana.betelgeusebytes.io +Value: your-server.example.com +TTL: 300 +``` + +## Access URLs Summary + +### After DNS Setup + +| Service | URL | Purpose | DNS Required? | +|---------|-----|---------|---------------| +| **Grafana** | https://grafana.betelgeusebytes.io | Main dashboard (logs/metrics/traces) | ✅ Yes | +| **Prometheus** | https://prometheus.betelgeusebytes.io | Metrics UI (optional) | ⚠️ Optional | +| **Loki** | https://loki.betelgeusebytes.io | Logs API (optional) | ⚠️ Optional | +| **Tempo** | https://tempo.betelgeusebytes.io | Traces API (optional) | ⚠️ Optional | + +### Internal (No DNS Needed) + +These services are accessible from within your cluster only: + +``` +# Metrics +http://prometheus.observability.svc.cluster.local:9090 + +# Logs +http://loki.observability.svc.cluster.local:3100 + +# Traces (OTLP endpoints for your apps) +http://tempo.observability.svc.cluster.local:4317 # gRPC +http://tempo.observability.svc.cluster.local:4318 # HTTP + +# Grafana (internal) +http://grafana.observability.svc.cluster.local:3000 +``` + +## Verification + +After setting up DNS, verify it's working: + +```bash +# Check DNS resolution +dig grafana.betelgeusebytes.io +nslookup grafana.betelgeusebytes.io + +# Should return your cluster IP + +# Test HTTPS access +curl -I https://grafana.betelgeusebytes.io +# Should return 200 OK or 302 redirect +``` + +## TLS Certificate + +Let's Encrypt will automatically issue certificates for: +- grafana.betelgeusebytes.io (required) +- prometheus.betelgeusebytes.io (if optional ingress deployed) +- loki.betelgeusebytes.io (if optional ingress deployed) +- tempo.betelgeusebytes.io (if optional ingress deployed) + +Check certificate status: +```bash +kubectl get certificate -n observability +kubectl describe certificate grafana-tls -n observability +``` + +## Recommendation + +**For most users:** Just configure `grafana.betelgeusebytes.io` + +Why? 
+- Single DNS record to manage +- Grafana provides unified access to all components +- Simpler certificate management +- All functionality available through one interface + +**For advanced users:** Add optional DNS records if you need: +- Direct Prometheus UI access for debugging +- External log/trace ingestion +- API integrations +- Programmatic queries outside Grafana + +## Troubleshooting + +**DNS not resolving:** +- Check DNS propagation: https://dnschecker.org/ +- Wait 5-15 minutes for DNS to propagate +- Verify your DNS provider settings + +**Certificate not issued:** +```bash +# Check cert-manager +kubectl get pods -n cert-manager + +# Check certificate request +kubectl describe certificate grafana-tls -n observability + +# Check challenges +kubectl get challenges -n observability +``` + +**403/404 errors:** +- Verify ingress is created: `kubectl get ingress -n observability` +- Check NGINX ingress controller: `kubectl get pods -n ingress-nginx` +- Check ingress logs: `kubectl logs -n ingress-nginx ` diff --git a/k8s/observability-stack/MONITORING-GUIDE.md b/k8s/observability-stack/MONITORING-GUIDE.md new file mode 100644 index 0000000..15e3b6f --- /dev/null +++ b/k8s/observability-stack/MONITORING-GUIDE.md @@ -0,0 +1,572 @@ +# Access URLs & Monitoring New Applications Guide + +## 🌐 Access URLs + +### Required (Already Configured) + +**Grafana - Main Dashboard** +- **URL**: https://grafana.betelgeusebytes.io +- **DNS Required**: Yes - `grafana.betelgeusebytes.io` → your cluster IP +- **Login**: admin / admin (change on first login!) +- **Purpose**: Unified interface for logs, metrics, and traces +- **Ingress**: Already included in deployment (20-grafana-ingress.yaml) + +### Optional (Direct Component Access) + +You can optionally expose these components directly: + +**Prometheus - Metrics UI** +- **URL**: https://prometheus.betelgeusebytes.io +- **DNS Required**: Yes - `prometheus.betelgeusebytes.io` → your cluster IP +- **Purpose**: Direct access to Prometheus UI, query metrics, check targets +- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml` +- **Use Case**: Debugging metric collection, advanced PromQL queries + +**Loki - Logs API** +- **URL**: https://loki.betelgeusebytes.io +- **DNS Required**: Yes - `loki.betelgeusebytes.io` → your cluster IP +- **Purpose**: Direct API access for log queries +- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml` +- **Use Case**: External log forwarding, API integration + +**Tempo - Traces API** +- **URL**: https://tempo.betelgeusebytes.io +- **DNS Required**: Yes - `tempo.betelgeusebytes.io` → your cluster IP +- **Purpose**: Direct API access for trace queries +- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml` +- **Use Case**: External trace ingestion, API integration + +### Internal Only (No DNS Required) + +These are ClusterIP services accessible only from within the cluster: + +``` +http://prometheus.observability.svc.cluster.local:9090 +http://loki.observability.svc.cluster.local:3100 +http://tempo.observability.svc.cluster.local:3200 +http://tempo.observability.svc.cluster.local:4317 # OTLP gRPC +http://tempo.observability.svc.cluster.local:4318 # OTLP HTTP +``` + +## 🎯 Recommendation + +**For most users**: Just use Grafana (grafana.betelgeusebytes.io) +- Grafana provides unified access to all components +- No need to expose Prometheus, Loki, or Tempo directly +- Simpler DNS configuration (only one subdomain) + +**For power users**: Add optional ingresses +- Direct Prometheus access is useful for debugging +- 
Helps verify targets and scrape configs +- Deploy with: `kubectl apply -f 21-optional-ingresses.yaml` + +## 📊 Monitoring New Applications + +### Automatic: Kubernetes Logs + +**All pod logs are automatically collected!** No configuration needed. + +Alloy runs as a DaemonSet and automatically: +1. Discovers all pods in the cluster +2. Reads logs from `/var/log/pods/` +3. Sends them to Loki with labels: + - `namespace` + - `pod` + - `container` + - `node` + - All pod labels + +**View in Grafana:** +```logql +# All logs from your app +{namespace="your-namespace", pod=~"your-app.*"} + +# Error logs only +{namespace="your-namespace"} |= "error" + +# JSON logs parsed +{namespace="your-namespace"} | json | level="error" +``` + +**Best Practice for Logs:** +Emit structured JSON logs from your application: + +```python +import json +import logging + +# Python example +logging.basicConfig( + format='%(message)s', + level=logging.INFO +) + +logger = logging.getLogger(__name__) + +# Log as JSON +logger.info(json.dumps({ + "level": "info", + "message": "User login successful", + "user_id": "123", + "ip": "1.2.3.4", + "duration_ms": 42 +})) +``` + +### Manual: Application Metrics + +#### Step 1: Expose Metrics Endpoint + +Your application needs to expose metrics at `/metrics` in Prometheus format. + +**Python (Flask) Example:** +```python +from prometheus_flask_exporter import PrometheusMetrics + +app = Flask(__name__) +metrics = PrometheusMetrics(app) + +# Now /metrics endpoint is available +# Automatic metrics: request count, duration, etc. +``` + +**Python (FastAPI) Example:** +```python +from prometheus_fastapi_instrumentator import Instrumentator + +app = FastAPI() +Instrumentator().instrument(app).expose(app) + +# /metrics endpoint is now available +``` + +**Go Example:** +```go +import ( + "github.com/prometheus/client_golang/prometheus/promhttp" + "net/http" +) + +http.Handle("/metrics", promhttp.Handler()) +``` + +**Node.js Example:** +```javascript +const promClient = require('prom-client'); + +// Create default metrics +const register = new promClient.Registry(); +promClient.collectDefaultMetrics({ register }); + +// Expose /metrics endpoint +app.get('/metrics', async (req, res) => { + res.set('Content-Type', register.contentType); + res.end(await register.metrics()); +}); +``` + +#### Step 2: Add Prometheus Annotations to Your Deployment + +Add these annotations to your pod template: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app + namespace: my-namespace +spec: + template: + metadata: + annotations: + prometheus.io/scrape: "true" # Enable scraping + prometheus.io/port: "8080" # Port where metrics are exposed + prometheus.io/path: "/metrics" # Path to metrics (optional, /metrics is default) + spec: + containers: + - name: my-app + image: my-app:latest + ports: + - name: http + containerPort: 8080 +``` + +#### Step 3: Verify Metrics Collection + +**Check in Prometheus:** +1. Access Prometheus UI (if exposed): https://prometheus.betelgeusebytes.io +2. Go to Status → Targets +3. Look for your pod under "kubernetes-pods" +4. Should show as "UP" + +**Or via Grafana:** +1. Go to Explore → Prometheus +2. Query: `up{pod=~"my-app.*"}` +3. 
Should return value=1 + +**Query your metrics:** +```promql +# Request rate +rate(http_requests_total{namespace="my-namespace"}[5m]) + +# Request duration 95th percentile +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# Error rate +rate(http_requests_total{namespace="my-namespace", status=~"5.."}[5m]) +``` + +### Manual: Application Traces + +#### Step 1: Add OpenTelemetry to Your Application + +**Python Example:** +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.sdk.resources import Resource + +# Configure resource +resource = Resource.create({"service.name": "my-app"}) + +# Setup tracer +trace_provider = TracerProvider(resource=resource) +trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) +) +trace.set_tracer_provider(trace_provider) + +# Auto-instrument Flask +app = Flask(__name__) +FlaskInstrumentor().instrument_app(app) + +# Manual spans +tracer = trace.get_tracer(__name__) + +@app.route('/api/data') +def get_data(): + with tracer.start_as_current_span("fetch_data") as span: + # Your code here + span.set_attribute("rows", 100) + return {"data": "..."} +``` + +**Install dependencies:** +```bash +pip install opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc +``` + +**Go Example:** +```go +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/trace" +) + +exporter, _ := otlptracegrpc.New( + context.Background(), + otlptracegrpc.WithEndpoint("tempo.observability.svc.cluster.local:4317"), + otlptracegrpc.WithInsecure(), +) + +tp := trace.NewTracerProvider( + trace.WithBatcher(exporter), +) +otel.SetTracerProvider(tp) +``` + +**Node.js Example:** +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); +const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base'); + +const provider = new NodeTracerProvider(); +const exporter = new OTLPTraceExporter({ + url: 'http://tempo.observability.svc.cluster.local:4317' +}); +provider.addSpanProcessor(new BatchSpanProcessor(exporter)); +provider.register(); +``` + +#### Step 2: Add Trace IDs to Logs (Optional but Recommended) + +This enables clicking from logs to traces in Grafana! + +**Python Example:** +```python +import json +from opentelemetry import trace + +def log_with_trace(message): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + log_entry = { + "message": message, + "trace_id": trace_id, + "level": "info" + } + print(json.dumps(log_entry)) +``` + +#### Step 3: Verify Traces + +**In Grafana:** +1. Go to Explore → Tempo +2. Search for service: "my-app" +3. Click on a trace to view details +4. 
Click "Logs for this span" to see correlated logs + +## 📋 Complete Example: Monitoring a New App + +Here's a complete deployment with all monitoring configured: + +```yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: my-app-config + namespace: my-namespace +data: + app.py: | + from flask import Flask + import logging + import json + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.sdk.resources import Resource + from prometheus_flask_exporter import PrometheusMetrics + + # Setup logging + logging.basicConfig(level=logging.INFO, format='%(message)s') + logger = logging.getLogger(__name__) + + # Setup tracing + resource = Resource.create({"service.name": "my-app"}) + trace_provider = TracerProvider(resource=resource) + trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) + ) + trace.set_tracer_provider(trace_provider) + + app = Flask(__name__) + + # Setup metrics + metrics = PrometheusMetrics(app) + + # Auto-instrument with traces + FlaskInstrumentor().instrument_app(app) + + @app.route('/') + def index(): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info(json.dumps({ + "level": "info", + "message": "Request received", + "trace_id": trace_id, + "endpoint": "/" + })) + + return {"status": "ok", "trace_id": trace_id} + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app + namespace: my-namespace + labels: + app: my-app +spec: + replicas: 2 + selector: + matchLabels: + app: my-app + template: + metadata: + labels: + app: my-app + annotations: + # Enable Prometheus scraping + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + containers: + - name: my-app + image: python:3.11-slim + command: + - /bin/bash + - -c + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc \ + prometheus-flask-exporter && \ + python /app/app.py + ports: + - name: http + containerPort: 8080 + volumeMounts: + - name: app-code + mountPath: /app + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: app-code + configMap: + name: my-app-config + +--- +apiVersion: v1 +kind: Service +metadata: + name: my-app + namespace: my-namespace + labels: + app: my-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app: my-app +``` + +## 🔍 Verification Checklist + +After deploying a new app with monitoring: + +### Logs ✓ (Automatic) +```bash +# Check logs appear in Grafana +# Explore → Loki → {namespace="my-namespace", pod=~"my-app.*"} +``` + +### Metrics ✓ (If configured) +```bash +# Check Prometheus is scraping +# Explore → Prometheus → up{pod=~"my-app.*"} +# Should return 1 + +# Check your custom metrics +# Explore → Prometheus → flask_http_request_total{namespace="my-namespace"} +``` + +### Traces ✓ (If configured) +```bash +# Check traces appear in Tempo +# Explore → 
Tempo → Search for service "my-app" +# Should see traces + +# Verify log-trace correlation +# Click on a log line with trace_id → should jump to trace +``` + +## 🎓 Quick Start for Common Frameworks + +### Python Flask/FastAPI +```bash +pip install opentelemetry-distro opentelemetry-exporter-otlp prometheus-flask-exporter +opentelemetry-bootstrap -a install +``` + +```python +# Set environment variables in your deployment: +OTEL_SERVICE_NAME=my-app +OTEL_EXPORTER_OTLP_ENDPOINT=http://tempo.observability.svc.cluster.local:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc + +# Then run with auto-instrumentation: +opentelemetry-instrument python app.py +``` + +### Go +```bash +go get go.opentelemetry.io/otel +go get go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp +go get go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc +``` + +### Node.js +```bash +npm install @opentelemetry/sdk-node @opentelemetry/auto-instrumentations-node \ + @opentelemetry/exporter-trace-otlp-grpc prom-client +``` + +## 📚 Summary + +| Component | Automatic? | Configuration Needed | +|-----------|-----------|---------------------| +| **Logs** | ✅ Yes | None - just deploy your app | +| **Metrics** | ❌ No | Add /metrics endpoint + annotations | +| **Traces** | ❌ No | Add OpenTelemetry SDK + configure endpoint | + +**Recommended Approach:** +1. **Start simple**: Deploy app, logs work automatically +2. **Add metrics**: Expose /metrics, add annotations +3. **Add traces**: Instrument with OpenTelemetry +4. **Correlate**: Add trace IDs to logs for full observability + +## 🔗 Useful Links + +- OpenTelemetry Python: https://opentelemetry.io/docs/instrumentation/python/ +- OpenTelemetry Go: https://opentelemetry.io/docs/instrumentation/go/ +- OpenTelemetry Node.js: https://opentelemetry.io/docs/instrumentation/js/ +- Prometheus Client Libraries: https://prometheus.io/docs/instrumenting/clientlibs/ +- Grafana Docs: https://grafana.com/docs/ + +## 🆘 Troubleshooting + +**Logs not appearing:** +- Check Alloy is running: `kubectl get pods -n observability -l app=alloy` +- Check pod logs are being written to stdout/stderr +- View in real-time: `kubectl logs -f -n ` + +**Metrics not being scraped:** +- Verify annotations are present: `kubectl get pod -o yaml | grep prometheus` +- Check /metrics endpoint: `kubectl port-forward pod/ 8080:8080` then `curl localhost:8080/metrics` +- Check Prometheus targets: https://prometheus.betelgeusebytes.io/targets + +**Traces not appearing:** +- Verify endpoint: `tempo.observability.svc.cluster.local:4317` +- Check Tempo logs: `kubectl logs -n observability tempo-0` +- Verify OTLP exporter is configured correctly in your app +- Check network policies allow traffic to observability namespace diff --git a/k8s/observability-stack/QUICKREF.md b/k8s/observability-stack/QUICKREF.md new file mode 100644 index 0000000..b30bb17 --- /dev/null +++ b/k8s/observability-stack/QUICKREF.md @@ -0,0 +1,398 @@ +# Observability Stack Quick Reference + +## Before You Start + +### Remove Old Monitoring Stack +If you have existing monitoring components, remove them first: +```bash +./remove-old-monitoring.sh +``` + +This will safely remove: +- Prometheus, Grafana, Loki, Tempo deployments +- Fluent Bit, Vector, or other log collectors +- Helm releases +- ConfigMaps, PVCs, RBAC resources +- Prometheus Operator CRDs + +## Quick Access + +- **Grafana UI**: https://grafana.betelgeusebytes.io +- **Default Login**: admin / admin (change immediately!) 
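+
+To rotate the default admin password from the command line instead of the UI, here is a minimal sketch (assuming the Grafana pod is named `grafana-0`, as in the commands below, and that the image ships the `grafana-cli` binary):
+
+```bash
+# Reset the Grafana admin password from inside the pod
+kubectl exec -it -n observability grafana-0 -- \
+  grafana-cli admin reset-admin-password 'REPLACE_WITH_A_STRONG_PASSWORD'
+```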
+ +## Essential Commands + +### Check Status +```bash +# Quick status check +./status.sh + +# View all pods +kubectl get pods -n observability -o wide + +# Check specific component +kubectl get pods -n observability -l app=prometheus +kubectl get pods -n observability -l app=loki +kubectl get pods -n observability -l app=tempo +kubectl get pods -n observability -l app=grafana + +# Check storage +kubectl get pv +kubectl get pvc -n observability +``` + +### View Logs +```bash +# Grafana +kubectl logs -n observability -l app=grafana -f + +# Prometheus +kubectl logs -n observability -l app=prometheus -f + +# Loki +kubectl logs -n observability -l app=loki -f + +# Tempo +kubectl logs -n observability -l app=tempo -f + +# Alloy (log collector) +kubectl logs -n observability -l app=alloy -f +``` + +### Restart Components +```bash +# Restart Prometheus +kubectl rollout restart statefulset/prometheus -n observability + +# Restart Loki +kubectl rollout restart statefulset/loki -n observability + +# Restart Tempo +kubectl rollout restart statefulset/tempo -n observability + +# Restart Grafana +kubectl rollout restart statefulset/grafana -n observability + +# Restart Alloy +kubectl rollout restart daemonset/alloy -n observability +``` + +### Update Configurations +```bash +# Edit Prometheus config +kubectl edit configmap prometheus-config -n observability +kubectl rollout restart statefulset/prometheus -n observability + +# Edit Loki config +kubectl edit configmap loki-config -n observability +kubectl rollout restart statefulset/loki -n observability + +# Edit Tempo config +kubectl edit configmap tempo-config -n observability +kubectl rollout restart statefulset/tempo -n observability + +# Edit Alloy config +kubectl edit configmap alloy-config -n observability +kubectl rollout restart daemonset/alloy -n observability + +# Edit Grafana datasources +kubectl edit configmap grafana-datasources -n observability +kubectl rollout restart statefulset/grafana -n observability +``` + +## Common LogQL Queries (Loki) + +### Basic Queries +```logql +# All logs from observability namespace +{namespace="observability"} + +# Logs from specific app +{namespace="observability", app="prometheus"} + +# Filter by log level +{namespace="default"} |= "error" +{namespace="default"} | json | level="error" + +# Exclude certain logs +{namespace="default"} != "health check" + +# Multiple filters +{namespace="default"} |= "error" != "ignore" +``` + +### Advanced Queries +```logql +# Rate of errors +rate({namespace="default"} |= "error" [5m]) + +# Count logs by level +sum by (level) (count_over_time({namespace="default"} | json [5m])) + +# Top 10 error messages +topk(10, count by (message) ( + {namespace="default"} | json | level="error" +)) +``` + +## Common PromQL Queries (Prometheus) + +### Cluster Health +```promql +# All targets up/down +up + +# Pods by phase +kube_pod_status_phase{namespace="observability"} + +# Node memory available +node_memory_MemAvailable_bytes + +# Node CPU usage +rate(node_cpu_seconds_total{mode="user"}[5m]) +``` + +### Container Metrics +```promql +# CPU usage by container +rate(container_cpu_usage_seconds_total[5m]) + +# Memory usage by container +container_memory_usage_bytes + +# Network traffic +rate(container_network_transmit_bytes_total[5m]) +rate(container_network_receive_bytes_total[5m]) +``` + +### Application Metrics +```promql +# HTTP request rate +rate(http_requests_total[5m]) + +# Request duration +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# Error rate 
+rate(http_requests_total{status=~"5.."}[5m]) +``` + +## Trace Search (Tempo) + +In Grafana Explore with Tempo datasource: + +- **Search by service**: Select from dropdown +- **Search by duration**: "> 1s", "< 100ms" +- **Search by tag**: `http.status_code=500` +- **TraceQL**: `{span.http.method="POST" && span.http.status_code>=400}` + +## Correlations + +### From Logs to Traces +1. View logs in Loki +2. Click on a log line with a trace ID +3. Click the "Tempo" link +4. Trace opens in Tempo + +### From Traces to Logs +1. View trace in Tempo +2. Click on a span +3. Click "Logs for this span" +4. Related logs appear + +### From Traces to Metrics +1. View trace in Tempo +2. Service graph shows metrics +3. Click service to see metrics + +## Demo Application + +Deploy the demo app to test the stack: + +```bash +kubectl apply -f demo-app.yaml + +# Wait for it to start +kubectl wait --for=condition=ready pod -l app=demo-app -n observability --timeout=300s + +# Test it +kubectl port-forward -n observability svc/demo-app 8080:8080 + +# In another terminal +curl http://localhost:8080/ +curl http://localhost:8080/items +curl http://localhost:8080/item/0 +curl http://localhost:8080/slow +curl http://localhost:8080/error +``` + +Now view in Grafana: +- **Logs**: Search `{app="demo-app"}` in Loki +- **Traces**: Search "demo-app" service in Tempo +- **Metrics**: Query `flask_http_request_total` in Prometheus + +## Storage Management + +### Check Disk Usage +```bash +# On hetzner-2 node +df -h /mnt/local-ssd/ + +# Detailed usage +du -sh /mnt/local-ssd/* +``` + +### Cleanup Old Data +Data is automatically deleted after 7 days. To manually adjust retention: + +**Prometheus** (in 03-prometheus-config.yaml): +```yaml +args: + - '--storage.tsdb.retention.time=7d' +``` + +**Loki** (in 04-loki-config.yaml): +```yaml +limits_config: + retention_period: 168h # 7 days +``` + +**Tempo** (in 05-tempo-config.yaml): +```yaml +compactor: + compaction: + block_retention: 168h # 7 days +``` + +## Troubleshooting + +### No Logs Appearing +```bash +# Check Alloy is running +kubectl get pods -n observability -l app=alloy + +# Check Alloy logs +kubectl logs -n observability -l app=alloy + +# Check Loki +kubectl logs -n observability -l app=loki + +# Test Loki endpoint +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://loki.observability.svc.cluster.local:3100/ready +``` + +### No Traces Appearing +```bash +# Check Tempo is running +kubectl get pods -n observability -l app=tempo + +# Check Tempo logs +kubectl logs -n observability -l app=tempo + +# Test Tempo endpoint +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://tempo.observability.svc.cluster.local:3200/ready + +# Verify your app sends to correct endpoint +# Should be: tempo.observability.svc.cluster.local:4317 (gRPC) +# or: tempo.observability.svc.cluster.local:4318 (HTTP) +``` + +### Grafana Can't Connect to Datasources +```bash +# Check all services are running +kubectl get svc -n observability + +# Test from Grafana pod +kubectl exec -it -n observability grafana-0 -- \ + wget -O- http://prometheus.observability.svc.cluster.local:9090/-/healthy + +kubectl exec -it -n observability grafana-0 -- \ + wget -O- http://loki.observability.svc.cluster.local:3100/ready + +kubectl exec -it -n observability grafana-0 -- \ + wget -O- http://tempo.observability.svc.cluster.local:3200/ready +``` + +### High Resource Usage +```bash +# Check resource usage +kubectl top pods -n observability +kubectl top nodes + 
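+# Optional (an added suggestion, not from the original guide):
+# sort by memory to spot the heaviest component before scaling anything down
+kubectl top pods -n observability --sort-by=memory
+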
+# Scale down if needed (for testing) +kubectl scale statefulset/prometheus -n observability --replicas=0 +kubectl scale statefulset/loki -n observability --replicas=0 +``` + +## Backup and Restore + +### Backup Grafana Dashboards +```bash +# Export all dashboards via API +kubectl port-forward -n observability svc/grafana 3000:3000 + +# In another terminal +curl -H "Authorization: Bearer " \ + http://localhost:3000/api/search?type=dash-db | jq +``` + +### Backup Configurations +```bash +# Backup all ConfigMaps +kubectl get configmap -n observability -o yaml > configmaps-backup.yaml + +# Backup specific config +kubectl get configmap prometheus-config -n observability -o yaml > prometheus-config-backup.yaml +``` + +## Useful Dashboards in Grafana + +After login, import these dashboard IDs: + +- **315**: Kubernetes cluster monitoring +- **7249**: Kubernetes cluster +- **13639**: Loki dashboard +- **12611**: Tempo dashboard +- **3662**: Prometheus 2.0 stats +- **1860**: Node Exporter Full + +Go to: Dashboards → Import → Enter ID → Load + +## Performance Tuning + +### For Higher Load +Increase resources in respective YAML files: + +```yaml +resources: + requests: + cpu: 1000m # from 500m + memory: 4Gi # from 2Gi + limits: + cpu: 4000m # from 2000m + memory: 8Gi # from 4Gi +``` + +### For Lower Resource Usage +- Reduce scrape intervals in Prometheus config +- Reduce log retention periods +- Reduce trace sampling rate + +## Security Checklist + +- [ ] Change Grafana admin password +- [ ] Review RBAC permissions +- [ ] Enable audit logging +- [ ] Consider adding NetworkPolicies +- [ ] Review ingress TLS configuration +- [ ] Backup configurations regularly + +## Getting Help + +1. Check component logs first +2. Review configurations +3. Test network connectivity +4. Check resource availability +5. Review Grafana datasource settings diff --git a/k8s/observability-stack/README.md b/k8s/observability-stack/README.md new file mode 100644 index 0000000..bd56a15 --- /dev/null +++ b/k8s/observability-stack/README.md @@ -0,0 +1,385 @@ +# State-of-the-Art Observability Stack for Kubernetes + +This deployment provides a comprehensive, production-ready observability solution using the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir/Prometheus) with unified collection through Grafana Alloy. + +## Architecture Overview + +### Core Components + +1. **Grafana** (v11.4.0) - Unified visualization platform + - Pre-configured datasources for Prometheus, Loki, and Tempo + - Automatic correlation between logs, metrics, and traces + - Modern UI with TraceQL editor support + +2. **Prometheus** (v2.54.1) - Metrics collection and storage + - 7-day retention + - Comprehensive Kubernetes service discovery + - Scrapes: API server, nodes, cadvisor, pods, services + +3. **Grafana Loki** (v3.2.1) - Log aggregation + - 7-day retention with compaction + - TSDB index for efficient queries + - Automatic correlation with traces + +4. **Grafana Tempo** (v2.6.1) - Distributed tracing + - 7-day retention + - Multiple protocol support: OTLP, Jaeger, Zipkin + - Metrics generation from traces + - Automatic correlation with logs and metrics + +5. **Grafana Alloy** (v1.5.1) - Unified observability agent + - Replaces Promtail, Vector, Fluent Bit + - Collects logs from all pods + - OTLP receiver for traces + - Runs as DaemonSet on all nodes + +6. **kube-state-metrics** (v2.13.0) - Kubernetes object metrics + - Deployment, Pod, Service, Node metrics + - Essential for cluster monitoring + +7. 
**node-exporter** (v1.8.2) - Node-level system metrics + - CPU, memory, disk, network metrics + - Runs on all nodes via DaemonSet + +## Key Features + +- **Unified Observability**: Logs, metrics, and traces in one platform +- **Automatic Correlation**: Click from logs to traces to metrics seamlessly +- **7-Day Retention**: Optimized for single-node cluster +- **Local SSD Storage**: Fast, persistent storage on hetzner-2 node +- **OTLP Support**: Modern OpenTelemetry protocol support +- **TLS Enabled**: Secure access via NGINX Ingress with Let's Encrypt +- **Low Resource Footprint**: Optimized for single-node deployment + +## Storage Layout + +All data stored on local SSD at `/mnt/local-ssd/`: + +``` +/mnt/local-ssd/ +├── prometheus/ (50Gi) - Metrics data +├── loki/ (100Gi) - Log data +├── tempo/ (50Gi) - Trace data +└── grafana/ (10Gi) - Dashboards and settings +``` + +## Deployment Instructions + +### Prerequisites + +1. Kubernetes cluster with NGINX Ingress Controller +2. cert-manager installed with Let's Encrypt issuer +3. DNS record: `grafana.betelgeusebytes.io` → your cluster IP +4. Node labeled: `kubernetes.io/hostname=hetzner-2` + +### Step 0: Remove Existing Monitoring (If Applicable) + +If you have an existing monitoring stack (Prometheus, Grafana, Loki, Fluent Bit, etc.), remove it first to avoid conflicts: + +```bash +./remove-old-monitoring.sh +``` + +This interactive script will help you safely remove: +- Existing Prometheus/Grafana/Loki/Tempo deployments +- Helm releases for monitoring components +- Fluent Bit, Vector, or other log collectors +- Related ConfigMaps, PVCs, and RBAC resources +- Prometheus Operator CRDs (if applicable) + +**Note**: The main deployment script (`deploy.sh`) will also prompt you to run cleanup if needed. + +### Step 1: Prepare Storage Directories + +SSH into the hetzner-2 node and create directories: + +```bash +sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana} +sudo chown -R 65534:65534 /mnt/local-ssd/prometheus +sudo chown -R 10001:10001 /mnt/local-ssd/loki +sudo chown -R root:root /mnt/local-ssd/tempo +sudo chown -R 472:472 /mnt/local-ssd/grafana +``` + +### Step 2: Deploy the Stack + +```bash +chmod +x deploy.sh +./deploy.sh +``` + +Or deploy manually: + +```bash +kubectl apply -f 00-namespace.yaml +kubectl apply -f 01-persistent-volumes.yaml +kubectl apply -f 02-persistent-volume-claims.yaml +kubectl apply -f 03-prometheus-config.yaml +kubectl apply -f 04-loki-config.yaml +kubectl apply -f 05-tempo-config.yaml +kubectl apply -f 06-alloy-config.yaml +kubectl apply -f 07-grafana-datasources.yaml +kubectl apply -f 08-rbac.yaml +kubectl apply -f 10-prometheus.yaml +kubectl apply -f 11-loki.yaml +kubectl apply -f 12-tempo.yaml +kubectl apply -f 13-grafana.yaml +kubectl apply -f 14-alloy.yaml +kubectl apply -f 15-kube-state-metrics.yaml +kubectl apply -f 16-node-exporter.yaml +kubectl apply -f 20-grafana-ingress.yaml +``` + +### Step 3: Verify Deployment + +```bash +kubectl get pods -n observability +kubectl get pv +kubectl get pvc -n observability +``` + +All pods should be in `Running` state: +- grafana-0 +- loki-0 +- prometheus-0 +- tempo-0 +- alloy-xxxxx (one per node) +- kube-state-metrics-xxxxx +- node-exporter-xxxxx (one per node) + +### Step 4: Access Grafana + +1. Open: https://grafana.betelgeusebytes.io +2. Login with default credentials: + - Username: `admin` + - Password: `admin` +3. **IMPORTANT**: Change the password on first login! + +## Using the Stack + +### Exploring Logs (Loki) + +1. In Grafana, go to **Explore** +2. 
Select **Loki** datasource +3. Example queries: + ``` + {namespace="observability"} + {namespace="observability", app="prometheus"} + {namespace="default"} |= "error" + {pod="my-app-xxx"} | json | level="error" + ``` + +### Exploring Metrics (Prometheus) + +1. In Grafana, go to **Explore** +2. Select **Prometheus** datasource +3. Example queries: + ``` + up + node_memory_MemAvailable_bytes + rate(container_cpu_usage_seconds_total[5m]) + kube_pod_status_phase{namespace="observability"} + ``` + +### Exploring Traces (Tempo) + +1. In Grafana, go to **Explore** +2. Select **Tempo** datasource +3. Search by: + - Service name + - Duration + - Tags +4. Click on a trace to see detailed span timeline + +### Correlations + +The stack automatically correlates: +- **Logs → Traces**: Click traceID in logs to view trace +- **Traces → Logs**: Click on trace to see related logs +- **Traces → Metrics**: Tempo generates metrics from traces + +### Instrumenting Your Applications + +#### For Logs +Logs are automatically collected from all pods by Alloy. Emit structured JSON logs: + +```json +{"level":"info","message":"Request processed","duration_ms":42} +``` + +#### For Traces +Send traces to Tempo using OTLP: + +```python +# Python with OpenTelemetry +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +provider = TracerProvider() +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter(endpoint="http://tempo.observability.svc.cluster.local:4317") + ) +) +trace.set_tracer_provider(provider) +``` + +#### For Metrics +Expose metrics in Prometheus format and add annotations to your pod: + +```yaml +metadata: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" +``` + +## Monitoring Endpoints + +Internal service endpoints: + +- **Prometheus**: `http://prometheus.observability.svc.cluster.local:9090` +- **Loki**: `http://loki.observability.svc.cluster.local:3100` +- **Tempo**: + - HTTP: `http://tempo.observability.svc.cluster.local:3200` + - OTLP gRPC: `tempo.observability.svc.cluster.local:4317` + - OTLP HTTP: `tempo.observability.svc.cluster.local:4318` +- **Grafana**: `http://grafana.observability.svc.cluster.local:3000` + +## Troubleshooting + +### Check Pod Status +```bash +kubectl get pods -n observability +kubectl describe pod -n observability +``` + +### View Logs +```bash +kubectl logs -n observability -l app=grafana +kubectl logs -n observability -l app=prometheus +kubectl logs -n observability -l app=loki +kubectl logs -n observability -l app=tempo +kubectl logs -n observability -l app=alloy +``` + +### Check Storage +```bash +kubectl get pv +kubectl get pvc -n observability +``` + +### Test Connectivity +```bash +# From inside cluster +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://prometheus.observability.svc.cluster.local:9090/-/healthy +``` + +### Common Issues + +**Pods stuck in Pending** +- Check if storage directories exist on hetzner-2 +- Verify PV/PVC bindings: `kubectl describe pvc -n observability` + +**Loki won't start** +- Check permissions on `/mnt/local-ssd/loki` (should be 10001:10001) +- View logs: `kubectl logs -n observability loki-0` + +**No logs appearing** +- Check Alloy pods are running: `kubectl get pods -n observability -l app=alloy` +- View Alloy logs: `kubectl logs -n observability -l app=alloy` + +**Grafana can't reach datasources** +- Verify services: 
`kubectl get svc -n observability` +- Check datasource URLs in Grafana UI + +## Updating Configuration + +### Update Prometheus Scrape Config +```bash +kubectl edit configmap prometheus-config -n observability +kubectl rollout restart statefulset/prometheus -n observability +``` + +### Update Loki Retention +```bash +kubectl edit configmap loki-config -n observability +kubectl rollout restart statefulset/loki -n observability +``` + +### Update Alloy Collection Rules +```bash +kubectl edit configmap alloy-config -n observability +kubectl rollout restart daemonset/alloy -n observability +``` + +## Resource Usage + +Expected resource consumption: + +| Component | CPU Request | CPU Limit | Memory Request | Memory Limit | +|-----------|-------------|-----------|----------------|--------------| +| Prometheus | 500m | 2000m | 2Gi | 4Gi | +| Loki | 500m | 2000m | 1Gi | 2Gi | +| Tempo | 500m | 2000m | 1Gi | 2Gi | +| Grafana | 250m | 1000m | 512Mi | 1Gi | +| Alloy (per node) | 100m | 500m | 256Mi | 512Mi | +| kube-state-metrics | 100m | 200m | 128Mi | 256Mi | +| node-exporter (per node) | 100m | 200m | 128Mi | 256Mi | + +**Total (single node)**: ~2.1 CPU cores, ~7.5Gi memory + +## Security Considerations + +1. **Change default Grafana password** immediately after deployment +2. Consider adding authentication for internal services if exposed +3. Review and restrict RBAC permissions as needed +4. Enable audit logging in Loki for sensitive namespaces +5. Consider adding NetworkPolicies to restrict traffic + +## Documentation + +This deployment includes comprehensive guides: + +- **README.md**: Complete deployment and configuration guide (this file) +- **MONITORING-GUIDE.md**: URLs, access, and how to monitor new applications +- **DEPLOYMENT-CHECKLIST.md**: Step-by-step deployment checklist +- **QUICKREF.md**: Quick reference for daily operations +- **demo-app.yaml**: Example fully instrumented application +- **deploy.sh**: Automated deployment script +- **status.sh**: Health check script +- **cleanup.sh**: Complete stack removal +- **remove-old-monitoring.sh**: Remove existing monitoring before deployment +- **21-optional-ingresses.yaml**: Optional external access to Prometheus/Loki/Tempo + +## Future Enhancements + +- Add Alertmanager for alerting +- Configure Grafana SMTP for email notifications +- Add custom dashboards for your applications +- Implement Grafana RBAC for team access +- Consider Mimir for long-term metrics storage +- Add backup/restore procedures + +## Support + +For issues or questions: +1. Check pod logs first +2. Review Grafana datasource configuration +3. Verify network connectivity between components +4. Check storage and resource availability + +## Version Information + +- Grafana: 11.4.0 +- Prometheus: 2.54.1 +- Loki: 3.2.1 +- Tempo: 2.6.1 +- Alloy: 1.5.1 +- kube-state-metrics: 2.13.0 +- node-exporter: 1.8.2 + +Last updated: January 2025 diff --git a/k8s/observability-stack/cleanup.sh b/k8s/observability-stack/cleanup.sh new file mode 100755 index 0000000..8c071f3 --- /dev/null +++ b/k8s/observability-stack/cleanup.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +set -e + +echo "==================================================" +echo "Removing Observability Stack from Kubernetes" +echo "==================================================" +echo "" + +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${RED}WARNING: This will delete all observability data!${NC}" +echo "" +read -p "Are you sure you want to continue? 
(yes/no): " confirm + +if [ "$confirm" != "yes" ]; then + echo "Cleanup cancelled." + exit 0 +fi + +echo -e "${YELLOW}Removing Ingress...${NC}" +kubectl delete -f 20-grafana-ingress.yaml --ignore-not-found + +echo -e "${YELLOW}Removing Deployments and DaemonSets...${NC}" +kubectl delete -f 16-node-exporter.yaml --ignore-not-found +kubectl delete -f 15-kube-state-metrics.yaml --ignore-not-found +kubectl delete -f 14-alloy.yaml --ignore-not-found +kubectl delete -f 13-grafana.yaml --ignore-not-found +kubectl delete -f 12-tempo.yaml --ignore-not-found +kubectl delete -f 11-loki.yaml --ignore-not-found +kubectl delete -f 10-prometheus.yaml --ignore-not-found + +echo -e "${YELLOW}Removing RBAC...${NC}" +kubectl delete -f 08-rbac.yaml --ignore-not-found + +echo -e "${YELLOW}Removing ConfigMaps...${NC}" +kubectl delete -f 07-grafana-datasources.yaml --ignore-not-found +kubectl delete -f 06-alloy-config.yaml --ignore-not-found +kubectl delete -f 05-tempo-config.yaml --ignore-not-found +kubectl delete -f 04-loki-config.yaml --ignore-not-found +kubectl delete -f 03-prometheus-config.yaml --ignore-not-found + +echo -e "${YELLOW}Removing PVCs...${NC}" +kubectl delete -f 02-persistent-volume-claims.yaml --ignore-not-found + +echo -e "${YELLOW}Removing PVs...${NC}" +kubectl delete -f 01-persistent-volumes.yaml --ignore-not-found + +echo -e "${YELLOW}Removing Namespace...${NC}" +kubectl delete -f 00-namespace.yaml --ignore-not-found + +echo "" +echo -e "${RED}==================================================" +echo "Cleanup Complete!" +echo "==================================================${NC}" +echo "" +echo "Data directories on hetzner-2 node are preserved." +echo "To remove them, run on the node:" +echo " sudo rm -rf /mnt/local-ssd/{prometheus,loki,tempo,grafana}" +echo "" diff --git a/k8s/observability-stack/demo-app.yaml b/k8s/observability-stack/demo-app.yaml new file mode 100644 index 0000000..5a4e9f5 --- /dev/null +++ b/k8s/observability-stack/demo-app.yaml @@ -0,0 +1,253 @@ +--- +# Example instrumented application to test the observability stack +# This is a simple Python Flask app with OpenTelemetry instrumentation + +apiVersion: v1 +kind: ConfigMap +metadata: + name: demo-app + namespace: observability +data: + app.py: | + from flask import Flask, jsonify + import logging + import json + import time + import random + + # OpenTelemetry imports + from opentelemetry import trace, metrics + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.sdk.resources import Resource + from prometheus_flask_exporter import PrometheusMetrics + + # Configure structured logging + logging.basicConfig( + level=logging.INFO, + format='%(message)s' + ) + + class JSONFormatter(logging.Formatter): + def format(self, record): + log_obj = { + 'timestamp': self.formatTime(record, self.datefmt), + 'level': record.levelname, + 'message': record.getMessage(), + 'logger': record.name, + } + if hasattr(record, 'trace_id'): + log_obj['trace_id'] = record.trace_id + log_obj['span_id'] = record.span_id + return json.dumps(log_obj) + + handler = logging.StreamHandler() + 
handler.setFormatter(JSONFormatter()) + logger = logging.getLogger(__name__) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "demo-app"}) + + # Tracing + trace_provider = TracerProvider(resource=resource) + trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) + ) + trace.set_tracer_provider(trace_provider) + tracer = trace.get_tracer(__name__) + + # Create Flask app + app = Flask(__name__) + + # Prometheus metrics + metrics = PrometheusMetrics(app) + + # Auto-instrument Flask + FlaskInstrumentor().instrument_app(app) + + # Sample data + ITEMS = ["apple", "banana", "orange", "grape", "mango"] + + @app.route('/') + def index(): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Index page accessed", extra={ + 'trace_id': trace_id, + 'endpoint': '/' + }) + + return jsonify({ + 'service': 'demo-app', + 'status': 'healthy', + 'trace_id': trace_id + }) + + @app.route('/items') + def get_items(): + with tracer.start_as_current_span("fetch_items") as span: + # Simulate database query + time.sleep(random.uniform(0.01, 0.1)) + + span.set_attribute("items.count", len(ITEMS)) + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Items fetched", extra={ + 'trace_id': trace_id, + 'count': len(ITEMS) + }) + + return jsonify({ + 'items': ITEMS, + 'count': len(ITEMS), + 'trace_id': trace_id + }) + + @app.route('/item/') + def get_item(item_id): + with tracer.start_as_current_span("fetch_item") as span: + span.set_attribute("item.id", item_id) + trace_id = format(span.get_span_context().trace_id, '032x') + + # Simulate processing + time.sleep(random.uniform(0.01, 0.05)) + + if item_id < 0 or item_id >= len(ITEMS): + logger.warning("Item not found", extra={ + 'trace_id': trace_id, + 'item_id': item_id + }) + return jsonify({'error': 'Item not found', 'trace_id': trace_id}), 404 + + item = ITEMS[item_id] + logger.info("Item fetched", extra={ + 'trace_id': trace_id, + 'item_id': item_id, + 'item': item + }) + + return jsonify({ + 'id': item_id, + 'name': item, + 'trace_id': trace_id + }) + + @app.route('/slow') + def slow_endpoint(): + with tracer.start_as_current_span("slow_operation") as span: + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Slow operation started", extra={'trace_id': trace_id}) + + # Simulate slow operation + time.sleep(random.uniform(1, 3)) + + logger.info("Slow operation completed", extra={'trace_id': trace_id}) + + return jsonify({ + 'message': 'Operation completed', + 'trace_id': trace_id + }) + + @app.route('/error') + def error_endpoint(): + with tracer.start_as_current_span("error_operation") as span: + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.error("Intentional error triggered", extra={'trace_id': trace_id}) + span.set_attribute("error", True) + + return jsonify({ + 'error': 'This is an intentional error', + 'trace_id': trace_id + }), 500 + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: demo-app + namespace: observability + labels: + app: demo-app +spec: + replicas: 1 + selector: + matchLabels: + app: demo-app + template: + metadata: + labels: + app: demo-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: 
"/metrics" + spec: + containers: + - name: demo-app + image: python:3.11-slim + command: + - /bin/bash + - -c + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc \ + prometheus-flask-exporter && \ + python /app/app.py + ports: + - name: http + containerPort: 8080 + volumeMounts: + - name: app-code + mountPath: /app + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: app-code + configMap: + name: demo-app + +--- +apiVersion: v1 +kind: Service +metadata: + name: demo-app + namespace: observability + labels: + app: demo-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app: demo-app diff --git a/k8s/observability-stack/deploy.sh b/k8s/observability-stack/deploy.sh new file mode 100755 index 0000000..fcd22ef --- /dev/null +++ b/k8s/observability-stack/deploy.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +set -e + +echo "==================================================" +echo "Deploying Observability Stack to Kubernetes" +echo "==================================================" +echo "" + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}Pre-deployment Check: Existing Monitoring Stack${NC}" +echo "" +echo "If you have an existing monitoring/prometheus/grafana deployment," +echo "you should remove it first to avoid conflicts." +echo "" +read -p "Do you want to run the cleanup script now? (yes/no): " run_cleanup +if [ "$run_cleanup" = "yes" ]; then + if [ -f "./remove-old-monitoring.sh" ]; then + echo "Running cleanup script..." + ./remove-old-monitoring.sh + echo "" + echo "Cleanup complete. Continuing with deployment..." + echo "" + else + echo -e "${RED}Error: remove-old-monitoring.sh not found${NC}" + echo "Please run it manually before deploying." + exit 1 + fi +fi + +echo -e "${YELLOW}Step 1: Creating storage directories on node...${NC}" +echo "Please run this on the hetzner-2 node:" +echo " sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana}" +echo " sudo chown -R 65534:65534 /mnt/local-ssd/prometheus" +echo " sudo chown -R 10001:10001 /mnt/local-ssd/loki" +echo " sudo chown -R root:root /mnt/local-ssd/tempo" +echo " sudo chown -R 472:472 /mnt/local-ssd/grafana" +echo "" +read -p "Press Enter once directories are created..." 
+ +echo -e "${GREEN}Step 2: Creating namespace...${NC}" +kubectl apply -f 00-namespace.yaml + +echo -e "${GREEN}Step 3: Creating PersistentVolumes...${NC}" +kubectl apply -f 01-persistent-volumes.yaml + +echo -e "${GREEN}Step 4: Creating PersistentVolumeClaims...${NC}" +kubectl apply -f 02-persistent-volume-claims.yaml + +echo -e "${GREEN}Step 5: Creating ConfigMaps...${NC}" +kubectl apply -f 03-prometheus-config.yaml +kubectl apply -f 04-loki-config.yaml +kubectl apply -f 05-tempo-config.yaml +kubectl apply -f 06-alloy-config.yaml +kubectl apply -f 07-grafana-datasources.yaml + +echo -e "${GREEN}Step 6: Creating RBAC resources...${NC}" +kubectl apply -f 08-rbac.yaml + +echo -e "${GREEN}Step 7: Deploying Prometheus...${NC}" +kubectl apply -f 10-prometheus.yaml + +echo -e "${GREEN}Step 8: Deploying Loki...${NC}" +kubectl apply -f 11-loki.yaml + +echo -e "${GREEN}Step 9: Deploying Tempo...${NC}" +kubectl apply -f 12-tempo.yaml + +echo -e "${GREEN}Step 10: Deploying Grafana...${NC}" +kubectl apply -f 13-grafana.yaml + +echo -e "${GREEN}Step 11: Deploying Grafana Alloy...${NC}" +kubectl apply -f 14-alloy.yaml + +echo -e "${GREEN}Step 12: Deploying kube-state-metrics...${NC}" +kubectl apply -f 15-kube-state-metrics.yaml + +echo -e "${GREEN}Step 13: Deploying node-exporter...${NC}" +kubectl apply -f 16-node-exporter.yaml + +echo -e "${GREEN}Step 14: Creating Grafana Ingress...${NC}" +kubectl apply -f 20-grafana-ingress.yaml + +echo "" +echo -e "${GREEN}==================================================" +echo "Deployment Complete!" +echo "==================================================${NC}" +echo "" +echo "Waiting for pods to be ready..." +kubectl wait --for=condition=ready pod -l app=prometheus -n observability --timeout=300s +kubectl wait --for=condition=ready pod -l app=loki -n observability --timeout=300s +kubectl wait --for=condition=ready pod -l app=tempo -n observability --timeout=300s +kubectl wait --for=condition=ready pod -l app=grafana -n observability --timeout=300s + +echo "" +echo -e "${GREEN}All pods are ready!${NC}" +echo "" +echo "Access Grafana at: https://grafana.betelgeusebytes.io" +echo "Default credentials: admin / admin" +echo "" +echo "To check status:" +echo " kubectl get pods -n observability" +echo "" +echo "To view logs:" +echo " kubectl logs -n observability -l app=grafana" +echo " kubectl logs -n observability -l app=prometheus" +echo " kubectl logs -n observability -l app=loki" +echo " kubectl logs -n observability -l app=tempo" +echo "" diff --git a/k8s/observability-stack/remove-old-monitoring.sh b/k8s/observability-stack/remove-old-monitoring.sh new file mode 100755 index 0000000..f617225 --- /dev/null +++ b/k8s/observability-stack/remove-old-monitoring.sh @@ -0,0 +1,319 @@ +#!/bin/bash + +set -e + +echo "==========================================================" +echo "Removing Existing Monitoring Stack" +echo "==========================================================" +echo "" + +RED='\033[0;31m' +YELLOW='\033[1;33m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}" +echo " - Prometheus (standalone or operator)" +echo " - Grafana" +echo " - Fluent Bit" +echo " - Vector" +echo " - Loki" +echo " - Tempo" +echo " - Node exporters" +echo " - kube-state-metrics" +echo " - Any monitoring/prometheus/grafana namespaces" +echo "" +echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}" +echo "" +read -p "Are you sure you want to continue? 
(yes/no): " confirm + +if [ "$confirm" != "yes" ]; then + echo "Cleanup cancelled." + exit 0 +fi + +echo "" +echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}" + +# Common namespace names for monitoring +NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging") + +for ns in "${NAMESPACES[@]}"; do + if kubectl get namespace "$ns" &> /dev/null; then + echo -e "${GREEN}Found namespace: $ns${NC}" + + # Show what's in the namespace + echo " Resources in $ns:" + kubectl get all -n "$ns" 2>/dev/null | head -20 || true + echo "" + + read -p " Delete namespace '$ns'? (yes/no): " delete_ns + if [ "$delete_ns" = "yes" ]; then + echo " Deleting namespace $ns..." + kubectl delete namespace "$ns" --timeout=120s || { + echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}" + kubectl delete namespace "$ns" --grace-period=0 --force & + } + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}" + +# Check if helm is available +if command -v helm &> /dev/null; then + echo "Checking for Helm releases..." + + # Common Helm release names + RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack") + + for release in "${RELEASES[@]}"; do + # Check all namespaces for the release + if helm list -A | grep -q "$release"; then + ns=$(helm list -A | grep "$release" | awk '{print $2}') + echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}" + read -p " Uninstall Helm release '$release'? (yes/no): " uninstall + if [ "$uninstall" = "yes" ]; then + echo " Uninstalling $release..." + helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}" + fi + fi + done +else + echo "Helm not found, skipping Helm releases check" +fi + +echo "" +echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}" + +# Remove common DaemonSets in kube-system or default +echo "Checking for monitoring DaemonSets..." +for ns in kube-system default; do + if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then + echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}" + kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector" + read -p " Delete these DaemonSets? (yes/no): " delete_ds + if [ "$delete_ds" = "yes" ]; then + kubectl delete daemonset -n "$ns" -l app=node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=vector --ignore-not-found + kubectl delete daemonset -n "$ns" node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" vector --ignore-not-found + fi + fi +done + +# Remove common Deployments +echo "" +echo "Checking for monitoring Deployments..." +for ns in kube-system default; do + if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring Deployments in $ns${NC}" + kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo" + read -p " Delete these Deployments? 
(yes/no): " delete_deploy + if [ "$delete_deploy" = "yes" ]; then + kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found + kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found + kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found + kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found + kubectl delete deployment -n "$ns" prometheus --ignore-not-found + kubectl delete deployment -n "$ns" grafana --ignore-not-found + kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found + fi + fi +done + +# Remove common StatefulSets +echo "" +echo "Checking for monitoring StatefulSets..." +for ns in kube-system default; do + if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}" + kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + read -p " Delete these StatefulSets? (yes/no): " delete_sts + if [ "$delete_sts" = "yes" ]; then + kubectl delete statefulset -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=grafana --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=loki --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=tempo --ignore-not-found + kubectl delete statefulset -n "$ns" prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" grafana --ignore-not-found + kubectl delete statefulset -n "$ns" loki --ignore-not-found + kubectl delete statefulset -n "$ns" tempo --ignore-not-found + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}" + +# Ask before removing ConfigMaps (they might contain important configs) +echo "Checking for monitoring ConfigMaps..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then + echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}" + kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent" + read -p " Delete these ConfigMaps? (yes/no): " delete_cm + if [ "$delete_cm" = "yes" ]; then + kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found + kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found + kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}" + +# Remove monitoring-related RBAC +echo "Checking for monitoring ClusterRoles..." +if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoles${NC}" + kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoles? 
(yes/no): " delete_cr + if [ "$delete_cr" = "yes" ]; then + kubectl delete clusterrole prometheus --ignore-not-found + kubectl delete clusterrole grafana --ignore-not-found + kubectl delete clusterrole kube-state-metrics --ignore-not-found + kubectl delete clusterrole fluent-bit --ignore-not-found + kubectl delete clusterrole node-exporter --ignore-not-found + fi +fi + +echo "Checking for monitoring ClusterRoleBindings..." +if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}" + kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb + if [ "$delete_crb" = "yes" ]; then + kubectl delete clusterrolebinding prometheus --ignore-not-found + kubectl delete clusterrolebinding grafana --ignore-not-found + kubectl delete clusterrolebinding kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding fluent-bit --ignore-not-found + kubectl delete clusterrolebinding node-exporter --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}" + +# Check for monitoring PVCs +echo "Checking for monitoring PersistentVolumeClaims..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring PVCs in $ns${NC}" + kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}" + read -p " Delete these PVCs? (yes/no): " delete_pvc + if [ "$delete_pvc" = "yes" ]; then + kubectl delete pvc -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete pvc -n "$ns" -l app=grafana --ignore-not-found + kubectl delete pvc -n "$ns" -l app=loki --ignore-not-found + kubectl delete pvc -n "$ns" -l app=tempo --ignore-not-found + # Also try by name patterns + kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true + fi + fi + fi +done + +# Check for monitoring PVs +echo "" +echo "Checking for monitoring PersistentVolumes..." +if kubectl get pv 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|monitoring"; then + echo -e "${GREEN}Found monitoring PVs${NC}" + kubectl get pv | grep -E "prometheus|grafana|loki|tempo|monitoring" + echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}" + read -p " Delete these PVs? (yes/no): " delete_pv + if [ "$delete_pv" = "yes" ]; then + kubectl get pv -o name | grep -E "prometheus|grafana|loki|tempo|monitoring" | xargs -r kubectl delete || true + fi +fi + +echo "" +echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}" + +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then + echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}" + kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki" + read -p " Delete these Ingresses? 
(yes/no): " delete_ing + if [ "$delete_ing" = "yes" ]; then + kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found + kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found + kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}" + +# Check for Prometheus Operator CRDs +if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then + echo -e "${GREEN}Found Prometheus Operator CRDs${NC}" + kubectl get crd | grep "monitoring.coreos.com" + echo "" + echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}" + read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd + if [ "$delete_crd" = "yes" ]; then + kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found + kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found + kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found + kubectl delete crd probes.monitoring.coreos.com --ignore-not-found + kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}" +echo "" +echo "You may have monitoring data stored on your nodes at:" +echo " - /mnt/local-ssd/prometheus" +echo " - /mnt/local-ssd/grafana" +echo " - /mnt/local-ssd/loki" +echo " - /mnt/local-ssd/tempo" +echo " - /var/lib/prometheus" +echo " - /var/lib/grafana" +echo "" +echo "To remove these, SSH to each node and run:" +echo " sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}" +echo " sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}" +echo "" +read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs + +echo "" +echo -e "${GREEN}==========================================================" +echo "Existing Monitoring Stack Cleanup Complete!" +echo "==========================================================${NC}" +echo "" +echo "Summary of actions taken:" +echo " - Removed monitoring namespaces (if confirmed)" +echo " - Uninstalled Helm releases (if found and confirmed)" +echo " - Removed standalone monitoring components" +echo " - Removed monitoring ConfigMaps" +echo " - Removed RBAC resources" +echo " - Removed PVCs and PVs (if confirmed)" +echo " - Removed Ingresses" +echo " - Removed Prometheus Operator CRDs (if confirmed)" +echo "" +echo -e "${YELLOW}Next Steps:${NC}" +echo "1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'" +echo "2. Clean up node data directories (see above)" +echo "3. 
Deploy new observability stack: ./deploy.sh" +echo "" diff --git a/k8s/observability-stack/status.sh b/k8s/observability-stack/status.sh new file mode 100755 index 0000000..d1895ee --- /dev/null +++ b/k8s/observability-stack/status.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}==================================================" +echo "Observability Stack Status Check" +echo "==================================================${NC}" +echo "" + +# Check namespace +echo -e "${YELLOW}Checking namespace...${NC}" +if kubectl get namespace observability &> /dev/null; then + echo -e "${GREEN}✓ Namespace 'observability' exists${NC}" +else + echo -e "${RED}✗ Namespace 'observability' not found${NC}" + exit 1 +fi +echo "" + +# Check PVs +echo -e "${YELLOW}Checking PersistentVolumes...${NC}" +pvs=$(kubectl get pv 2>/dev/null | grep -E "(prometheus|loki|tempo|grafana)-data-pv" | wc -l) +if [ "$pvs" -eq 4 ]; then + echo -e "${GREEN}✓ All 4 PersistentVolumes found${NC}" + kubectl get pv | grep -E "(prometheus|loki|tempo|grafana)-data-pv" +else + echo -e "${RED}✗ Expected 4 PVs, found $pvs${NC}" +fi +echo "" + +# Check PVCs +echo -e "${YELLOW}Checking PersistentVolumeClaims...${NC}" +pvcs=$(kubectl get pvc -n observability 2>/dev/null | grep -v NAME | wc -l) +if [ "$pvcs" -eq 4 ]; then + echo -e "${GREEN}✓ All 4 PersistentVolumeClaims found${NC}" + kubectl get pvc -n observability +else + echo -e "${RED}✗ Expected 4 PVCs, found $pvcs${NC}" +fi +echo "" + +# Check Pods +echo -e "${YELLOW}Checking Pods...${NC}" +kubectl get pods -n observability -o wide +echo "" + +# Count running pods +total_pods=$(kubectl get pods -n observability --no-headers 2>/dev/null | wc -l) +running_pods=$(kubectl get pods -n observability --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + +if [ "$total_pods" -eq 0 ]; then + echo -e "${RED}✗ No pods found in observability namespace${NC}" +else + if [ "$running_pods" -eq "$total_pods" ]; then + echo -e "${GREEN}✓ All $total_pods pods are running${NC}" + else + echo -e "${YELLOW}⚠ $running_pods/$total_pods pods are running${NC}" + fi +fi +echo "" + +# Check Services +echo -e "${YELLOW}Checking Services...${NC}" +kubectl get svc -n observability +echo "" + +# Check Ingress +echo -e "${YELLOW}Checking Ingress...${NC}" +if kubectl get ingress -n observability grafana-ingress &> /dev/null; then + echo -e "${GREEN}✓ Grafana Ingress found${NC}" + kubectl get ingress -n observability grafana-ingress +else + echo -e "${RED}✗ Grafana Ingress not found${NC}" +fi +echo "" + +# Check ConfigMaps +echo -e "${YELLOW}Checking ConfigMaps...${NC}" +configmaps=$(kubectl get configmap -n observability 2>/dev/null | grep -v NAME | wc -l) +echo "Found $configmaps ConfigMaps:" +kubectl get configmap -n observability --no-headers | awk '{print " - " $1}' +echo "" + +# Test endpoints +echo -e "${YELLOW}Testing service endpoints...${NC}" + +check_endpoint() { + local name=$1 + local url=$2 + + if kubectl run -it --rm test-$RANDOM --image=curlimages/curl --restart=Never -- \ + curl -s -o /dev/null -w "%{http_code}" --max-time 5 $url 2>/dev/null | grep -q "200\|302\|401"; then + echo -e "${GREEN}✓ $name is responding${NC}" + else + echo -e "${RED}✗ $name is not responding${NC}" + fi +} + +check_endpoint "Prometheus" "http://prometheus.observability.svc.cluster.local:9090/-/healthy" +check_endpoint "Loki" "http://loki.observability.svc.cluster.local:3100/ready" +check_endpoint 
"Tempo" "http://tempo.observability.svc.cluster.local:3200/ready" +check_endpoint "Grafana" "http://grafana.observability.svc.cluster.local:3000/api/health" + +echo "" +echo -e "${BLUE}==================================================" +echo "Status Check Complete" +echo "==================================================${NC}" +echo "" +echo "Access Grafana at: https://grafana.betelgeusebytes.io" +echo "Default credentials: admin / admin" +echo "" diff --git a/k8s/observability/fluent-bit.yaml b/k8s/observability/fluent-bit.yaml new file mode 100644 index 0000000..e6b726f --- /dev/null +++ b/k8s/observability/fluent-bit.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: { name: fluent-bit, namespace: observability } +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: { name: fluent-bit-read } +rules: + - apiGroups: [""] + resources: ["pods", "namespaces"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: { name: fluent-bit-read } +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluent-bit-read +subjects: + - kind: ServiceAccount + name: fluent-bit + namespace: observability +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: fluent-bit, namespace: observability } +spec: + selector: { matchLabels: { app: fluent-bit } } + template: + metadata: { labels: { app: fluent-bit } } + spec: + serviceAccountName: fluent-bit + containers: + - name: fluent-bit + image: cr.fluentbit.io/fluent/fluent-bit:2.2.2 + volumeMounts: + - { name: varlog, mountPath: /var/log } + - { name: containers, mountPath: /var/lib/docker/containers, readOnly: true } + env: + - { name: FLUENT_ELASTICSEARCH_HOST, value: elasticsearch.elastic.svc.cluster.local } + - { name: FLUENT_ELASTICSEARCH_PORT, value: "9200" } + args: ["-i","tail","-p","path=/var/log/containers/*.log","-F","kubernetes","-o","es","-p","host=${FLUENT_ELASTICSEARCH_HOST}","-p","port=${FLUENT_ELASTICSEARCH_PORT}","-p","logstash_format=On","-p","logstash_prefix=k8s-logs"] + volumes: + - { name: varlog, hostPath: { path: /var/log } } + - { name: containers, hostPath: { path: /var/lib/docker/containers, type: DirectoryOrCreate } } diff --git a/k8s/otlp/otel-collector.yaml b/k8s/otlp/otel-collector.yaml new file mode 100644 index 0000000..924761d --- /dev/null +++ b/k8s/otlp/otel-collector.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: Service +metadata: { name: otel-collector, namespace: observability } +spec: + selector: { app: otel-collector } + ports: + - { name: otlp-http, port: 4318, targetPort: 4318 } + - { name: otlp-grpc, port: 4317, targetPort: 4317 } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: otel-collector, namespace: observability } +spec: + replicas: 2 + selector: { matchLabels: { app: otel-collector } } + template: + metadata: { labels: { app: otel-collector } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.102.0 + args: ["--config=/etc/otel/config.yaml"] + ports: + - { containerPort: 4318 } + - { containerPort: 4317 } + volumeMounts: + - { name: cfg, mountPath: /etc/otel } + volumes: + - { name: cfg, configMap: { name: otel-config } } +--- +apiVersion: v1 +kind: ConfigMap +metadata: { name: otel-config, namespace: observability } +data: + config.yaml: | + receivers: + otlp: + protocols: { http: {}, grpc: {} } + processors: { batch: {} } + exporters: + logging: {} + elasticsearch: + endpoints: 
["http://elasticsearch.elastic.svc.cluster.local:9200"] + logs_index: "k8s-logs" + service: + pipelines: + logs: { receivers: [otlp], processors: [batch], exporters: [elasticsearch, logging] } + traces: { receivers: [otlp], processors: [batch], exporters: [logging] } + metrics: { receivers: [otlp], processors: [batch], exporters: [logging] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: otlp + namespace: observability + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["otlp.betelgeusebytes.io"], secretName: otlp-tls }] + rules: + - host: otlp.betelgeusebytes.io + http: + paths: + - path: /v1/traces + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + - path: /v1/metrics + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + - path: /v1/logs + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } diff --git a/k8s/postgres/.DS_Store b/k8s/postgres/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..2ce156ae762d4ba533e04cb295ed221ca24cff7f GIT binary patch literal 6148 zcmeHK%}N774E~}$6pFA%FT%Wl1>ayT_2AhTu&qT8TUKm)@V0N{BkGqQ#kSjv2N6kN z@}-$%GV?)aHUMO@eYgTD07DK%QKLuH-8*#>!KXyAH6HPTCDypaQ_n(w(WQH@@Qjwb z-r)QGXBguZGpx|uHtk~Gv~!DvToA|aCU&H=gZv|zxZcYgvFZ{FV{LP+9^Uq{tZ1ra zAQ?yol7VF4hYaw|RvC^QQzrw-Kr-;ffZh*4*)^p}XM13O1gheY!sQTelZ5v$Jii-kjK$JEI{GO){l_P&%#-~Su_ zWqO-@3rUp>Bm@7I0U1p1rW2kn-qwS2>RVel?m1L6u2X|Td*>s7KlC0sw@IHbYBR15 W>>PC!^; # optionally pin this diff --git a/k8s/postgres/postgres.yaml b/k8s/postgres/postgres.yaml new file mode 100644 index 0000000..3db44c7 --- /dev/null +++ b/k8s/postgres/postgres.yaml @@ -0,0 +1,122 @@ +apiVersion: v1 +kind: Service +metadata: { name: postgres, namespace: db } +spec: + ports: [{ port: 5432, targetPort: 5432 }] + selector: { app: postgres } +--- +apiVersion: v1 +kind: ConfigMap +metadata: { name: pg-init-sql, namespace: db } +data: + 00_extensions.sql: | + -- enable common extensions in the default DB and template1 so future DBs inherit them + \connect gitea + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS vector; + CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false); + CREATE EXTENSION IF NOT EXISTS tablefunc; + CREATE EXTENSION IF NOT EXISTS pg_stat_statements; + + CREATE EXTENSION IF NOT EXISTS postgis_topology; + CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + -- PL/Python (optional; requires image with plpython3u, postgis image has it) + DO $$ BEGIN + CREATE EXTENSION IF NOT EXISTS plpython3u; + EXCEPTION WHEN undefined_file THEN + RAISE NOTICE 'plpython3u not available in this image'; + END $$; + + -- Also on template1 for new DBs: + \connect template1 + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + + -- Arabic-friendly ICU collation (PostgreSQL >= 13) + -- Non-deterministic collation helps proper case/diacritics comparisons + DO $$ + BEGIN + PERFORM 1 
FROM pg_collation WHERE collname='arabic'; + IF NOT FOUND THEN + CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false); + END IF; + END$$; + + -- Example: ensure gitea DB uses UTF8; Arabic text search often needs unaccent + custom dictionaries. + -- You can create additional DBs with: CREATE DATABASE mydb TEMPLATE template1 ENCODING 'UTF8'; + + 01_tune.sql: | + -- small safe defaults; adjust later + ALTER SYSTEM SET shared_buffers = '1GB'; + ALTER SYSTEM SET work_mem = '32MB'; + ALTER SYSTEM SET maintenance_work_mem = '512MB'; + ALTER SYSTEM SET max_connections = 200; + SELECT pg_reload_conf(); +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: postgres, namespace: db } +spec: + serviceName: postgres + replicas: 1 + selector: { matchLabels: { app: postgres } } + template: + metadata: { labels: { app: postgres } } + spec: + nodeSelector: + node: hetzner-2 + securityContext: + fsGroup: 999 # Debian postgres user/group in postgis image + fsGroupChangePolicy: OnRootMismatch + initContainers: + - name: fix-perms + image: busybox:1.36 + command: ["sh","-c","chown -R 999:999 /var/lib/postgresql/data || true"] + securityContext: { runAsUser: 0 } + volumeMounts: [{ name: data, mountPath: /var/lib/postgresql/data }] + containers: + - name: postgres + image: postgis/postgis:16-3.4 # PostGIS-enabled image; plain postgres:* tags do not ship the postgis extension used by the init SQL + env: + - name: POSTGRES_PASSWORD + valueFrom: { secretKeyRef: { name: postgres-auth, key: POSTGRES_PASSWORD } } + - { name: POSTGRES_USER, value: gitea } + - { name: POSTGRES_DB, value: gitea } + - name: POSTGRES_INITDB_ARGS + value: "--encoding=UTF8 --locale=C.UTF-8" + ports: [{ containerPort: 5432 }] + volumeMounts: + - { name: data, mountPath: /var/lib/postgresql/data } + - { name: init, mountPath: /docker-entrypoint-initdb.d } + # Pod-level volume backing the init mount above (kept in the same StatefulSet; a second partial StatefulSet with the same name would not merge on apply) + volumes: + - name: init + configMap: + name: pg-init-sql + defaultMode: 0444 + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 80Gi } } diff --git a/k8s/postgres/secret.yaml b/k8s/postgres/secret.yaml new file mode 100644 index 0000000..cefb8de --- /dev/null +++ b/k8s/postgres/secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: { name: postgres-auth, namespace: db } +type: Opaque +stringData: + POSTGRES_PASSWORD: "PG-ADM1N" + GITEA_DB_PASSWORD: "G1TEA" diff --git a/k8s/prometheus/prometheus-config.yaml b/k8s/prometheus/prometheus-config.yaml new file mode 100644 index 0000000..9b7eabd --- /dev/null +++ b/k8s/prometheus/prometheus-config.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: { name: prometheus-config, namespace: monitoring } +data: + prometheus.yml: | + global: { scrape_interval: 15s } + scrape_configs: + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: [ { role: pod } ] + relabel_configs: + - action: keep + source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + regex: 'true' diff --git a/k8s/prometheus/prometheus.yaml b/k8s/prometheus/prometheus.yaml new file mode 100644 index 0000000..4a8e43b --- /dev/null +++ b/k8s/prometheus/prometheus.yaml @@ -0,0 +1,55 @@ +apiVersion: v1 +kind: Service +metadata: { name: prometheus, namespace: monitoring } +spec: + ports: [{ port: 9090, targetPort: 9090 }] + selector: { app: prometheus } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: prometheus, namespace: monitoring } +spec: + 
serviceName: prometheus + replicas: 1 + selector: { matchLabels: { app: prometheus } } + template: + metadata: { labels: { app: prometheus } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: prometheus + image: prom/prometheus:v2.53.0 + args: ["--config.file=/etc/prometheus/prometheus.yml","--storage.tsdb.path=/prometheus"] + ports: [{ containerPort: 9090 }] + volumeMounts: + - { name: data, mountPath: /prometheus } + - { name: config, mountPath: /etc/prometheus } + volumes: + - { name: config, configMap: { name: prometheus-config } } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + namespace: monitoring + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth-prometheus + nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["prometheus.betelgeusebytes.io"], secretName: prometheus-tls }] + rules: + - host: prometheus.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: prometheus, port: { number: 9090 } } } diff --git a/k8s/redis/redis-pv.yaml b/k8s/redis/redis-pv.yaml new file mode 100644 index 0000000..7ccc1da --- /dev/null +++ b/k8s/redis/redis-pv.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-redis +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/redis + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 \ No newline at end of file diff --git a/k8s/redis/redis.yaml b/k8s/redis/redis.yaml new file mode 100644 index 0000000..0ac5cff --- /dev/null +++ b/k8s/redis/redis.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: Service +metadata: { name: redis, namespace: db } +spec: + ports: [{ port: 6379, targetPort: 6379 }] + selector: { app: redis } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: redis, namespace: db } +spec: + serviceName: redis + replicas: 1 + selector: { matchLabels: { app: redis } } + template: + metadata: { labels: { app: redis } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: redis + image: redis:7 + args: ["--requirepass", "$(REDIS_PASSWORD)"] + env: + - name: REDIS_PASSWORD + valueFrom: { secretKeyRef: { name: redis-auth, key: REDIS_PASSWORD } } + ports: [{ containerPort: 6379 }] + volumeMounts: + - { name: data, mountPath: /data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 10Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: redis-auth, namespace: db } +type: Opaque +stringData: { REDIS_PASSWORD: "RED1S" } diff --git a/k8s/scripts/cleanup.sh b/k8s/scripts/cleanup.sh new file mode 100755 index 0000000..1fe758c --- /dev/null +++ b/k8s/scripts/cleanup.sh @@ -0,0 +1,319 @@ +#!/bin/bash + +set -e + +echo "==========================================================" +echo "Removing Existing Monitoring Stack" +echo "==========================================================" +echo "" + +RED='\033[0;31m' 
+YELLOW='\033[1;33m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}" +echo " - Prometheus (standalone or operator)" +echo " - Grafana" +echo " - Fluent Bit" +echo " - Vector" +echo " - Loki" +echo " - Tempo" +echo " - Node exporters" +echo " - kube-state-metrics" +echo " - Any monitoring/prometheus/grafana namespaces" +echo "" +echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}" +echo "" +read -p "Are you sure you want to continue? (yes/no): " confirm + +if [ "$confirm" != "yes" ]; then + echo "Cleanup cancelled." + exit 0 +fi + +echo "" +echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}" + +# Common namespace names for monitoring +NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging") + +for ns in "${NAMESPACES[@]}"; do + if kubectl get namespace "$ns" &> /dev/null; then + echo -e "${GREEN}Found namespace: $ns${NC}" + + # Show what's in the namespace + echo " Resources in $ns:" + kubectl get all -n "$ns" 2>/dev/null | head -20 || true + echo "" + + read -p " Delete namespace '$ns'? (yes/no): " delete_ns + if [ "$delete_ns" = "yes" ]; then + echo " Deleting namespace $ns..." + kubectl delete namespace "$ns" --timeout=120s || { + echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}" + kubectl delete namespace "$ns" --grace-period=0 --force & + } + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}" + +# Check if helm is available +if command -v helm &> /dev/null; then + echo "Checking for Helm releases..." + + # Common Helm release names + RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack") + + for release in "${RELEASES[@]}"; do + # Check all namespaces for the release + if helm list -A | grep -q "$release"; then + ns=$(helm list -A | grep "$release" | awk '{print $2}') + echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}" + read -p " Uninstall Helm release '$release'? (yes/no): " uninstall + if [ "$uninstall" = "yes" ]; then + echo " Uninstalling $release..." + helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}" + fi + fi + done +else + echo "Helm not found, skipping Helm releases check" +fi + +echo "" +echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}" + +# Remove common DaemonSets in kube-system or default +echo "Checking for monitoring DaemonSets..." +for ns in kube-system default; do + if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then + echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}" + kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector" + read -p " Delete these DaemonSets? 
(yes/no): " delete_ds + if [ "$delete_ds" = "yes" ]; then + kubectl delete daemonset -n "$ns" -l app=node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=vector --ignore-not-found + kubectl delete daemonset -n "$ns" node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" vector --ignore-not-found + fi + fi +done + +# Remove common Deployments +echo "" +echo "Checking for monitoring Deployments..." +for ns in kube-system default; do + if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring Deployments in $ns${NC}" + kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo" + read -p " Delete these Deployments? (yes/no): " delete_deploy + if [ "$delete_deploy" = "yes" ]; then + kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found + kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found + kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found + kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found + kubectl delete deployment -n "$ns" prometheus --ignore-not-found + kubectl delete deployment -n "$ns" grafana --ignore-not-found + kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found + fi + fi +done + +# Remove common StatefulSets +echo "" +echo "Checking for monitoring StatefulSets..." +for ns in kube-system default; do + if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}" + kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + read -p " Delete these StatefulSets? (yes/no): " delete_sts + if [ "$delete_sts" = "yes" ]; then + kubectl delete statefulset -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=grafana --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=loki --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=tempo --ignore-not-found + kubectl delete statefulset -n "$ns" prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" grafana --ignore-not-found + kubectl delete statefulset -n "$ns" loki --ignore-not-found + kubectl delete statefulset -n "$ns" tempo --ignore-not-found + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}" + +# Ask before removing ConfigMaps (they might contain important configs) +echo "Checking for monitoring ConfigMaps..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then + echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}" + kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent" + read -p " Delete these ConfigMaps? 
(yes/no): " delete_cm + if [ "$delete_cm" = "yes" ]; then + kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found + kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found + kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}" + +# Remove monitoring-related RBAC +echo "Checking for monitoring ClusterRoles..." +if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoles${NC}" + kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoles? (yes/no): " delete_cr + if [ "$delete_cr" = "yes" ]; then + kubectl delete clusterrole prometheus --ignore-not-found + kubectl delete clusterrole grafana --ignore-not-found + kubectl delete clusterrole kube-state-metrics --ignore-not-found + kubectl delete clusterrole fluent-bit --ignore-not-found + kubectl delete clusterrole node-exporter --ignore-not-found + fi +fi + +echo "Checking for monitoring ClusterRoleBindings..." +if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}" + kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb + if [ "$delete_crb" = "yes" ]; then + kubectl delete clusterrolebinding prometheus --ignore-not-found + kubectl delete clusterrolebinding grafana --ignore-not-found + kubectl delete clusterrolebinding kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding fluent-bit --ignore-not-found + kubectl delete clusterrolebinding node-exporter --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}" + +# Check for monitoring PVCs +echo "Checking for monitoring PersistentVolumeClaims..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring PVCs in $ns${NC}" + kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}" + read -p " Delete these PVCs? (yes/no): " delete_pvc + if [ "$delete_pvc" = "yes" ]; then + kubectl delete pvc -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete pvc -n "$ns" -l app=grafana --ignore-not-found + kubectl delete pvc -n "$ns" -l app=loki --ignore-not-found + kubectl delete pvc -n "$ns" -l app=tempo --ignore-not-found + # Also try by name patterns + kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true + fi + fi + fi +done + +# Check for monitoring PVs +echo "" +echo "Checking for monitoring PersistentVolumes..." +if kubectl get pv 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|monitoring"; then + echo -e "${GREEN}Found monitoring PVs${NC}" + kubectl get pv | grep -E "prometheus|grafana|loki|tempo|monitoring" + echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}" + read -p " Delete these PVs? 
(yes/no): " delete_pv + if [ "$delete_pv" = "yes" ]; then + kubectl get pv -o name | grep -E "prometheus|grafana|loki|tempo|monitoring" | xargs -r kubectl delete || true + fi +fi + +echo "" +echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}" + +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then + echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}" + kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki" + read -p " Delete these Ingresses? (yes/no): " delete_ing + if [ "$delete_ing" = "yes" ]; then + kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found + kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found + kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}" + +# Check for Prometheus Operator CRDs +if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then + echo -e "${GREEN}Found Prometheus Operator CRDs${NC}" + kubectl get crd | grep "monitoring.coreos.com" + echo "" + echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}" + read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd + if [ "$delete_crd" = "yes" ]; then + kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found + kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found + kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found + kubectl delete crd probes.monitoring.coreos.com --ignore-not-found + kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}" +echo "" +echo "You may have monitoring data stored on your nodes at:" +echo " - /mnt/local-ssd/prometheus" +echo " - /mnt/local-ssd/grafana" +echo " - /mnt/local-ssd/loki" +echo " - /mnt/local-ssd/tempo" +echo " - /var/lib/prometheus" +echo " - /var/lib/grafana" +echo "" +echo "To remove these, SSH to each node and run:" +echo " sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}" +echo " sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}" +echo "" +read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs + +echo "" +echo -e "${GREEN}==========================================================" +echo "Existing Monitoring Stack Cleanup Complete!" +echo "==========================================================${NC}" +echo "" +echo "Summary of actions taken:" +echo " - Removed monitoring namespaces (if confirmed)" +echo " - Uninstalled Helm releases (if found and confirmed)" +echo " - Removed standalone monitoring components" +echo " - Removed monitoring ConfigMaps" +echo " - Removed RBAC resources" +echo " - Removed PVCs and PVs (if confirmed)" +echo " - Removed Ingresses" +echo " - Removed Prometheus Operator CRDs (if confirmed)" +echo "" +echo -e "${YELLOW}Next Steps:${NC}" +echo "1. 
Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'" +echo "2. Clean up node data directories (see above)" +echo "3. Deploy new observability stack: ./deploy.sh" +echo "" \ No newline at end of file diff --git a/k8s/sso/sso.yaml b/k8s/sso/sso.yaml new file mode 100644 index 0000000..6311842 --- /dev/null +++ b/k8s/sso/sso.yaml @@ -0,0 +1,98 @@ +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-auth +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/auth + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +# k8s/auth/keycloak/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: keycloak-admin, namespace: db } +type: Opaque +stringData: { KEYCLOAK_ADMIN: "admin", KEYCLOAK_ADMIN_PASSWORD: "admin" } + +--- +# k8s/auth/keycloak/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: keycloak-data, namespace: db } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 10Gi } } + +--- +# k8s/auth/keycloak/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: keycloak, namespace: db } +spec: + replicas: 1 + selector: { matchLabels: { app: keycloak } } + template: + metadata: { labels: { app: keycloak } } + spec: + # Ensure the PV is owned by the Keycloak UID/GID + securityContext: + fsGroup: 1000 + initContainers: + - name: fix-permissions + image: busybox + command: ['sh', '-c', 'chown -R 1000:1000 /opt/keycloak/data && chmod -R 755 /opt/keycloak/data'] + volumeMounts: + - name: data + mountPath: /opt/keycloak/data + containers: + - name: keycloak + image: quay.io/keycloak/keycloak:latest + args: ["start","--http-enabled=true","--proxy-headers=xforwarded","--hostname-strict=false"] + env: + - { name: KEYCLOAK_ADMIN, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN } } } + - { name: KEYCLOAK_ADMIN_PASSWORD, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN_PASSWORD } } } + ports: [{ containerPort: 8080 }] + volumeMounts: [{ name: data, mountPath: /opt/keycloak/data }] + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + volumes: + - name: data + persistentVolumeClaim: { claimName: keycloak-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: keycloak, namespace: db } +spec: { selector: { app: keycloak }, ports: [ { port: 80, targetPort: 8080 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: keycloak + namespace: db + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["auth.betelgeusebytes.io"], secretName: keycloak-tls }] + rules: + - host: auth.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: keycloak, port: { number: 80 } } } diff --git a/k8s/storage/persistent-volumes.yaml b/k8s/storage/persistent-volumes.yaml new file mode 100644 index 0000000..fa0db43 --- /dev/null +++ b/k8s/storage/persistent-volumes.yaml @@ -0,0 +1,175 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-postgres +spec: + capacity: + storage: 80Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/postgres + nodeAffinity: + required: + nodeSelectorTerms: + - 
matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-elasticsearch +spec: + capacity: + storage: 300Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-gitea +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/gitea + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-jupyter +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/jupyter + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-kafka +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/kafka + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-data +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-data + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-log +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-log + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-prometheus +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/k8s/storage/storageclass.yaml b/k8s/storage/storageclass.yaml new file mode 100644 index 0000000..ed7d4e3 --- /dev/null +++ b/k8s/storage/storageclass.yaml @@ -0,0 +1,6 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: local-ssd-hetzner +provisioner: kubernetes.io/no-provisioner +volumeBindingMode: WaitForFirstConsumer diff --git a/k8s/tei/tei.yaml b/k8s/tei/tei.yaml new file mode 100644 index 0000000..ae1549e --- /dev/null +++ b/k8s/tei/tei.yaml @@ -0,0 +1,37 @@ +# k8s/ai/tei/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: tei, namespace: ml 
} +spec: + replicas: 1 + selector: { matchLabels: { app: tei } } + template: + metadata: { labels: { app: tei } } + spec: + containers: + - name: tei + image: ghcr.io/huggingface/text-embeddings-inference:cpu-latest + env: [{ name: MODEL_ID, value: "mixedbread-ai/mxbai-embed-large-v1" }] + ports: [{ containerPort: 80 }] +--- +apiVersion: v1 +kind: Service +metadata: { name: tei, namespace: ml } +spec: { selector: { app: tei }, ports: [ { port: 80, targetPort: 80 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tei + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["embeddings.betelgeusebytes.io"], secretName: tei-tls }] + rules: + - host: embeddings.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: tei, port: { number: 80 } } } diff --git a/k8s/trading/ib-gateway.yaml b/k8s/trading/ib-gateway.yaml new file mode 100644 index 0000000..31bbaaa --- /dev/null +++ b/k8s/trading/ib-gateway.yaml @@ -0,0 +1,541 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: trading + labels: + name: trading + environment: production +--- +# OPTIONAL: Use this if you want to persist IB Gateway settings/logs +# across pod restarts. For most use cases, this is NOT needed since +# IB Gateway is mostly stateless and credentials are in Secrets. +# +# Only create this PV/PVC if you need to persist: +# - TWS session data +# - Custom workspace layouts +# - Historical API usage logs + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: ib-gateway-data + labels: + type: local + app: ib-gateway +spec: + capacity: + storage: 5Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/ib-gateway # Adjust to your local SSD path + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ib-gateway-data + namespace: trading +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: local-storage + selector: + matchLabels: + app: ib-gateway + +# To use this PVC, add to Deployment volumeMounts: +# - name: data +# mountPath: /root/Jts +# And to volumes: +# - name: data +# persistentVolumeClaim: +# claimName: ib-gateway-data +--- +apiVersion: v1 +kind: Secret +metadata: + name: ib-credentials + namespace: trading +type: Opaque +stringData: + # IMPORTANT: Replace these with your actual IB credentials + # For paper trading, use your paper trading account + username: "saladin85" + password: "3Lcd@05041985" + # Trading mode: "paper" or "live" + trading-mode: "paper" + + # IB Gateway config (jts.ini equivalent) + # This enables headless mode and configures ports + ibgateway.conf: | + [IBGateway] + TradingMode=paper + ApiOnly=true + ReadOnlyApi=false + TrustedIPs=127.0.0.1 + + [IBGatewayAPI] + ApiPortNumber=4002 + + [Logon] + UseRemoteSettings=no + Locale=en + ColorPaletteName=dark + + [Display] + ShowSplashScreen=no +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ib-gateway-config + namespace: trading +data: + # Startup script to configure IB Gateway for headless operation + startup.sh: | + #!/bin/bash + set -e + + echo "Starting IB Gateway in headless mode..." 
+ echo "Trading Mode: ${TRADING_MODE}" + echo "Port: ${TWS_PORT}" + + # Configure based on trading mode + if [ "${TRADING_MODE}" == "live" ]; then + export TWS_PORT=4001 + echo "⚠️ LIVE TRADING MODE - USE WITH CAUTION ⚠️" + else + export TWS_PORT=4002 + echo "📝 Paper Trading Mode (Safe)" + fi + # IMPORTANT: use the env vars provided by the Deployment + export IB_USERNAME="${TWS_USERID}" + export IB_PASSWORD="${TWS_PASSWORD}" + + # Start IB Gateway + exec /opt/ibgateway/ibgateway-latest-standalone-linux-x64.sh \ + --tws-path=/root/Jts \ + --tws-settings-path=/root \ + --user="${IB_USERNAME}" \ + --pw="${IB_PASSWORD}" \ + --mode="${TRADING_MODE}" \ + --port="${TWS_PORT}" + + # Health check script + healthcheck.sh: | + #!/bin/bash + # Check if TWS API port is listening + # PORT=${TWS_PORT:-4002} + # nc -z localhost $PORT + # exit $? + #!/bin/sh + # Pure-python TCP check (no nc required) + PORT="${TWS_PORT:-4002}" + python - <<'PY' + import os, socket, sys + port = int(os.environ.get("TWS_PORT", os.environ.get("PORT", "4002"))) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(2) + try: + s.connect(("127.0.0.1", port)) + sys.exit(0) + except Exception: + sys.exit(1) + finally: + s.close() + PY +--- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: ib-gateway +# namespace: trading +# labels: +# app: ib-gateway +# component: trading-infrastructure +# spec: +# replicas: 1 # IB Gateway should only have 1 instance per account +# strategy: +# type: Recreate # Avoid multiple simultaneous logins +# selector: +# matchLabels: +# app: ib-gateway +# template: +# metadata: +# labels: +# app: ib-gateway +# annotations: +# prometheus.io/scrape: "false" # No metrics endpoint by default +# spec: +# # Pin to hetzner-2 (matches your existing pattern) +# nodeSelector: +# kubernetes.io/hostname: hetzner-2 + +# # Security context +# securityContext: +# runAsNonRoot: false # IB Gateway requires root for VNC (even if unused) +# fsGroup: 1000 + +# containers: +# - name: ib-gateway +# # Using community-maintained IB Gateway image +# # Alternative: waytrade/ib-gateway:latest +# image: ghcr.io/gnzsnz/ib-gateway:stable +# imagePullPolicy: IfNotPresent + +# env: +# - name: TWS_USERID +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: username +# - name: TWS_PASSWORD +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: password +# - name: TRADING_MODE +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: trading-mode +# - name: TWS_PORT +# value: "4002" # Default to paper trading +# - name: READ_ONLY_API +# value: "no" + +# # Ports +# ports: +# - name: paper-trading +# containerPort: 4002 +# protocol: TCP +# - name: live-trading +# containerPort: 4001 +# protocol: TCP +# - name: vnc +# containerPort: 5900 +# protocol: TCP # VNC (not exposed externally) + +# # Resource limits +# resources: +# requests: +# memory: "1Gi" +# cpu: "500m" +# limits: +# memory: "2Gi" +# cpu: "1000m" + +# # Liveness probe (check if API port is responsive) +# startupProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 60 # Wait 60s before first check +# periodSeconds: 10 # Check every 10s +# timeoutSeconds: 5 +# failureThreshold: 18 # 60s + (10s * 18) = 240s total startup time + +# livenessProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 0 # IB Gateway takes time to start +# periodSeconds: 60 +# timeoutSeconds: 5 +# failureThreshold: 3 + +# # Readiness probe +# readinessProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 0 +# periodSeconds: 10 +# 
timeoutSeconds: 5 +# failureThreshold: 2 + +# # Volume mounts for config +# volumeMounts: +# - name: ib-config +# mountPath: /root/Jts/jts.ini +# subPath: ibgateway.conf +# - name: startup-script +# mountPath: /startup.sh +# subPath: startup.sh +# - name: data +# mountPath: /root/Jts + +# # Logging to stdout (Fluent Bit will collect) +# # IB Gateway logs go to /root/Jts/log by default +# lifecycle: +# postStart: +# exec: +# command: +# - /bin/sh +# - -c +# - | +# mkdir -p /root/Jts/log +# ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true + +# volumes: +# - name: ib-config +# secret: +# secretName: ib-credentials +# defaultMode: 0644 +# - name: startup-script +# configMap: +# name: ib-gateway-config +# defaultMode: 0755 +# - name: data +# persistentVolumeClaim: +# claimName: ib-gateway-data + +# # Restart policy +# restartPolicy: Always + +# # DNS policy for internal cluster resolution +# dnsPolicy: ClusterFirst +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway + component: trading-infrastructure +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ib-gateway + template: + metadata: + labels: + app: ib-gateway + annotations: + prometheus.io/scrape: "false" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + + securityContext: + runAsNonRoot: false + fsGroup: 1000 + + # Seed writable jts.ini into the PVC once + initContainers: + - name: seed-jts-config + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /data + if [ ! -f /data/jts.ini ]; then + echo "Seeding jts.ini into PVC" + cp /config/ibgateway.conf /data/jts.ini + chmod 644 /data/jts.ini + else + echo "jts.ini already exists in PVC" + fi + volumeMounts: + - name: ib-config + mountPath: /config + readOnly: true + - name: data + mountPath: /data + + containers: + # ------------------------------------------------------------------ + # IB Gateway + # ------------------------------------------------------------------ + - name: ib-gateway + image: ghcr.io/gnzsnz/ib-gateway:stable + imagePullPolicy: IfNotPresent + + env: + - name: TWS_USERID + valueFrom: + secretKeyRef: + name: ib-credentials + key: username + - name: TWS_PASSWORD + valueFrom: + secretKeyRef: + name: ib-credentials + key: password + - name: TRADING_MODE + valueFrom: + secretKeyRef: + name: ib-credentials + key: trading-mode + - name: TWS_PORT + value: "4002" + - name: READ_ONLY_API + value: "no" + + ports: + - name: ib-api-local + containerPort: 4002 + protocol: TCP + - name: live-trading + containerPort: 4001 + protocol: TCP + - name: vnc + containerPort: 5900 + protocol: TCP + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + + # IMPORTANT: Probes should check the local IB port (4002) + startupProbe: + tcpSocket: + port: 4002 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + + livenessProbe: + tcpSocket: + port: 4002 + periodSeconds: 60 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + tcpSocket: + port: 4002 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 2 + + volumeMounts: + - name: data + mountPath: /root/Jts + + lifecycle: + postStart: + exec: + command: + - sh + - -c + - | + mkdir -p /root/Jts/log + ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true + + # ------------------------------------------------------------------ + # Sidecar TCP proxy: accepts cluster traffic, forwards to localhost:4002 + # 
------------------------------------------------------------------ + - name: ib-api-proxy + image: alpine/socat:1.8.0.0 + imagePullPolicy: IfNotPresent + args: + - "-d" + - "-d" + - "TCP-LISTEN:4003,fork,reuseaddr" + - "TCP:127.0.0.1:4002" + ports: + - name: ib-api + containerPort: 4003 + protocol: TCP + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "128Mi" + cpu: "100m" + # basic probe: is proxy listening + readinessProbe: + tcpSocket: + port: 4003 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + + volumes: + - name: ib-config + secret: + secretName: ib-credentials + defaultMode: 0644 + + - name: data + persistentVolumeClaim: + claimName: ib-gateway-data + + restartPolicy: Always + dnsPolicy: ClusterFirst + + +--- +# apiVersion: v1 +# kind: Service +# metadata: +# name: ib-gateway +# namespace: trading +# labels: +# app: ib-gateway +# spec: +# type: ClusterIP # Internal-only, not exposed publicly +# clusterIP: None # Headless service (optional, remove if you want a stable ClusterIP) +# selector: +# app: ib-gateway +# ports: +# - name: paper-trading +# port: 4002 +# targetPort: 4002 +# protocol: TCP +# - name: live-trading +# port: 4001 +# targetPort: 4001 +# protocol: TCP +# sessionAffinity: ClientIP # Stick to same pod (important for stateful TWS sessions) +# sessionAffinityConfig: +# clientIP: +# timeoutSeconds: 3600 # 1 hour session stickiness + +apiVersion: v1 +kind: Service +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway +spec: + type: ClusterIP + selector: + app: ib-gateway + ports: + - name: paper-trading + port: 4002 + targetPort: 4003 # <-- proxy sidecar, not the gateway directly + protocol: TCP + - name: live-trading + port: 4001 + targetPort: 4001 + protocol: TCP + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 diff --git a/k8s/trading/ib-gateway2.yaml b/k8s/trading/ib-gateway2.yaml new file mode 100644 index 0000000..81fc23b --- /dev/null +++ b/k8s/trading/ib-gateway2.yaml @@ -0,0 +1,169 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: trading + labels: + name: trading + environment: production +--- +apiVersion: v1 +kind: Secret +metadata: + name: ib-credentials + namespace: trading +type: Opaque +stringData: + # Rotate your creds (you pasted them earlier). 
+ username: "saladin85" + password: "3Lcd@05041985" + trading-mode: "paper" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway + component: trading-infrastructure +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ib-gateway + template: + metadata: + labels: + app: ib-gateway + annotations: + prometheus.io/scrape: "false" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + + # Keep your original security context + securityContext: + runAsNonRoot: false + fsGroup: 1000 + + containers: + - name: ib-gateway + image: ghcr.io/gnzsnz/ib-gateway:stable + imagePullPolicy: IfNotPresent + + # IMPORTANT: use env vars this image expects + env: + - name: TWS_USERID + valueFrom: + secretKeyRef: + name: ib-credentials + key: username + - name: TWS_PASSWORD + valueFrom: + secretKeyRef: + name: ib-credentials + key: password + - name: TRADING_MODE + valueFrom: + secretKeyRef: + name: ib-credentials + key: trading-mode + - name: READ_ONLY_API + value: "no" + + # These two match what your log shows the image uses + - name: API_PORT + value: "4002" + - name: SOCAT_PORT + value: "4004" + + # optional but nice + - name: TIME_ZONE + value: "Etc/UTC" + - name: TWOFA_TIMEOUT_ACTION + value: "exit" + + ports: + # IB API ports (inside container / localhost use) + - name: api-paper + containerPort: 4002 + protocol: TCP + - name: api-live + containerPort: 4001 + protocol: TCP + + # socat relay port for non-localhost clients (what we expose via Service) + - name: api-socat + containerPort: 4004 + protocol: TCP + + # optional UI/VNC + - name: vnc + containerPort: 5900 + protocol: TCP + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + + # Probe the socat port (represents remote connectivity) + startupProbe: + tcpSocket: + port: 4004 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + + readinessProbe: + tcpSocket: + port: 4004 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 2 + + livenessProbe: + tcpSocket: + port: 4004 + periodSeconds: 60 + timeoutSeconds: 5 + failureThreshold: 3 + + restartPolicy: Always + dnsPolicy: ClusterFirst +--- +apiVersion: v1 +kind: Service +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway +spec: + type: ClusterIP + selector: + app: ib-gateway + ports: + # Clients connect to 4002, but we forward to SOCAT_PORT=4004 + - name: paper-trading + port: 4002 + targetPort: 4004 + protocol: TCP + + # If you truly need live, you should relay live via another socat port too. + # For now keep it direct (or remove it entirely for safety). 
+ - name: live-trading + port: 4001 + targetPort: 4001 + protocol: TCP + + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 diff --git a/k8s/vector/qdrant.yaml b/k8s/vector/qdrant.yaml new file mode 100644 index 0000000..b035db9 --- /dev/null +++ b/k8s/vector/qdrant.yaml @@ -0,0 +1,80 @@ +# k8s/vec/qdrant/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: qdrant-data, namespace: db} +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } + +--- +# k8s/vec/qdrant/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: qdrant, namespace: db} +spec: + replicas: 1 + selector: { matchLabels: { app: qdrant } } + template: + metadata: { labels: { app: qdrant } } + spec: + containers: + - name: qdrant + image: qdrant/qdrant:latest + ports: + - { containerPort: 6333 } # HTTP + Web UI + - { containerPort: 6334 } # gRPC + volumeMounts: + - { name: data, mountPath: /qdrant/storage } + volumes: + - name: data + persistentVolumeClaim: { claimName: qdrant-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: qdrant, namespace: db} +spec: + selector: { app: qdrant } + ports: + - { name: http, port: 80, targetPort: 6333 } + - { name: grpc, port: 6334, targetPort: 6334 } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: qdrant + namespace: db + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["vector.betelgeusebytes.io"], secretName: qdrant-tls }] + rules: + - host: vector.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: qdrant, port: { number: 80 } } } +--- +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-qdrant +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/qdrant + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/k8s/vllm/vllm.yaml b/k8s/vllm/vllm.yaml new file mode 100644 index 0000000..1d7fb6f --- /dev/null +++ b/k8s/vllm/vllm.yaml @@ -0,0 +1,142 @@ +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-vllm +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/vllm + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +# k8s/ai/vllm/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: vllm-auth, namespace: ml } +type: Opaque +stringData: { API_KEY: "replace_me" } + +--- +# k8s/ai/ollama/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: ollama, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: ollama } } + template: + metadata: { labels: { app: ollama } } + spec: + securityContext: + runAsUser: 0 # needed so the init can write into /root/.ollama + initContainers: + - name: warm-models + image: ollama/ollama:latest + command: ["/bin/sh","-c"] + args: + - | + ollama serve & # start a temp daemon + sleep 2 + # pull one or more small, quantized models for CPU + ollama pull qwen2.5:3b-instruct-q4_K_M || true + ollama pull llama3.2:3b-instruct-q4_K_M || true + pkill ollama || true + volumeMounts: + - { 
name: data, mountPath: /root/.ollama } + containers: + - name: ollama + image: ollama/ollama:latest + env: + - { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser + ports: + - { containerPort: 11434 } + volumeMounts: + - { name: data, mountPath: /root/.ollama } + resources: + requests: { cpu: "2", memory: "4Gi" } + limits: { cpu: "4", memory: "8Gi" } + volumes: + - name: data + persistentVolumeClaim: { claimName: ollama-data } + +--- +# k8s/ai/ollama/svc-ing.yaml +apiVersion: v1 +kind: Service +metadata: { name: ollama, namespace: ml } +spec: + selector: { app: ollama } + ports: [ { name: http, port: 80, targetPort: 11434 } ] + +# --- +# # old k8s/ai/vllm/deploy.yaml +# apiVersion: apps/v1 +# kind: Deployment +# metadata: { name: vllm, namespace: ml } +# spec: +# replicas: 1 +# selector: { matchLabels: { app: vllm } } +# template: +# metadata: { labels: { app: vllm } } +# spec: +# containers: +# - name: vllm +# image: vllm/vllm-openai:latest +# args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"] +# env: +# - name: VLLM_API_KEY +# valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } } +# ports: [{ containerPort: 8000 }] +# resources: +# limits: +# nvidia.com/gpu: 1 +# requests: +# nvidia.com/gpu: 1 +# volumeMounts: +# - { name: cache, mountPath: /root/.cache/huggingface } +# volumes: +# - name: cache +# persistentVolumeClaim: { claimName: vllm-cache-pvc } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: ollama-data, namespace: ml } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +# --- +#old k8s/ai/vllm/svc-ing.yaml +# apiVersion: v1 +# kind: Service +# metadata: { name: vllm, namespace: ml } +# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vllm + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }] + rules: + - host: llm.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + # route to the active ollama Service; the old vllm Service above is commented out + backend: { service: { name: ollama, port: { number: 80 } } }
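A quick end-to-end check of the Ollama backend once the pods are up. This assumes the warm-models init container managed to pull qwen2.5:3b-instruct-q4_K_M and that the commands run from a machine with cluster access; the Ingress check only works after the llm.betelgeusebytes.io certificate has been issued:

# Call the Ollama HTTP API through the in-cluster Service (port 80 -> 11434).
kubectl run ollama-smoke -n ml --rm -it --restart=Never --image=curlimages/curl -- \
  curl -s http://ollama.ml.svc.cluster.local/api/generate \
    -d '{"model":"qwen2.5:3b-instruct-q4_K_M","prompt":"Reply with the single word: ok","stream":false}'

# List the available models through the public Ingress.
curl -s https://llm.betelgeusebytes.io/api/tags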