commit dfdd36db3ff8ab3f1a23d6e0d0234284c566e910 Author: salahangal Date: Sun Jan 25 21:15:43 2026 +0100 adding betelgeusebytes.io deops part diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..59c966f Binary files /dev/null and b/.DS_Store differ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..3a7aaa9 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,177 @@ +# CLAUDE.md - BetelgeuseBytes Full Stack + +## Project Overview + +Kubernetes cluster deployment for BetelgeuseBytes using Ansible for infrastructure automation and kubectl for application deployment. This is a complete data science/ML platform with integrated observability, databases, and ML tools. + +**Infrastructure:** +- 2-node Kubernetes cluster on Hetzner Cloud +- Control plane + worker: hetzner-1 (95.217.89.53) +- Worker node: hetzner-2 (138.201.254.97) +- Kubernetes v1.30.3 with Cilium CNI + +## Directory Structure + +``` +. +├── ansible/ # Infrastructure-as-Code for cluster setup +│ ├── inventories/prod/ # Hetzner nodes inventory & group vars +│ │ ├── hosts.ini # Node definitions +│ │ └── group_vars/all.yml # Global K8s config (versions, CIDRs) +│ ├── playbooks/ +│ │ ├── site.yml # Main cluster bootstrap playbook +│ │ └── add-control-planes.yml # HA control plane expansion +│ └── roles/ # 16 reusable Ansible roles +│ ├── common/ # Swap disable, kernel modules, sysctl +│ ├── containerd/ # Container runtime +│ ├── kubernetes/ # kubeadm, kubelet, kubectl +│ ├── kubeadm_init/ # Primary control plane init +│ ├── kubeadm_join/ # Worker node join +│ ├── cilium/ # CNI plugin +│ ├── ingress/ # NGINX Ingress Controller +│ ├── cert_manager/ # Let's Encrypt integration +│ ├── labels/ # Node labeling +│ └── storage_local_path/ # Local storage provisioning +└── k8s/ # Kubernetes manifests + ├── 00-namespaces.yaml # 8 namespaces + ├── 01-secrets/ # Basic auth secrets + ├── storage/ # StorageClass, PersistentVolumes + ├── postgres/ # PostgreSQL 16 with extensions + ├── redis/ # Redis 7 cache + ├── elastic/ # Elasticsearch 8.14 + Kibana + ├── gitea/ # Git repository service + ├── jupyter/ # JupyterLab notebook + ├── kafka/ # Apache Kafka broker + ├── neo4j/ # Neo4j graph database + ├── prometheus/ # Prometheus monitoring + ├── grafana/ # Grafana dashboards + ├── minio/ # S3-compatible object storage + ├── mlflow/ # ML lifecycle tracking + ├── vllm/ # LLM inference (Ollama) + ├── label_studio/ # Data annotation platform + ├── argoflow/ # Argo Workflows + ├── otlp/ # OpenTelemetry collector + └── observability/ # Fluent-Bit log aggregation +``` + +## Build & Deployment Commands + +### Phase 1: Cluster Infrastructure + +```bash +# Validate connectivity +ansible -i ansible/inventories/prod/hosts.ini all -m ping + +# Bootstrap Kubernetes cluster +ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml +``` + +### Phase 2: Kubernetes Applications (order matters) + +```bash +# 1. Namespaces & storage +kubectl apply -f k8s/00-namespaces.yaml +kubectl apply -f k8s/storage/storageclass.yaml + +# 2. Secrets & auth +kubectl apply -f k8s/01-secrets/ + +# 3. Infrastructure (databases, cache, search) +kubectl apply -f k8s/postgres/ +kubectl apply -f k8s/redis/ +kubectl apply -f k8s/elastic/elasticsearch.yaml +kubectl apply -f k8s/elastic/kibana.yaml + +# 4. Application layer +kubectl apply -f k8s/gitea/ +kubectl apply -f k8s/jupyter/ +kubectl apply -f k8s/kafka/kafka.yaml +kubectl apply -f k8s/kafka/kafka-ui.yaml +kubectl apply -f k8s/neo4j/ + +# 5. 
Observability & telemetry
+kubectl apply -f k8s/otlp/
+kubectl apply -f k8s/observability/fluent-bit.yaml
+kubectl apply -f k8s/prometheus/
+kubectl apply -f k8s/grafana/
+```
+
+## Namespace Organization
+
+| Namespace | Purpose | Services |
+|-----------|---------|----------|
+| `db` | Databases & cache | PostgreSQL, Redis |
+| `scm` | Source control | Gitea |
+| `ml` | Machine Learning | JupyterLab, MLflow, Argo, Label Studio, Ollama |
+| `elastic` | Search & logging | Elasticsearch, Kibana |
+| `broker` | Message brokers | Kafka |
+| `graph` | Graph databases | Neo4j |
+| `monitoring` | Observability | Prometheus, Grafana |
+| `observability` | Telemetry | OpenTelemetry, Fluent-Bit |
+| `storage` | Object storage | MinIO |
+
+## Key Configuration
+
+**Kubernetes:**
+- Pod CIDR: 10.244.0.0/16
+- Service CIDR: 10.96.0.0/12
+- CNI: Cilium v1.15.7
+
+**Storage:**
+- StorageClass: `local-ssd-hetzner` (local volumes)
+- All stateful workloads pinned to hetzner-2
+- Local path: `/mnt/local-ssd/{service-name}`
+
+**Networking:**
+- Internal DNS: `service.namespace.svc.cluster.local`
+- External: `{service}.betelgeusebytes.io` via NGINX Ingress
+- TLS: Let's Encrypt via cert-manager
+
+## DNS Records
+
+A records point to both nodes:
+- `apps.betelgeusebytes.io` → 95.217.89.53, 138.201.254.97
+
+CNAMEs to `apps.betelgeusebytes.io`:
+- gitea, kibana, grafana, prometheus, notebook, broker, neo4j, otlp, label, llm, mlflow, minio
+
+## Secrets Location
+
+- `k8s/01-secrets/basic-auth.yaml` - HTTP basic auth for protected services
+- Service-specific secrets inline in respective manifests (e.g., postgres-auth, redis-auth)
+
+## Manifest Conventions
+
+1. Compact YAML style: `metadata: { name: xyz, namespace: ns }`
+2. StatefulSets for persistent services (databases, brokers)
+3. Deployments for stateless services (web UIs, workers)
+4. DaemonSets for node-level agents (Fluent-Bit)
+5. Service port=80 for ingress routing, backend maps to container port
+6. Ingress with TLS + basic auth annotations where needed
+
+## Common Operations
+
+```bash
+# Check cluster status
+kubectl get nodes
+kubectl get pods -A
+
+# View logs for a service
+kubectl logs -n <namespace> -l app=<app>
+
+# Scale a deployment
+kubectl scale -n <namespace> deployment/<name> --replicas=N
+
+# Apply changes to a specific service
+kubectl apply -f k8s/<service>/
+
+# Delete and recreate a service
+kubectl delete -f k8s/<service>/ && kubectl apply -f k8s/<service>/
+```
+
+## Notes
+
+- This is a development/test setup; passwords are hardcoded in manifests
+- Elasticsearch security is disabled for development
+- GPU support for vLLM is commented out (requires nvidia.com/gpu resources)
+- Neo4j Bolt protocol (7687) requires manual ingress-nginx TCP patch
diff --git a/DNS_RECORDS.txt b/DNS_RECORDS.txt
new file mode 100644
index 0000000..4bfbcae
--- /dev/null
+++ b/DNS_RECORDS.txt
@@ -0,0 +1,10 @@
+apps.betelgeusebytes.io. 300 IN A 95.217.89.53
+apps.betelgeusebytes.io. 300 IN A 138.201.254.97
+gitea.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+kibana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+grafana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+prometheus.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+notebook.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+broker.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+neo4j.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
+otlp.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
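Once these records have propagated, a quick sanity check from any workstation can confirm the setup (a minimal sketch; it assumes `dig` and `curl` are installed locally and that ingress-nginx is already serving ports 80/443 on the nodes):

```bash
# A records should return both node IPs
dig +short A apps.betelgeusebytes.io

# CNAMEs should resolve to the apex record
dig +short CNAME grafana.betelgeusebytes.io

# The ingress should answer for a published host
# (use -k until the Let's Encrypt certificate has been issued)
curl -kI https://grafana.betelgeusebytes.io
```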
diff --git a/README.md b/README.md new file mode 100644 index 0000000..d145404 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# BetelgeuseBytes K8s — Full Stack (kubectl-only) + +**Nodes** +- Control-plane + worker: hetzner-1 (95.217.89.53) +- Worker: hetzner-2 (138.201.254.97) + +## Bring up the cluster +```bash +ansible -i ansible/inventories/prod/hosts.ini all -m ping +ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml +``` + +## Apply apps (edit secrets first) +```bash +kubectl apply -f k8s/00-namespaces.yaml +kubectl apply -f k8s/01-secrets/ +kubectl apply -f k8s/storage/storageclass.yaml + +kubectl apply -f k8s/postgres/ +kubectl apply -f k8s/redis/ +kubectl apply -f k8s/elastic/elasticsearch.yaml +kubectl apply -f k8s/elastic/kibana.yaml + +kubectl apply -f k8s/gitea/ +kubectl apply -f k8s/jupyter/ +kubectl apply -f k8s/kafka/kafka.yaml +kubectl apply -f k8s/kafka/kafka-ui.yaml +kubectl apply -f k8s/neo4j/ + +kubectl apply -f k8s/otlp/ +kubectl apply -f k8s/observability/fluent-bit.yaml +kubectl apply -f k8s/prometheus/ +kubectl apply -f k8s/grafana/ +``` + +## DNS +A records: +- apps.betelgeusebytes.io → 95.217.89.53, 138.201.254.97 + +CNAMEs → apps.betelgeusebytes.io: +- gitea., kibana., grafana., prometheus., notebook., broker., neo4j., otlp. + +(HA later) cp.k8s.betelgeusebytes.io → , 95.217.89.53, 138.201.254.97; then set control_plane_endpoint accordingly. diff --git a/ansible/inventories/prod/group_vars/all.yml b/ansible/inventories/prod/group_vars/all.yml new file mode 100644 index 0000000..eb233d1 --- /dev/null +++ b/ansible/inventories/prod/group_vars/all.yml @@ -0,0 +1,13 @@ +cluster_name: prod +k8s_version: "v1.30.3" +control_plane_endpoint: "95.217.89.53:6443" # switch later to cp.k8s.betelgeusebytes.io:6443 + +pod_cidr: "10.244.0.0/16" +service_cidr: "10.96.0.0/12" +cilium_version: "1.15.7" + +local_path_dir: "/srv/k8s" +local_sc_name: "local-ssd-hetzner" + +stateful_node_label_key: "node" +stateful_node_label_val: "hetzner-2" diff --git a/ansible/inventories/prod/hosts.ini b/ansible/inventories/prod/hosts.ini new file mode 100644 index 0000000..c4813a9 --- /dev/null +++ b/ansible/inventories/prod/hosts.ini @@ -0,0 +1,19 @@ +[k8s_control_plane] +hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 + +[k8s_workers] +hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 +hetzner-2 ansible_host=138.201.254.97 public_ip=138.201.254.97 wg_address=10.66.0.12 + +[k8s_nodes:children] +k8s_control_plane +k8s_workers + +# add tiny VPS control-planes here when ready +[new_control_planes] +# cp-a ansible_host= public_ip= wg_address=10.66.0.10 + +[all:vars] +ansible_user=root +ansible_password=3Lcd0504 +ansible_become=true diff --git a/ansible/playbooks/add-control-planes.yml b/ansible/playbooks/add-control-planes.yml new file mode 100644 index 0000000..e3671c0 --- /dev/null +++ b/ansible/playbooks/add-control-planes.yml @@ -0,0 +1,19 @@ +- hosts: k8s_control_plane[0] + become: yes + roles: + - kubeadm_cp_discovery + +- hosts: new_control_planes + become: yes + roles: + - common + - wireguard + - containerd + - kubernetes + +- hosts: new_control_planes + become: yes + roles: + - kubeadm_join_cp + vars: + kubeadm_cp_join_cmd: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_cp_join_cmd | default(kubeadm_cp_join_cmd) }}" diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml new file mode 100644 index 0000000..c3e447d --- /dev/null +++ b/ansible/playbooks/site.yml @@ -0,0 +1,31 @@ 
+- hosts: k8s_nodes + become: yes + # serial: 1 + roles: + # - ../roles/common + #- ../roles/wireguard + #- ../roles/containerd + #- ../roles/kubernetes + +- hosts: k8s_control_plane + become: yes + roles: + - ../roles/kubeadm_init + +# - hosts: k8s_workers +# become: yes +# roles: +# - ../roles/kubeadm_join + +- hosts: k8s_control_plane + become: yes + roles: + # - ../roles/cilium + # - ../roles/ingress + #- ../roles/cert_manager + +- hosts: k8s_nodes + become: yes + roles: + #- ../roles/storage_local_path + - ../roles/labels diff --git a/ansible/roles/cert_manager/tasks/main.yml b/ansible/roles/cert_manager/tasks/main.yml new file mode 100644 index 0000000..607e854 --- /dev/null +++ b/ansible/roles/cert_manager/tasks/main.yml @@ -0,0 +1,66 @@ +- name: Install cert-manager + shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml + +- name: Wait for cert-manager pods to be ready + shell: kubectl wait --for=condition=ready --timeout=300s pod -l app.kubernetes.io/instance=cert-manager -n cert-manager + +- name: Wait for webhook endpoint to be ready + shell: | + for i in {1..30}; do + if kubectl get endpoints cert-manager-webhook -n cert-manager -o jsonpath='{.subsets[*].addresses[*].ip}' | grep -q .; then + echo "Webhook endpoint is ready" + exit 0 + fi + echo "Waiting for webhook endpoint... attempt $i/30" + sleep 2 + done + exit 1 + +- name: Test webhook connectivity + shell: kubectl run test-webhook --image=curlimages/curl:latest --rm -i --restart=Never -- curl -k https://cert-manager-webhook.cert-manager.svc:443/healthz + register: webhook_test + ignore_errors: yes + +- name: Display webhook test result + debug: + var: webhook_test + +- name: ClusterIssuer + copy: + dest: /root/cluster-issuer-prod.yaml + content: | + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: letsencrypt-prod + spec: + acme: +- name: ClusterIssuer + copy: + dest: /root/cluster-issuer-prod.yaml + content: | + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: letsencrypt-prod + spec: + acme: + email: admin@betelgeusebytes.io + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-prod-key + solvers: + - http01: + ingress: + class: nginx + +- name: Temporarily disable cert-manager webhook + shell: | + kubectl delete validatingwebhookconfiguration cert-manager-webhook || true + ignore_errors: yes + +- name: Apply ClusterIssuer + command: kubectl apply -f /root/cluster-issuer-prod.yaml + +- name: Reinstall cert-manager to restore webhook + shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml new file mode 100644 index 0000000..4acd7d1 --- /dev/null +++ b/ansible/roles/cilium/tasks/main.yml @@ -0,0 +1,9 @@ +- name: Install cilium CLI + shell: | + curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz + tar xzf cilium-linux-amd64.tar.gz -C /usr/local/bin + args: { creates: /usr/local/bin/cilium } + +- name: Deploy cilium + shell: | + cilium install --version {{ cilium_version }} --set kubeProxyReplacement=strict --set bpf.masquerade=true diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000..0819493 --- /dev/null +++ b/ansible/roles/common/tasks/main.yml @@ -0,0 +1,31 @@ +- name: Disable swap + 
command: swapoff -a + when: ansible_swaptotal_mb|int > 0 + +- name: Ensure swap disabled on boot + replace: + path: /etc/fstab + regexp: '^([^#].*\sswap\s)' + replace: '# \1' + +- name: Kernel modules + copy: + dest: /etc/modules-load.d/containerd.conf + content: | + overlay + br_netfilter + +- name: Load modules + command: modprobe {{ item }} + loop: [overlay, br_netfilter] + +- name: Sysctl for k8s + copy: + dest: /etc/sysctl.d/99-kubernetes.conf + content: | + net.bridge.bridge-nf-call-iptables = 1 + net.bridge.bridge-nf-call-ip6tables = 1 + net.ipv4.ip_forward = 1 + vm.max_map_count = 262144 +- name: Apply sysctl + command: sysctl --system diff --git a/ansible/roles/containerd/tasks/main.yml b/ansible/roles/containerd/tasks/main.yml new file mode 100644 index 0000000..02752e4 --- /dev/null +++ b/ansible/roles/containerd/tasks/main.yml @@ -0,0 +1,27 @@ +- name: Install containerd + apt: + name: containerd + state: present + update_cache: yes + +- name: Ensure containerd config directory + file: + path: /etc/containerd + state: directory + mode: '0755' + +- name: Generate default config + shell: containerd config default > /etc/containerd/config.toml + args: { creates: /etc/containerd/config.toml } + +- name: Ensure SystemdCgroup=true + replace: + path: /etc/containerd/config.toml + regexp: 'SystemdCgroup = false' + replace: 'SystemdCgroup = true' + +- name: Restart containerd + service: + name: containerd + state: restarted + enabled: yes diff --git a/ansible/roles/ingress/tasks/main.yml b/ansible/roles/ingress/tasks/main.yml new file mode 100644 index 0000000..ec2c44d --- /dev/null +++ b/ansible/roles/ingress/tasks/main.yml @@ -0,0 +1,2 @@ +- name: Deploy ingress-nginx (baremetal) + shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml diff --git a/ansible/roles/kubeadm_cp_discovery/tasks/main.yml b/ansible/roles/kubeadm_cp_discovery/tasks/main.yml new file mode 100644 index 0000000..bdfa3c7 --- /dev/null +++ b/ansible/roles/kubeadm_cp_discovery/tasks/main.yml @@ -0,0 +1,24 @@ +- name: Upload certs and get certificate key + shell: kubeadm init phase upload-certs --upload-certs | tail -n 1 + register: cert_key + +- name: Compute CA cert hash + shell: | + openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}' + register: ca_hash + +- name: Create short-lived token + shell: kubeadm token create --ttl 30m + register: join_token + +- name: Determine control-plane endpoint + set_fact: + cp_endpoint: "{{ hostvars[inventory_hostname].control_plane_endpoint | default(ansible_host ~ ':6443') }}" + +- set_fact: + kubeadm_cp_join_cmd: >- + kubeadm join {{ cp_endpoint }} + --token {{ join_token.stdout }} + --discovery-token-ca-cert-hash sha256:{{ ca_hash.stdout }} + --control-plane + --certificate-key {{ cert_key.stdout }} diff --git a/ansible/roles/kubeadm_init/tasks/main.yml b/ansible/roles/kubeadm_init/tasks/main.yml new file mode 100644 index 0000000..aadccc4 --- /dev/null +++ b/ansible/roles/kubeadm_init/tasks/main.yml @@ -0,0 +1,24 @@ +# - name: Write kubeadm config +# template: +# src: kubeadm-config.yaml.j2 +# dest: /etc/kubernetes/kubeadm-config.yaml + +# - name: Pre-pull images +# command: kubeadm config images pull + +# - name: Init control-plane +# command: kubeadm init --config=/etc/kubernetes/kubeadm-config.yaml +# args: { creates: /etc/kubernetes/admin.conf } + +# - name: Setup kubeconfig +# shell: | +# mkdir -p 
$HOME/.kube +# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +# chown $(id -u):$(id -g) $HOME/.kube/config + +- name: Save join command + shell: kubeadm token create --print-join-command + register: join_cmd + +- set_fact: + kubeadm_join_command_all: "{{ join_cmd.stdout }}" diff --git a/ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 b/ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 new file mode 100644 index 0000000..014fed2 --- /dev/null +++ b/ansible/roles/kubeadm_init/templates/kubeadm-config.yaml.j2 @@ -0,0 +1,14 @@ +apiVersion: kubeadm.k8s.io/v1beta3 +kind: ClusterConfiguration +kubernetesVersion: {{ k8s_version }} +clusterName: {{ cluster_name }} +controlPlaneEndpoint: "{{ control_plane_endpoint }}" +networking: + podSubnet: "{{ pod_cidr }}" + serviceSubnet: "{{ service_cidr }}" +--- +apiVersion: kubeadm.k8s.io/v1beta3 +kind: InitConfiguration +nodeRegistration: + kubeletExtraArgs: + node-ip: "{{ hostvars[inventory_hostname].wg_address | default(hostvars[inventory_hostname].public_ip) }}" diff --git a/ansible/roles/kubeadm_join/tasks/main.yml b/ansible/roles/kubeadm_join/tasks/main.yml new file mode 100644 index 0000000..5a6101c --- /dev/null +++ b/ansible/roles/kubeadm_join/tasks/main.yml @@ -0,0 +1,2 @@ +- name: Join node to cluster + command: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_join_command_all }} --ignore-preflight-errors=FileAvailable--etc-kubernetes-kubelet.conf,FileAvailable--etc-kubernetes-pki-ca.crt,Port-10250" diff --git a/ansible/roles/kubeadm_join_cp/tasks/main.yml b/ansible/roles/kubeadm_join_cp/tasks/main.yml new file mode 100644 index 0000000..b5e98cc --- /dev/null +++ b/ansible/roles/kubeadm_join_cp/tasks/main.yml @@ -0,0 +1,9 @@ +- name: Ensure join command provided + fail: + msg: "Set kubeadm_cp_join_cmd variable (string)" + when: kubeadm_cp_join_cmd is not defined + +- name: Join node as control-plane + command: "{{ kubeadm_cp_join_cmd }}" + args: + creates: /etc/kubernetes/kubelet.conf diff --git a/ansible/roles/kubernetes/tasks/main.yml b/ansible/roles/kubernetes/tasks/main.yml new file mode 100644 index 0000000..b1f80a1 --- /dev/null +++ b/ansible/roles/kubernetes/tasks/main.yml @@ -0,0 +1,17 @@ +- name: Install Kubernetes apt key + shell: curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg + args: { creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg } + +- name: Add Kubernetes repo + apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /" + state: present + +- name: Install kubeadm, kubelet, kubectl + apt: + name: [kubeadm, kubelet, kubectl] + state: present + update_cache: yes + +- name: Hold kube packages + command: apt-mark hold kubeadm kubelet kubectl diff --git a/ansible/roles/labels/tasks/main.yml b/ansible/roles/labels/tasks/main.yml new file mode 100644 index 0000000..a49379d --- /dev/null +++ b/ansible/roles/labels/tasks/main.yml @@ -0,0 +1,4 @@ +- name: Label hetzner-2 for stateful + command: kubectl label node hetzner-2 {{ stateful_node_label_key }}={{ stateful_node_label_val }} --overwrite + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + run_once: true diff --git a/ansible/roles/storage_local_path/tasks/main.yml b/ansible/roles/storage_local_path/tasks/main.yml new file mode 100644 index 0000000..17715e5 --- /dev/null +++ b/ansible/roles/storage_local_path/tasks/main.yml @@ -0,0 +1,55 @@ +- name: Ensure local path dir + file: + path: "{{ 
local_path_dir }}" + state: directory + mode: '0777' + +- name: StorageClass local-ssd-hetzner + copy: + dest: /root/local-sc.yaml + content: | + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: {{ local_sc_name }} + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer + when: inventory_hostname in groups['k8s_control_plane'] + +- name: Apply SC + command: kubectl apply -f /root/local-sc.yaml + environment: + KUBECONFIG: /etc/kubernetes/admin.conf + when: inventory_hostname in groups['k8s_control_plane'] + +- name: Create local-path directory + file: + path: /mnt/local-ssd + state: directory + mode: '0755' + +- name: Create subdirectories for each PV + file: + path: "/mnt/local-ssd/{{ item }}" + state: directory + mode: '0755' + loop: + - postgres + - prometheus + - elasticsearch + - grafana + +- name: Copy PV manifest + template: + src: local-ssd-pv.yaml + dest: /tmp/local-ssd-pv.yaml + +- name: Apply PV + command: kubectl apply -f /tmp/local-ssd-pv.yaml + run_once: true + delegate_to: "{{ groups['k8s_control_plane'][0] }}" + +- name: Apply SC + command: kubectl apply -f /tmp/local-ssd-sc.yaml + run_once: true + delegate_to: "{{ groups['k8s_control_plane'][0] }}" diff --git a/ansible/roles/storage_local_path/templates/local-ssd-pv.yaml b/ansible/roles/storage_local_path/templates/local-ssd-pv.yaml new file mode 100644 index 0000000..3065708 --- /dev/null +++ b/ansible/roles/storage_local_path/templates/local-ssd-pv.yaml @@ -0,0 +1,65 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-postgres +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/postgres + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-prometheus +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-elasticsearch +spec: + capacity: + storage: 300Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/ansible/roles/wireguard/tasks/main.yml b/ansible/roles/wireguard/tasks/main.yml new file mode 100644 index 0000000..2d1a734 --- /dev/null +++ b/ansible/roles/wireguard/tasks/main.yml @@ -0,0 +1,62 @@ +- name: Install wireguard + apt: + name: [wireguard, qrencode] + state: present + update_cache: yes + +- name: Ensure key dir + file: { path: /etc/wireguard/keys, state: directory, mode: '0700' } + +- name: Generate private key if missing + shell: "[ -f /etc/wireguard/keys/privatekey ] || (umask 077 && wg genkey > /etc/wireguard/keys/privatekey)" + args: { creates: /etc/wireguard/keys/privatekey } + +- name: Generate public key + shell: "wg pubkey < /etc/wireguard/keys/privatekey > /etc/wireguard/keys/publickey" + args: { creates: /etc/wireguard/keys/publickey } + +- 
name: Read pubkey + slurp: { src: /etc/wireguard/keys/publickey } + register: pubkey_raw + +- name: Read private key + slurp: { src: /etc/wireguard/keys/privatekey } + register: privkey_raw + +- set_fact: + wg_public_key: "{{ pubkey_raw.content | b64decode | trim }}" + wg_private_key: "{{ privkey_raw.content | b64decode | trim }}" + +- name: Gather facts from all hosts + setup: + delegate_to: "{{ item }}" + delegate_facts: true + loop: "{{ groups['k8s_nodes'] }}" + run_once: true + +- name: Pretty print hostvars + debug: + msg: "{{ hostvars['hetzner-1']['wg_public_key'] }}" + +- name: Render config + template: + src: wg0.conf.j2 + dest: /etc/wireguard/wg0.conf + mode: '0600' + +- name: Enable IP forward + sysctl: + name: net.ipv4.ip_forward + value: "1" + sysctl_set: yes + state: present + reload: yes + +- name: Enable wg-quick + service: + name: wg-quick@wg0 + enabled: yes + state: started + +- debug: + var: wg_show.stdout \ No newline at end of file diff --git a/ansible/roles/wireguard/templates/wg0.conf.j2 b/ansible/roles/wireguard/templates/wg0.conf.j2 new file mode 100644 index 0000000..a47c73f --- /dev/null +++ b/ansible/roles/wireguard/templates/wg0.conf.j2 @@ -0,0 +1,12 @@ +[Interface] +Address = {{ wg_nodes[inventory_hostname].address }}/24 +ListenPort = {{ wg_port }} +PrivateKey = {{ wg_private_key }} + +{% for h in groups['k8s_nodes'] if h != inventory_hostname %} +[Peer] +PublicKey = {{ hostvars[h].wg_public_key }} +AllowedIPs = {{ wg_nodes[h].address }}/32 +Endpoint = {{ wg_nodes[h].public_ip }}:{{ wg_port }} +PersistentKeepalive = 25 +{% endfor %} diff --git a/ansible/roles/wireguard/vars/main.yml b/ansible/roles/wireguard/vars/main.yml new file mode 100644 index 0000000..f908d00 --- /dev/null +++ b/ansible/roles/wireguard/vars/main.yml @@ -0,0 +1,6 @@ +wg_interface: wg0 +wg_port: 51820 +wg_cidr: 10.66.0.0/24 +wg_nodes: + hetzner-1: { address: 10.66.0.11, public_ip: "95.217.89.53" } + hetzner-2: { address: 10.66.0.12, public_ip: "138.201.254.97" } diff --git a/k8s/.DS_Store b/k8s/.DS_Store new file mode 100644 index 0000000..df3557e Binary files /dev/null and b/k8s/.DS_Store differ diff --git a/k8s/00-namespaces.yaml b/k8s/00-namespaces.yaml new file mode 100644 index 0000000..8df3428 --- /dev/null +++ b/k8s/00-namespaces.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: Namespace +metadata: { name: db } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: scm } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: ml } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: monitoring } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: elastic } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: broker } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: graph } +--- +apiVersion: v1 +kind: Namespace +metadata: { name: observability } diff --git a/k8s/01-secrets/basic-auth.yaml b/k8s/01-secrets/basic-auth.yaml new file mode 100644 index 0000000..506fd99 --- /dev/null +++ b/k8s/01-secrets/basic-auth.yaml @@ -0,0 +1,38 @@ +# Replace each 'auth' line with a real htpasswd pair: +# htpasswd -nbBC 10 admin 'Str0ngP@ss' (copy 'admin:...' 
to value below) + +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-kibana, namespace: elastic } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-grafana, namespace: monitoring } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-prometheus, namespace: monitoring } +type: Opaque +stringData: { auth: "aadmin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-notebook, namespace: ml } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-broker, namespace: broker } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } +--- +apiVersion: v1 +kind: Secret +metadata: { name: basic-auth-neo4j, namespace: graph } +type: Opaque +stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } diff --git a/k8s/argoflow/argo.yaml b/k8s/argoflow/argo.yaml new file mode 100644 index 0000000..80fd19c --- /dev/null +++ b/k8s/argoflow/argo.yaml @@ -0,0 +1,146 @@ +apiVersion: v1 +kind: Secret +metadata: + name: argo-artifacts + namespace: ml +type: Opaque +stringData: + accesskey: "minioadmin" # <-- change + secretkey: "minioadmin" # <-- change +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: workflow-controller-configmap + namespace: ml +data: + config: | + artifactRepository: + s3: + bucket: argo-artifacts + endpoint: minio.betelgeusebytes.io # no scheme here + insecure: false # https via Ingress + accessKeySecret: + name: argo-artifacts + key: accesskey + secretKeySecret: + name: argo-artifacts + key: secretkey + keyFormat: "{{workflow.namespace}}/{{workflow.name}}/{{pod.name}}" + +--- +# k8s/argo/workflows/ns-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: argo-server + namespace: ml +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: argo-namespaced + namespace: ml +rules: +- apiGroups: [""] + resources: ["pods","pods/log","secrets","configmaps","events","persistentvolumeclaims","serviceaccounts"] + verbs: ["get","list","watch","create","delete","patch","update"] +- apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get","list","watch","create","delete","patch","update"] +- apiGroups: ["argoproj.io"] + resources: ["workflows","workflowtemplates","cronworkflows","workfloweventbindings","sensors","eventsources","workflowtasksets","workflowartifactgctasks","workflowtaskresults"] + verbs: ["get","list","watch","create","delete","patch","update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: argo-namespaced-binding + namespace: ml +subjects: +- kind: ServiceAccount + name: argo-server + namespace: ml +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: argo-namespaced + +--- +# k8s/argo/workflows/controller.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: workflow-controller, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: workflow-controller } } + template: + metadata: { labels: { app: workflow-controller } } + spec: + serviceAccountName: argo-server + containers: + - name: controller + image: 
quay.io/argoproj/workflow-controller:latest + args: ["--namespaced"] + env: + - name: LEADER_ELECTION_IDENTITY + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: [{ containerPort: 9090 }] + readinessProbe: + httpGet: { path: /metrics, port: 9090, scheme: HTTPS } + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: { path: /metrics, port: 9090, scheme: HTTPS } + initialDelaySeconds: 20 + periodSeconds: 20 + +--- +# k8s/argo/workflows/server.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: argo-server, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: argo-server } } + template: + metadata: { labels: { app: argo-server } } + spec: + serviceAccountName: argo-server + containers: + - name: server + image: quay.io/argoproj/argocli:latest + args: ["server","--auth-mode","server","--namespaced","--secure=false"] + ports: [{ containerPort: 2746 }] + readinessProbe: + httpGet: { path: /, port: 2746, scheme: HTTP } + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: { path: /, port: 2746, scheme: HTTP } + initialDelaySeconds: 20 + periodSeconds: 20 +--- +apiVersion: v1 +kind: Service +metadata: { name: argo-server, namespace: ml } +spec: { selector: { app: argo-server }, ports: [ { port: 80, targetPort: 2746 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: argo + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["argo.betelgeusebytes.io"], secretName: argo-tls }] + rules: + - host: argo.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: argo-server, port: { number: 80 } } } \ No newline at end of file diff --git a/k8s/automation/n8n.yaml b/k8s/automation/n8n.yaml new file mode 100644 index 0000000..c0dc3c7 --- /dev/null +++ b/k8s/automation/n8n.yaml @@ -0,0 +1,217 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: automation + labels: + name: automation +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: n8n-pv + labels: + app: n8n +spec: + capacity: + storage: 20Gi + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd + local: + path: /mnt/local-ssd/n8n + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: n8n-data + namespace: automation + labels: + app: n8n +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-ssd + resources: + requests: + storage: 20Gi + selector: + matchLabels: + app: n8n +--- +apiVersion: v1 +kind: Secret +metadata: + name: n8n-secrets + namespace: automation +type: Opaque +stringData: + # Generate a strong encryption key with: openssl rand -base64 32 + N8N_ENCRYPTION_KEY: "G/US0ePajEpWwRUjlchyOs6+6I/AT+0bisXmE2fugSU=" + # Optional: Database connection if using PostgreSQL + DB_TYPE: "postgresdb" + DB_POSTGRESDB_HOST: "pg.betelgeusebytes.io" + DB_POSTGRESDB_PORT: "5432" + DB_POSTGRESDB_DATABASE: "n8n" + DB_POSTGRESDB_USER: "app" + DB_POSTGRESDB_PASSWORD: "pa$$word" +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: n8n + namespace: automation +spec: + serviceName: n8n + replicas: 1 + selector: + matchLabels: + app: n8n + template: + metadata: + labels: + app: n8n + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + containers: + - name: n8n + image: 
n8nio/n8n:latest + ports: + - containerPort: 5678 + name: http + env: + - name: N8N_HOST + value: "n8n.betelgeusebytes.io" + - name: N8N_PORT + value: "5678" + - name: N8N_PROTOCOL + value: "https" + - name: WEBHOOK_URL + value: "https://n8n.betelgeusebytes.io/" + - name: GENERIC_TIMEZONE + value: "UTC" + - name: N8N_ENCRYPTION_KEY + valueFrom: + secretKeyRef: + name: n8n-secrets + key: N8N_ENCRYPTION_KEY + # Uncomment if using PostgreSQL + - name: DB_TYPE + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_TYPE + - name: DB_POSTGRESDB_HOST + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_HOST + - name: DB_POSTGRESDB_PORT + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_PORT + - name: DB_POSTGRESDB_DATABASE + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_DATABASE + - name: DB_POSTGRESDB_USER + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_USER + - name: DB_POSTGRESDB_PASSWORD + valueFrom: + secretKeyRef: + name: n8n-secrets + key: DB_POSTGRESDB_PASSWORD + volumeMounts: + - name: n8n-data + mountPath: /home/node/.n8n + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: /healthz + port: 5678 + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 5 + readinessProbe: + httpGet: + path: /healthz + port: 5678 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + volumes: + - name: n8n-data + persistentVolumeClaim: + claimName: n8n-data +--- +apiVersion: v1 +kind: Service +metadata: + name: n8n + namespace: automation + labels: + app: n8n +spec: + type: ClusterIP + ports: + - port: 5678 + targetPort: 5678 + protocol: TCP + name: http + selector: + app: n8n +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: n8n + namespace: automation + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + # nginx.ingress.kubernetes.io/proxy-body-size: "50m" + # nginx.ingress.kubernetes.io/proxy-read-timeout: "300" + # nginx.ingress.kubernetes.io/proxy-send-timeout: "300" + # Uncomment below if you want basic auth protection in addition to n8n's auth + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: n8n-basic-auth + # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' +spec: + ingressClassName: nginx + tls: + - hosts: + - n8n.betelgeusebytes.io + secretName: wildcard-betelgeusebytes-tls + rules: + - host: n8n.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: n8n + port: + number: 5678 \ No newline at end of file diff --git a/k8s/cert-manager/cluster-issuer.yaml b/k8s/cert-manager/cluster-issuer.yaml new file mode 100644 index 0000000..7491904 --- /dev/null +++ b/k8s/cert-manager/cluster-issuer.yaml @@ -0,0 +1,10 @@ +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: { name: letsencrypt-prod } +spec: + acme: + email: angal.salah@gmail.com + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: { name: letsencrypt-prod-key } + solvers: + - http01: { ingress: { class: nginx } } diff --git a/k8s/elastic/elastic-pv.yaml b/k8s/elastic/elastic-pv.yaml new file mode 100644 index 0000000..85961aa --- /dev/null +++ b/k8s/elastic/elastic-pv.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-elasticsearch +spec: + capacity: + storage: 80Gi + accessModes: + - ReadWriteOnce + 
persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 \ No newline at end of file diff --git a/k8s/elastic/elasticsearch.yaml b/k8s/elastic/elasticsearch.yaml new file mode 100644 index 0000000..7d875b9 --- /dev/null +++ b/k8s/elastic/elasticsearch.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Service +metadata: { name: elasticsearch, namespace: elastic } +spec: + ports: + - { name: http, port: 9200, targetPort: 9200 } + - { name: transport, port: 9300, targetPort: 9300 } + selector: { app: elasticsearch } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: elasticsearch, namespace: elastic } +spec: + serviceName: elasticsearch + replicas: 1 + selector: { matchLabels: { app: elasticsearch } } + template: + metadata: { labels: { app: elasticsearch } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: es + image: docker.elastic.co/elasticsearch/elasticsearch:8.14.0 + env: + - { name: discovery.type, value: single-node } + - { name: xpack.security.enabled, value: "false" } + - { name: ES_JAVA_OPTS, value: "-Xms2g -Xmx2g" } + ports: + - { containerPort: 9200 } + - { containerPort: 9300 } + volumeMounts: + - { name: data, mountPath: /usr/share/elasticsearch/data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 80Gi } } diff --git a/k8s/elastic/kibana.yaml b/k8s/elastic/kibana.yaml new file mode 100644 index 0000000..8e8c77e --- /dev/null +++ b/k8s/elastic/kibana.yaml @@ -0,0 +1,44 @@ +apiVersion: v1 +kind: Service +metadata: { name: kibana, namespace: elastic } +spec: + ports: [{ port: 5601, targetPort: 5601 }] + selector: { app: kibana } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: kibana, namespace: elastic } +spec: + replicas: 1 + selector: { matchLabels: { app: kibana } } + template: + metadata: { labels: { app: kibana } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: kibana + image: docker.elastic.co/kibana/kibana:8.14.0 + env: + - { name: ELASTICSEARCH_HOSTS, value: "http://elasticsearch.elastic.svc.cluster.local:9200" } + ports: [{ containerPort: 5601 }] +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kibana + namespace: elastic + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-kibana + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["kibana.betelgeusebytes.io"], secretName: kibana-tls }] + rules: + - host: kibana.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: kibana, port: { number: 5601 } } } diff --git a/k8s/gitea/gitea-pv.yaml b/k8s/gitea/gitea-pv.yaml new file mode 100644 index 0000000..51222ff --- /dev/null +++ b/k8s/gitea/gitea-pv.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-gitea +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/gitea + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + 
- hetzner-2 \ No newline at end of file diff --git a/k8s/gitea/gitea.yaml b/k8s/gitea/gitea.yaml new file mode 100644 index 0000000..4290faa --- /dev/null +++ b/k8s/gitea/gitea.yaml @@ -0,0 +1,54 @@ +apiVersion: v1 +kind: Service +metadata: { name: gitea, namespace: scm } +spec: + ports: [{ port: 80, targetPort: 3000 }] + selector: { app: gitea } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: gitea, namespace: scm } +spec: + serviceName: gitea + replicas: 1 + selector: { matchLabels: { app: gitea } } + template: + metadata: { labels: { app: gitea } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: gitea + image: gitea/gitea:1.21.11 + env: + - { name: GITEA__server__ROOT_URL, value: "https://gitea.betelgeusebytes.io" } + - { name: GITEA__database__DB_TYPE, value: "postgres" } + - { name: GITEA__database__HOST, value: "postgres.db.svc.cluster.local:5432" } + - { name: GITEA__database__NAME, value: "gitea" } + - { name: GITEA__database__USER, value: "app" } + - { name: GITEA__database__PASSWD, value: "pa$$word" } + ports: [{ containerPort: 3000 }] + volumeMounts: + - { name: data, mountPath: /data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: gitea + namespace: scm + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["gitea.betelgeusebytes.io"], secretName: gitea-tls }] + rules: + - host: gitea.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: gitea, port: { number: 80 } } } diff --git a/k8s/grafana/grafana.yaml b/k8s/grafana/grafana.yaml new file mode 100644 index 0000000..f1aa007 --- /dev/null +++ b/k8s/grafana/grafana.yaml @@ -0,0 +1,45 @@ +apiVersion: v1 +kind: Service +metadata: { name: grafana, namespace: monitoring } +spec: + ports: [{ port: 80, targetPort: 3000 }] + selector: { app: grafana } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: grafana, namespace: monitoring } +spec: + replicas: 1 + selector: { matchLabels: { app: grafana } } + template: + metadata: { labels: { app: grafana } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: grafana + image: grafana/grafana:10.4.3 + env: + - { name: GF_SECURITY_ADMIN_USER, value: admin } + - { name: GF_SECURITY_ADMIN_PASSWORD, value: "ADMINclaude-GRAFANA" } + ports: [{ containerPort: 3000 }] +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana + namespace: monitoring + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth-grafana + nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["grafana.betelgeusebytes.io"], secretName: grafana-tls }] + rules: + - host: grafana.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: grafana, port: { number: 80 } } } diff --git a/k8s/ingress-patch/kustomization.yaml b/k8s/ingress-patch/kustomization.yaml new file mode 100644 index 0000000..8117468 --- /dev/null +++ b/k8s/ingress-patch/kustomization.yaml @@ -0,0 +1,49 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: ingress-nginx + +# Create the tcp-services ConfigMap from *quoted* literals 
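+# Quick check once this patch is applied on top of the upstream ingress-nginx
+# manifests (a sketch, assuming the default controller object names):
+#   kubectl -n ingress-nginx get cm tcp-services -o yaml
+#   kubectl -n ingress-nginx get svc ingress-nginx-controller -o wide   # 5432 and 7687 should be listed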
+configMapGenerator: + - name: tcp-services + literals: + - "5432=db/postgres:5432" + - "7687=graph/neo4j:7687" + +generatorOptions: + disableNameSuffixHash: true + +# Inline JSON6902 patches +patches: + # 1) Add controller arg for tcp-services + - target: + group: apps + version: v1 + kind: Deployment + name: ingress-nginx-controller + namespace: ingress-nginx + patch: |- + - op: add + path: /spec/template/spec/containers/0/args/- + value: --tcp-services-configmap=$(POD_NAMESPACE)/tcp-services + + # 2) Expose Service ports 5432 and 7687 (keeps 80/443) + - target: + version: v1 + kind: Service + name: ingress-nginx-controller + namespace: ingress-nginx + patch: |- + - op: add + path: /spec/ports/- + value: + name: tcp-5432 + port: 5432 + protocol: TCP + targetPort: 5432 + - op: add + path: /spec/ports/- + value: + name: tcp-7687 + port: 7687 + protocol: TCP + targetPort: 7687 diff --git a/k8s/jupyter/jupyter.yaml b/k8s/jupyter/jupyter.yaml new file mode 100644 index 0000000..45a6586 --- /dev/null +++ b/k8s/jupyter/jupyter.yaml @@ -0,0 +1,68 @@ +apiVersion: v1 +kind: Service +metadata: { name: notebook, namespace: ml } +spec: + selector: { app: jupyterlab } + ports: [{ port: 80, targetPort: 8888 }] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: jupyterlab, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: jupyterlab } } + template: + metadata: { labels: { app: jupyterlab } } + spec: + securityContext: + runAsUser: 1000 + fsGroup: 100 + nodeSelector: { node: hetzner-2 } + containers: + - name: jupyter + image: jupyter/base-notebook:latest + args: ["start-notebook.sh", "--NotebookApp.token=$(PASSWORD)"] + env: + - name: PASSWORD + valueFrom: { secretKeyRef: { name: jupyter-auth, key: PASSWORD } } + ports: [{ containerPort: 8888 }] + volumeMounts: + - { name: work, mountPath: /home/jovyan/work } + volumes: + - name: work + persistentVolumeClaim: { claimName: jupyter-pvc } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: jupyter-pvc, namespace: ml } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: jupyter-auth, namespace: ml } +type: Opaque +stringData: { PASSWORD: "notebook" } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: notebook + namespace: ml + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-notebook + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["notebook.betelgeusebytes.io"], secretName: notebook-tls }] + rules: + - host: notebook.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: notebook, port: { number: 80 } } } diff --git a/k8s/kafka/kafka-pv.yaml b/k8s/kafka/kafka-pv.yaml new file mode 100644 index 0000000..2e5d82e --- /dev/null +++ b/k8s/kafka/kafka-pv.yaml @@ -0,0 +1,65 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-kafka +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/kafka + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: 
pv-zookeeper-data +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-data + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-log +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-log + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 \ No newline at end of file diff --git a/k8s/kafka/kafka-ui.yaml b/k8s/kafka/kafka-ui.yaml new file mode 100644 index 0000000..5a80aeb --- /dev/null +++ b/k8s/kafka/kafka-ui.yaml @@ -0,0 +1,44 @@ +apiVersion: v1 +kind: Service +metadata: { name: kafka-ui, namespace: broker } +spec: + ports: [{ port: 80, targetPort: 8080 }] + selector: { app: kafka-ui } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: kafka-ui, namespace: broker } +spec: + replicas: 1 + selector: { matchLabels: { app: kafka-ui } } + template: + metadata: { labels: { app: kafka-ui } } + spec: + containers: + - name: ui + image: provectuslabs/kafka-ui:latest + env: + - { name: KAFKA_CLUSTERS_0_NAME, value: "local" } + - { name: KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS, value: "kafka.broker.svc.cluster.local:9092" } + ports: [{ containerPort: 8080 }] +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kafka-ui + namespace: broker + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-broker + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["broker.betelgeusebytes.io"], secretName: broker-tls }] + rules: + - host: broker.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: kafka-ui, port: { number: 80 } } } diff --git a/k8s/kafka/kafka.yaml b/k8s/kafka/kafka.yaml new file mode 100644 index 0000000..77bbfb4 --- /dev/null +++ b/k8s/kafka/kafka.yaml @@ -0,0 +1,45 @@ +apiVersion: v1 +kind: Service +metadata: { name: kafka, namespace: broker } +spec: + ports: [{ name: kafka, port: 9092, targetPort: 9092 }] + selector: { app: kafka } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: kafka, namespace: broker } +spec: + serviceName: kafka + replicas: 1 + selector: { matchLabels: { app: kafka } } + template: + metadata: { labels: { app: kafka } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: kafka + image: apache/kafka:latest + env: + - { name: KAFKA_NODE_ID, value: "1" } + - { name: KAFKA_PROCESS_ROLES, value: "broker,controller" } + - { name: KAFKA_LISTENERS, value: "PLAINTEXT://:9092,CONTROLLER://:9093" } + - { name: KAFKA_ADVERTISED_LISTENERS, value: "PLAINTEXT://kafka.broker.svc.cluster.local:9092" } + - { name: KAFKA_CONTROLLER_LISTENER_NAMES, value: "CONTROLLER" } + - { name: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP, value: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" } + - { name: KAFKA_CONTROLLER_QUORUM_VOTERS, value: "1@localhost:9093" } + - { name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR, value: "1" } + - { name: KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR, value: "1" } + - { name: 
KAFKA_TRANSACTION_STATE_LOG_MIN_ISR, value: "1" } + - { name: KAFKA_LOG_DIRS, value: "/var/lib/kafka/data" } + - { name: CLUSTER_ID, value: "MkU3OEVBNTcwNTJENDM2Qk" } + ports: + - { containerPort: 9092 } + - { containerPort: 9093 } + volumeMounts: + - { name: data, mountPath: /var/lib/kafka/data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } diff --git a/k8s/label_studio/label.yaml b/k8s/label_studio/label.yaml new file mode 100644 index 0000000..30a5176 --- /dev/null +++ b/k8s/label_studio/label.yaml @@ -0,0 +1,74 @@ +# k8s/ai/label-studio/secret-pg.yaml +apiVersion: v1 +kind: Secret +metadata: { name: labelstudio-pg, namespace: ml } +type: Opaque +stringData: { POSTGRES_PASSWORD: "admin" } + +--- +# k8s/ai/label-studio/secret-minio.yaml +apiVersion: v1 +kind: Secret +metadata: { name: minio-label, namespace: ml } +type: Opaque +stringData: + accesskey: "minioadmin" + secretkey: "minioadmin" + +--- +# k8s/ai/label-studio/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: label-studio, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: label-studio } } + template: + metadata: { labels: { app: label-studio } } + spec: + containers: + - name: app + image: heartexlabs/label-studio:latest + env: + - { name: POSTGRE_NAME, value: "labelstudio" } + - { name: POSTGRE_USER, value: "admin" } + - name: POSTGRE_PASSWORD + valueFrom: { secretKeyRef: { name: labelstudio-pg, key: POSTGRES_PASSWORD } } + - { name: POSTGRE_HOST, value: "postgres.db.svc.cluster.local" } + - { name: POSTGRE_PORT, value: "5432" } + - { name: S3_ENDPOINT, value: "https://minio.betelgeusebytes.io" } + - name: AWS_ACCESS_KEY_ID + valueFrom: { secretKeyRef: { name: minio-label, key: accesskey } } + - name: AWS_SECRET_ACCESS_KEY + valueFrom: { secretKeyRef: { name: minio-label, key: secretkey } } + - name: ALLOWED_HOSTS + value: "label.betelgeusebytes.io" + - name: CSRF_TRUSTED_ORIGINS + value: "https://label.betelgeusebytes.io" + - name: CSRF_COOKIE_SECURE + value: "1" + - name: SESSION_COOKIE_SECURE + value: "1" + ports: [{ containerPort: 8080 }] +--- +apiVersion: v1 +kind: Service +metadata: { name: label-studio, namespace: ml } +spec: { selector: { app: label-studio }, ports: [ { port: 80, targetPort: 8080 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: label-studio + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["label.betelgeusebytes.io"], secretName: label-tls }] + rules: + - host: label.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: label-studio, port: { number: 80 } } } diff --git a/k8s/minio/minio.yaml b/k8s/minio/minio.yaml new file mode 100644 index 0000000..883f1e6 --- /dev/null +++ b/k8s/minio/minio.yaml @@ -0,0 +1,96 @@ +apiVersion: v1 +kind: Namespace +metadata: { name: storage } +--- +# k8s/storage/minio/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: minio-root, namespace: storage } +type: Opaque +stringData: + MINIO_ROOT_USER: "minioadmin" + MINIO_ROOT_PASSWORD: "minioadmin" + +--- +# k8s/storage/minio/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: minio-data, namespace: storage } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } + +--- +# k8s/storage/minio/deploy.yaml 
+apiVersion: apps/v1 +kind: Deployment +metadata: { name: minio, namespace: storage } +spec: + replicas: 1 + selector: { matchLabels: { app: minio } } + template: + metadata: { labels: { app: minio } } + spec: + containers: + - name: minio + image: minio/minio:latest + args: ["server","/data","--console-address",":9001"] + envFrom: [{ secretRef: { name: minio-root } }] + ports: + - { containerPort: 9000 } # S3 + - { containerPort: 9001 } # Console + volumeMounts: + - { name: data, mountPath: /data } + volumes: + - name: data + persistentVolumeClaim: { claimName: minio-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: minio, namespace: storage } +spec: + selector: { app: minio } + ports: + - { name: s3, port: 9000, targetPort: 9000 } + - { name: console, port: 9001, targetPort: 9001 } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: minio + namespace: storage + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["minio.betelgeusebytes.io"], secretName: minio-tls }] + rules: + - host: minio.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: minio, port: { number: 9001 } } } +--- +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-minio +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/minio + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 \ No newline at end of file diff --git a/k8s/mlflow/mlflow.yaml b/k8s/mlflow/mlflow.yaml new file mode 100644 index 0000000..501a837 --- /dev/null +++ b/k8s/mlflow/mlflow.yaml @@ -0,0 +1,64 @@ +# k8s/mlops/mlflow/secret-pg.yaml +apiVersion: v1 +kind: Secret +metadata: { name: mlflow-pg, namespace: ml } +type: Opaque +stringData: { POSTGRES_PASSWORD: "pa$$word" } + +--- +# k8s/mlops/mlflow/secret-minio.yaml +apiVersion: v1 +kind: Secret +metadata: { name: mlflow-minio, namespace: ml } +type: Opaque +stringData: + accesskey: "minioadmin" + secretkey: "minioadmin" + +--- +# k8s/mlops/mlflow/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: mlflow, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: mlflow } } + template: + metadata: { labels: { app: mlflow } } + spec: + containers: + - name: mlflow + # image: ghcr.io/mlflow/mlflow:v3.6.0 + image: axxs/mlflow-pg + env: + - { name: MLFLOW_BACKEND_STORE_URI, + value: "postgresql://admin:admin@postgres.db.svc.cluster.local:5432/mlflow" } + - { name: POSTGRES_PASSWORD, valueFrom: { secretKeyRef: { name: mlflow-pg, key: POSTGRES_PASSWORD } } } + - { name: MLFLOW_S3_ENDPOINT_URL, value: "https://minio.betelgeusebytes.io" } + - { name: AWS_ACCESS_KEY_ID, valueFrom: { secretKeyRef: { name: mlflow-minio, key: accesskey } } } + - { name: AWS_SECRET_ACCESS_KEY, valueFrom: { secretKeyRef: { name: mlflow-minio, key: secretkey } } } + args: ["mlflow","server","--host","0.0.0.0","--port","5000","--artifacts-destination","s3://mlflow", "--allowed-hosts", "*.betelgeusebytes.io"] + ports: [{ containerPort: 5000 }] +--- +apiVersion: v1 +kind: Service +metadata: { name: mlflow, namespace: ml } +spec: { selector: { app: mlflow }, ports: [ { port: 80, targetPort: 5000 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: mlflow + namespace: ml + annotations: { cert-manager.io/cluster-issuer: 
letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["mlflow.betelgeusebytes.io"], secretName: mlflow-tls }] + rules: + - host: mlflow.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: mlflow, port: { number: 80 } } } + diff --git a/k8s/neo4j/neo4j-pv.yaml b/k8s/neo4j/neo4j-pv.yaml new file mode 100644 index 0000000..df9c508 --- /dev/null +++ b/k8s/neo4j/neo4j-pv.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-neo4j +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/neo4j + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 \ No newline at end of file diff --git a/k8s/neo4j/neo4j.yaml b/k8s/neo4j/neo4j.yaml new file mode 100644 index 0000000..b21f3cc --- /dev/null +++ b/k8s/neo4j/neo4j.yaml @@ -0,0 +1,107 @@ +apiVersion: v1 +kind: Service +metadata: { name: neo4j, namespace: graph } +spec: + selector: { app: neo4j } + ports: + - { name: http, port: 7474, targetPort: 7474 } + - { name: bolt, port: 7687, targetPort: 7687 } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: neo4j, namespace: graph } +spec: + serviceName: neo4j + replicas: 1 + selector: { matchLabels: { app: neo4j } } + template: + metadata: { labels: { app: neo4j } } + spec: + enableServiceLinks: false + nodeSelector: { node: hetzner-2 } + containers: + - name: neo4j + image: neo4j:5.20 + env: + - name: NEO4J_AUTH + valueFrom: { secretKeyRef: { name: neo4j-auth, key: NEO4J_AUTH } } + - name: NEO4J_dbms_ssl_policy_bolt_enabled + value: "true" + - name: NEO4J_dbms_ssl_policy_bolt_base__directory + value: "/certs/bolt" + - name: NEO4J_dbms_ssl_policy_bolt_private__key + value: "tls.key" + - name: NEO4J_dbms_ssl_policy_bolt_public__certificate + value: "tls.crt" + - name: NEO4J_dbms_connector_bolt_tls__level + value: "REQUIRED" + # Advertise public hostname so the Browser uses the external FQDN for Bolt + - name: NEO4J_dbms_connector_bolt_advertised__address + value: "neo4j.betelgeusebytes.io:7687" + # also set a default advertised address (recommended) + - name: NEO4J_dbms_default__advertised__address + value: "neo4j.betelgeusebytes.io" + ports: + - { containerPort: 7474 } + - { containerPort: 7687 } + volumeMounts: + - { name: data, mountPath: /data } + - { name: bolt-certs, mountPath: /certs/bolt } + volumes: + - name: bolt-certs + secret: + secretName: neo4j-tls + items: + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: neo4j-auth, namespace: graph } +type: Opaque +stringData: { NEO4J_AUTH: "neo4j/NEO4J-PASS" } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: neo4j-http + namespace: graph + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth-neo4j + # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["neo4j.betelgeusebytes.io"], secretName: neo4j-tls }] + rules: + - host: neo4j.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { 
service: { name: neo4j, port: { number: 7474 } } } + +# create or update the tcp-services configmap +# kubectl -n ingress-nginx create configmap tcp-services \ +# --from-literal="7687=graph/neo4j:7687" \ +# -o yaml --dry-run=client | kubectl apply -f - + +# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ +# --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]' + +# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ +# --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]' + +# kubectl -n ingress-nginx patch deployment ingress-nginx-controller \ +# --type='json' -p='[ +# {"op":"add","path":"/spec/template/spec/containers/0/ports/-","value":{"name":"tcp-7687","containerPort":7687,"hostPort":7687,"protocol":"TCP"}} +# ]' \ No newline at end of file diff --git a/k8s/observability-stack/00-namespace.yaml b/k8s/observability-stack/00-namespace.yaml new file mode 100644 index 0000000..e420368 --- /dev/null +++ b/k8s/observability-stack/00-namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: observability + labels: + name: observability + monitoring: "true" diff --git a/k8s/observability-stack/01-persistent-volumes.yaml b/k8s/observability-stack/01-persistent-volumes.yaml new file mode 100644 index 0000000..579e068 --- /dev/null +++ b/k8s/observability-stack/01-persistent-volumes.yaml @@ -0,0 +1,95 @@ +--- +# Prometheus PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-data-pv +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +# Loki PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: loki-data-pv +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/loki + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +# Tempo PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: tempo-data-pv +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/tempo + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +# Grafana PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: grafana-data-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/grafana + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/k8s/observability-stack/02-persistent-volume-claims.yaml b/k8s/observability-stack/02-persistent-volume-claims.yaml new file mode 100644 index 0000000..2185484 --- /dev/null +++ b/k8s/observability-stack/02-persistent-volume-claims.yaml @@ -0,0 +1,55 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim 
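+# Note (assumption/sketch): these claims and the PVs above use storageClassName
+# "local-storage", which differs from the cluster's "local-ssd-hetzner" class used
+# elsewhere. If no such class exists yet, a minimal no-provisioner definition for static
+# local volumes would look like:
+#   apiVersion: storage.k8s.io/v1
+#   kind: StorageClass
+#   metadata: { name: local-storage }
+#   provisioner: kubernetes.io/no-provisioner
+#   volumeBindingMode: WaitForFirstConsumer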
+metadata: + name: prometheus-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: loki-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 100Gi + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tempo-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-data + namespace: observability +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + resources: + requests: + storage: 10Gi diff --git a/k8s/observability-stack/03-prometheus-config.yaml b/k8s/observability-stack/03-prometheus-config.yaml new file mode 100644 index 0000000..4b58aba --- /dev/null +++ b/k8s/observability-stack/03-prometheus-config.yaml @@ -0,0 +1,169 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: observability +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'betelgeuse-k8s' + environment: 'production' + + # Alerting configuration (optional - can add alertmanager later) + alerting: + alertmanagers: + - static_configs: + - targets: [] + + # Rule files + rule_files: + - /etc/prometheus/rules/*.yml + + scrape_configs: + # Scrape Prometheus itself + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kubernetes API server + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + # Kubernetes nodes + - job_name: 'kubernetes-nodes' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + # Kubernetes nodes cadvisor + - job_name: 'kubernetes-cadvisor' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + # Kubernetes service endpoints + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - 
source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # Kubernetes pods + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # kube-state-metrics + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.observability.svc.cluster.local:8080'] + + # node-exporter + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: node-exporter + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: instance + + # Grafana Loki + - job_name: 'loki' + static_configs: + - targets: ['loki.observability.svc.cluster.local:3100'] + + # Grafana Tempo + - job_name: 'tempo' + static_configs: + - targets: ['tempo.observability.svc.cluster.local:3200'] + + # Grafana + - job_name: 'grafana' + static_configs: + - targets: ['grafana.observability.svc.cluster.local:3000'] diff --git a/k8s/observability-stack/04-loki-config.yaml b/k8s/observability-stack/04-loki-config.yaml new file mode 100644 index 0000000..3885de8 --- /dev/null +++ b/k8s/observability-stack/04-loki-config.yaml @@ -0,0 +1,94 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: observability +data: + loki.yaml: | + auth_enabled: false + + server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: info + + common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + + schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + filesystem: + directory: /loki/chunks + + compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + + 
limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h # 7 days + retention_period: 168h # 7 days + max_query_length: 721h # 30 days for queries + max_query_parallelism: 32 + max_streams_per_user: 0 + max_global_streams_per_user: 0 + ingestion_rate_mb: 50 + ingestion_burst_size_mb: 100 + per_stream_rate_limit: 10MB + per_stream_rate_limit_burst: 20MB + split_queries_by_interval: 15m + + query_range: + align_queries_with_step: true + cache_results: true + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 500 + + frontend: + log_queries_longer_than: 5s + compress_responses: true + + query_scheduler: + max_outstanding_requests_per_tenant: 2048 + + ingester: + chunk_idle_period: 30m + chunk_block_size: 262144 + chunk_encoding: snappy + chunk_retain_period: 1m + max_chunk_age: 2h + wal: + enabled: true + dir: /loki/wal + flush_on_shutdown: true + replay_memory_ceiling: 1GB + + analytics: + reporting_enabled: false diff --git a/k8s/observability-stack/05-tempo-config.yaml b/k8s/observability-stack/05-tempo-config.yaml new file mode 100644 index 0000000..4f606ce --- /dev/null +++ b/k8s/observability-stack/05-tempo-config.yaml @@ -0,0 +1,72 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-config + namespace: observability +data: + tempo.yaml: | + server: + http_listen_port: 3200 + log_level: info + + distributor: + receivers: + jaeger: + protocols: + thrift_http: + endpoint: 0.0.0.0:14268 + grpc: + endpoint: 0.0.0.0:14250 + zipkin: + endpoint: 0.0.0.0:9411 + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + + ingester: + max_block_duration: 5m + + compactor: + compaction: + block_retention: 168h # 7 days + + metrics_generator: + registry: + external_labels: + source: tempo + cluster: betelgeuse-k8s + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write + send_exemplars: true + + storage: + trace: + backend: local + wal: + path: /tmp/tempo/wal + local: + path: /tmp/tempo/blocks + pool: + max_workers: 100 + queue_depth: 10000 + + querier: + frontend_worker: + frontend_address: tempo.observability.svc.cluster.local:9095 + + query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + + overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics] diff --git a/k8s/observability-stack/06-alloy-config.yaml b/k8s/observability-stack/06-alloy-config.yaml new file mode 100644 index 0000000..2f159de --- /dev/null +++ b/k8s/observability-stack/06-alloy-config.yaml @@ -0,0 +1,159 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: alloy-config + namespace: observability +data: + config.alloy: | + // Logging configuration + logging { + level = "info" + format = "logfmt" + } + + // Discover Kubernetes pods for log collection + discovery.kubernetes "pods" { + role = "pod" + } + + // Discover Kubernetes nodes + discovery.kubernetes "nodes" { + role = "node" + } + + // Relabel pods for log collection + discovery.relabel "pod_logs" { + targets = discovery.kubernetes.pods.targets + + // Only scrape pods with logs + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + action = "keep" + regex = ".+" + } + + // Set the log path + rule { + source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"] + target_label = "__path__" + separator = "/" + replacement = "/var/log/pods/*$1/*.log" + } + 
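+      // Note: the __path__ label above is consumed by file-based sources (e.g.
+      // loki.source.file); loki.source.kubernetes below tails pods via the Kubernetes
+      // API and does not need it.
+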
+ // Set namespace label + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + + // Set pod name label + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + + // Set container name label + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + + // Set node name label + rule { + source_labels = ["__meta_kubernetes_pod_node_name"] + target_label = "node" + } + + // Copy all pod labels + rule { + action = "labelmap" + regex = "__meta_kubernetes_pod_label_(.+)" + } + } + + // Read logs from discovered pods + loki.source.kubernetes "pod_logs" { + targets = discovery.relabel.pod_logs.output + forward_to = [loki.process.pod_logs.receiver] + } + + // Process and enrich logs + loki.process "pod_logs" { + forward_to = [loki.write.local.receiver] + + // Parse JSON logs + stage.json { + expressions = { + level = "level", + message = "message", + timestamp = "timestamp", + } + } + + // Extract log level + stage.labels { + values = { + level = "", + } + } + + // Add cluster label + stage.static_labels { + values = { + cluster = "betelgeuse-k8s", + } + } + } + + // Write logs to Loki + loki.write "local" { + endpoint { + url = "http://loki.observability.svc.cluster.local:3100/loki/api/v1/push" + } + } + + // OpenTelemetry receiver for traces + otelcol.receiver.otlp "default" { + grpc { + endpoint = "0.0.0.0:4317" + } + + http { + endpoint = "0.0.0.0:4318" + } + + output { + traces = [otelcol.exporter.otlp.tempo.input] + metrics = [otelcol.exporter.prometheus.metrics.input] + } + } + + // Export traces to Tempo + otelcol.exporter.otlp "tempo" { + client { + endpoint = "tempo.observability.svc.cluster.local:4317" + tls { + insecure = true + } + } + } + + // Export OTLP metrics to Prometheus + otelcol.exporter.prometheus "metrics" { + forward_to = [prometheus.remote_write.local.receiver] + } + + // Remote write to Prometheus + prometheus.remote_write "local" { + endpoint { + url = "http://prometheus.observability.svc.cluster.local:9090/api/v1/write" + } + } + + // Scrape local metrics (Alloy's own metrics) + prometheus.scrape "alloy" { + targets = [{ + __address__ = "localhost:12345", + }] + forward_to = [prometheus.remote_write.local.receiver] + } diff --git a/k8s/observability-stack/07-grafana-datasources.yaml b/k8s/observability-stack/07-grafana-datasources.yaml new file mode 100644 index 0000000..4d3f568 --- /dev/null +++ b/k8s/observability-stack/07-grafana-datasources.yaml @@ -0,0 +1,62 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + namespace: observability +data: + datasources.yaml: | + apiVersion: 1 + datasources: + # Prometheus + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus.observability.svc.cluster.local:9090 + isDefault: true + editable: true + jsonData: + timeInterval: 15s + queryTimeout: 60s + httpMethod: POST + + # Loki + - name: Loki + type: loki + access: proxy + url: http://loki.observability.svc.cluster.local:3100 + editable: true + jsonData: + maxLines: 1000 + derivedFields: + - datasourceUid: tempo + matcherRegex: "traceID=(\\w+)" + name: TraceID + url: "$${__value.raw}" + + # Tempo + - name: Tempo + type: tempo + access: proxy + url: http://tempo.observability.svc.cluster.local:3200 + editable: true + uid: tempo + jsonData: + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: -1h + spanEndTimeShift: 1h + filterByTraceID: true + filterBySpanID: false + customQuery: false + tracesToMetrics: + 
datasourceUid: prometheus + spanStartTimeShift: -1h + spanEndTimeShift: 1h + serviceMap: + datasourceUid: prometheus + nodeGraph: + enabled: true + search: + hide: false + lokiSearch: + datasourceUid: loki diff --git a/k8s/observability-stack/08-rbac.yaml b/k8s/observability-stack/08-rbac.yaml new file mode 100644 index 0000000..dca5627 --- /dev/null +++ b/k8s/observability-stack/08-rbac.yaml @@ -0,0 +1,178 @@ +--- +# Prometheus ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: observability + +--- +# Prometheus ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] + +--- +# Prometheus ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: observability + +--- +# Alloy ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alloy + namespace: observability + +--- +# Alloy ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: alloy +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] + +--- +# Alloy ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alloy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alloy +subjects: + - kind: ServiceAccount + name: alloy + namespace: observability + +--- +# kube-state-metrics ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: observability + +--- +# kube-state-metrics ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: + - apiGroups: [""] + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + - apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + - volumeattachments + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + - ingresses + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +--- 
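+# Sanity check (sketch): after applying this file, impersonate the ServiceAccounts to
+# confirm the grants work as intended, e.g.:
+#   kubectl auth can-i list pods --as=system:serviceaccount:observability:prometheus
+#   kubectl auth can-i list deployments.apps --as=system:serviceaccount:observability:kube-state-metrics
+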
+# kube-state-metrics ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: observability diff --git a/k8s/observability-stack/10-prometheus.yaml b/k8s/observability-stack/10-prometheus.yaml new file mode 100644 index 0000000..39c027b --- /dev/null +++ b/k8s/observability-stack/10-prometheus.yaml @@ -0,0 +1,90 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: observability + labels: + app: prometheus +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + spec: + serviceAccountName: prometheus + nodeSelector: + kubernetes.io/hostname: hetzner-2 + containers: + - name: prometheus + image: prom/prometheus:v2.54.1 + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=7d' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-admin-api' + ports: + - name: http + containerPort: 9090 + protocol: TCP + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 4Gi + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + - name: prometheus-data + mountPath: /prometheus + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-data + persistentVolumeClaim: + claimName: prometheus-data + +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: observability + labels: + app: prometheus +spec: + type: ClusterIP + ports: + - port: 9090 + targetPort: http + protocol: TCP + name: http + selector: + app: prometheus diff --git a/k8s/observability-stack/11-loki.yaml b/k8s/observability-stack/11-loki.yaml new file mode 100644 index 0000000..380a06b --- /dev/null +++ b/k8s/observability-stack/11-loki.yaml @@ -0,0 +1,96 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: loki + namespace: observability + labels: + app: loki +spec: + serviceName: loki + replicas: 1 + selector: + matchLabels: + app: loki + template: + metadata: + labels: + app: loki + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3100" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + containers: + - name: loki + image: grafana/loki:3.2.1 + args: + - '-config.file=/etc/loki/loki.yaml' + - '-target=all' + ports: + - name: http + containerPort: 3100 + protocol: TCP + - name: grpc + containerPort: 9096 + protocol: TCP + livenessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 45 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 45 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + 
memory: 2Gi + volumeMounts: + - name: loki-config + mountPath: /etc/loki + - name: loki-data + mountPath: /loki + volumes: + - name: loki-config + configMap: + name: loki-config + - name: loki-data + persistentVolumeClaim: + claimName: loki-data + +--- +apiVersion: v1 +kind: Service +metadata: + name: loki + namespace: observability + labels: + app: loki +spec: + type: ClusterIP + ports: + - port: 3100 + targetPort: http + protocol: TCP + name: http + - port: 9096 + targetPort: grpc + protocol: TCP + name: grpc + selector: + app: loki diff --git a/k8s/observability-stack/12-tempo.yaml b/k8s/observability-stack/12-tempo.yaml new file mode 100644 index 0000000..3fff6d6 --- /dev/null +++ b/k8s/observability-stack/12-tempo.yaml @@ -0,0 +1,118 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: tempo + namespace: observability + labels: + app: tempo +spec: + serviceName: tempo + replicas: 1 + selector: + matchLabels: + app: tempo + template: + metadata: + labels: + app: tempo + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3200" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + containers: + - name: tempo + image: grafana/tempo:2.6.1 + args: + - '-config.file=/etc/tempo/tempo.yaml' + ports: + - name: http + containerPort: 3200 + protocol: TCP + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + - name: otlp-http + containerPort: 4318 + protocol: TCP + - name: jaeger-grpc + containerPort: 14250 + protocol: TCP + - name: jaeger-http + containerPort: 14268 + protocol: TCP + - name: zipkin + containerPort: 9411 + protocol: TCP + livenessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 2Gi + volumeMounts: + - name: tempo-config + mountPath: /etc/tempo + - name: tempo-data + mountPath: /tmp/tempo + volumes: + - name: tempo-config + configMap: + name: tempo-config + - name: tempo-data + persistentVolumeClaim: + claimName: tempo-data + +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo + namespace: observability + labels: + app: tempo +spec: + type: ClusterIP + ports: + - port: 3200 + targetPort: http + protocol: TCP + name: http + - port: 4317 + targetPort: otlp-grpc + protocol: TCP + name: otlp-grpc + - port: 4318 + targetPort: otlp-http + protocol: TCP + name: otlp-http + - port: 14250 + targetPort: jaeger-grpc + protocol: TCP + name: jaeger-grpc + - port: 14268 + targetPort: jaeger-http + protocol: TCP + name: jaeger-http + - port: 9411 + targetPort: zipkin + protocol: TCP + name: zipkin + selector: + app: tempo diff --git a/k8s/observability-stack/13-grafana.yaml b/k8s/observability-stack/13-grafana.yaml new file mode 100644 index 0000000..c3d7ddb --- /dev/null +++ b/k8s/observability-stack/13-grafana.yaml @@ -0,0 +1,97 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: grafana + namespace: observability + labels: + app: grafana +spec: + serviceName: grafana + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + securityContext: + fsGroup: 472 + runAsGroup: 472 + runAsUser: 472 + containers: + - name: grafana + image: grafana/grafana:11.4.0 + ports: + - name: http + containerPort: 3000 + protocol: TCP + env: + - name: GF_SECURITY_ADMIN_USER + value: 
admin + - name: GF_SECURITY_ADMIN_PASSWORD + value: admin # Change this in production! + - name: GF_INSTALL_PLUGINS + value: "" + - name: GF_FEATURE_TOGGLES_ENABLE + value: "traceqlEditor,correlations" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "false" + - name: GF_ANALYTICS_REPORTING_ENABLED + value: "false" + - name: GF_ANALYTICS_CHECK_FOR_UPDATES + value: "false" + livenessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + volumeMounts: + - name: grafana-data + mountPath: /var/lib/grafana + - name: grafana-datasources + mountPath: /etc/grafana/provisioning/datasources + volumes: + - name: grafana-data + persistentVolumeClaim: + claimName: grafana-data + - name: grafana-datasources + configMap: + name: grafana-datasources + +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: observability + labels: + app: grafana +spec: + type: ClusterIP + ports: + - port: 3000 + targetPort: http + protocol: TCP + name: http + selector: + app: grafana diff --git a/k8s/observability-stack/14-alloy.yaml b/k8s/observability-stack/14-alloy.yaml new file mode 100644 index 0000000..0e217c5 --- /dev/null +++ b/k8s/observability-stack/14-alloy.yaml @@ -0,0 +1,107 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: alloy + namespace: observability + labels: + app: alloy +spec: + selector: + matchLabels: + app: alloy + template: + metadata: + labels: + app: alloy + spec: + serviceAccountName: alloy + hostNetwork: true + hostPID: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: alloy + image: grafana/alloy:v1.5.1 + args: + - run + - /etc/alloy/config.alloy + - --storage.path=/var/lib/alloy + - --server.http.listen-addr=0.0.0.0:12345 + ports: + - name: http-metrics + containerPort: 12345 + protocol: TCP + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + - name: otlp-http + containerPort: 4318 + protocol: TCP + env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: true + runAsUser: 0 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/alloy + - name: varlog + mountPath: /var/log + readOnly: true + - name: varlibdockercontainers + mountPath: /var/lib/docker/containers + readOnly: true + - name: etcmachineid + mountPath: /etc/machine-id + readOnly: true + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: config + configMap: + name: alloy-config + - name: varlog + hostPath: + path: /var/log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: etcmachineid + hostPath: + path: /etc/machine-id + +--- +apiVersion: v1 +kind: Service +metadata: + name: alloy + namespace: observability + labels: + app: alloy +spec: + type: ClusterIP + ports: + - port: 12345 + targetPort: http-metrics + protocol: TCP + name: http-metrics + - port: 4317 + targetPort: otlp-grpc + protocol: TCP + name: otlp-grpc + - port: 4318 + targetPort: otlp-http + protocol: TCP + name: otlp-http + selector: + app: alloy diff --git a/k8s/observability-stack/15-kube-state-metrics.yaml b/k8s/observability-stack/15-kube-state-metrics.yaml new file mode 100644 index 0000000..0cb4394 --- /dev/null +++ 
b/k8s/observability-stack/15-kube-state-metrics.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: observability + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 + ports: + - name: http-metrics + containerPort: 8080 + - name: telemetry + containerPort: 8081 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: observability + labels: + app: kube-state-metrics + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + type: ClusterIP + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + - name: telemetry + port: 8081 + targetPort: telemetry + selector: + app: kube-state-metrics diff --git a/k8s/observability-stack/16-node-exporter.yaml b/k8s/observability-stack/16-node-exporter.yaml new file mode 100644 index 0000000..a956464 --- /dev/null +++ b/k8s/observability-stack/16-node-exporter.yaml @@ -0,0 +1,85 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: observability + labels: + app: node-exporter +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" + spec: + hostNetwork: true + hostPID: true + containers: + - name: node-exporter + image: prom/node-exporter:v1.8.2 + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) + ports: + - name: metrics + containerPort: 9100 + protocol: TCP + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + mountPropagation: HostToContainer + readOnly: true + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / + +--- +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: observability + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" +spec: + type: ClusterIP + clusterIP: None + ports: + - name: metrics + port: 9100 + targetPort: metrics + selector: + app: node-exporter diff --git a/k8s/observability-stack/20-grafana-ingress.yaml b/k8s/observability-stack/20-grafana-ingress.yaml new file mode 100644 index 0000000..934dc25 --- /dev/null +++ b/k8s/observability-stack/20-grafana-ingress.yaml @@ -0,0 +1,26 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + 
nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - grafana.betelgeusebytes.io + secretName: grafana-tls + rules: + - host: grafana.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + number: 3000 diff --git a/k8s/observability-stack/21-optional-ingresses.yaml b/k8s/observability-stack/21-optional-ingresses.yaml new file mode 100644 index 0000000..707fe95 --- /dev/null +++ b/k8s/observability-stack/21-optional-ingresses.yaml @@ -0,0 +1,90 @@ +--- +# Optional: Prometheus Ingress (for direct access to Prometheus UI) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # Optional: Add basic auth for security + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth + # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' +spec: + ingressClassName: nginx + tls: + - hosts: + - prometheus.betelgeusebytes.io + secretName: prometheus-tls + rules: + - host: prometheus.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 + +--- +# Optional: Loki Ingress (for direct API access) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: loki-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - loki.betelgeusebytes.io + secretName: loki-tls + rules: + - host: loki.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: loki + port: + number: 3100 + +--- +# Optional: Tempo Ingress (for direct API access) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tempo-ingress + namespace: observability + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - tempo.betelgeusebytes.io + secretName: tempo-tls + rules: + - host: tempo.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: tempo + port: + number: 3200 diff --git a/k8s/observability-stack/DEPLOYMENT-CHECKLIST.md b/k8s/observability-stack/DEPLOYMENT-CHECKLIST.md new file mode 100644 index 0000000..630a37b --- /dev/null +++ b/k8s/observability-stack/DEPLOYMENT-CHECKLIST.md @@ -0,0 +1,359 @@ +# Observability Stack Deployment Checklist + +Use this checklist to ensure a smooth deployment of the observability stack. 
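+
+If you want a quick pre-flight pass first (a sketch, assuming you run it from the
+observability-stack directory; `deploy.sh` below remains the intended deployment path):
+
+```bash
+# Create the namespace, then server-side dry-run the rest to surface schema errors early
+kubectl apply -f 00-namespace.yaml
+kubectl apply --dry-run=server -f .
+```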
+ +## Pre-Deployment + +### Check for Existing Monitoring Stack +- [ ] Check if you have existing monitoring components: +```bash +# Check for monitoring namespaces +kubectl get namespaces | grep -E "(monitoring|prometheus|grafana|loki|tempo)" + +# Check for monitoring pods in common namespaces +kubectl get pods -n monitoring 2>/dev/null || true +kubectl get pods -n prometheus 2>/dev/null || true +kubectl get pods -n grafana 2>/dev/null || true +kubectl get pods -A | grep -E "(prometheus|grafana|loki|tempo|fluent-bit|vector)" + +# Check for Helm releases +helm list -A | grep -E "(prometheus|grafana|loki|tempo)" +``` + +- [ ] If existing monitoring is found, remove it first: +```bash +./remove-old-monitoring.sh +``` + +**OR** run the deployment script which will prompt you: +```bash +./deploy.sh # Will ask if you want to clean up first +``` + +### Prerequisites +- [ ] Kubernetes cluster is running +- [ ] NGINX Ingress Controller is installed +- [ ] cert-manager is installed with Let's Encrypt ClusterIssuer +- [ ] DNS record `grafana.betelgeusebytes.io` points to cluster IP +- [ ] Node is labeled `kubernetes.io/hostname=hetzner-2` +- [ ] kubectl is configured and working + +### Verify Prerequisites +```bash +# Check cluster +kubectl cluster-info + +# Check NGINX Ingress +kubectl get pods -n ingress-nginx + +# Check cert-manager +kubectl get pods -n cert-manager + +# Check node label +kubectl get nodes --show-labels | grep hetzner-2 + +# Check DNS (from external machine) +dig grafana.betelgeusebytes.io +``` + +## Deployment Steps + +### Step 1: Prepare Storage +- [ ] SSH into hetzner-2 node +- [ ] Create directories: +```bash +sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana} +``` +- [ ] Set correct permissions: +```bash +sudo chown -R 65534:65534 /mnt/local-ssd/prometheus +sudo chown -R 10001:10001 /mnt/local-ssd/loki +sudo chown -R root:root /mnt/local-ssd/tempo +sudo chown -R 472:472 /mnt/local-ssd/grafana +``` +- [ ] Verify permissions: +```bash +ls -la /mnt/local-ssd/ +``` + +### Step 2: Review Configuration +- [ ] Review `03-prometheus-config.yaml` - verify scrape targets +- [ ] Review `04-loki-config.yaml` - verify retention (7 days) +- [ ] Review `05-tempo-config.yaml` - verify retention (7 days) +- [ ] Review `06-alloy-config.yaml` - verify endpoints +- [ ] Review `20-grafana-ingress.yaml` - verify domain name + +### Step 3: Deploy the Stack +- [ ] Navigate to observability-stack directory +```bash +cd /path/to/observability-stack +``` +- [ ] Make scripts executable (already done): +```bash +chmod +x *.sh +``` +- [ ] Run deployment script: +```bash +./deploy.sh +``` +OR deploy manually: +```bash +kubectl apply -f 00-namespace.yaml +kubectl apply -f 01-persistent-volumes.yaml +kubectl apply -f 02-persistent-volume-claims.yaml +kubectl apply -f 03-prometheus-config.yaml +kubectl apply -f 04-loki-config.yaml +kubectl apply -f 05-tempo-config.yaml +kubectl apply -f 06-alloy-config.yaml +kubectl apply -f 07-grafana-datasources.yaml +kubectl apply -f 08-rbac.yaml +kubectl apply -f 10-prometheus.yaml +kubectl apply -f 11-loki.yaml +kubectl apply -f 12-tempo.yaml +kubectl apply -f 13-grafana.yaml +kubectl apply -f 14-alloy.yaml +kubectl apply -f 15-kube-state-metrics.yaml +kubectl apply -f 16-node-exporter.yaml +kubectl apply -f 20-grafana-ingress.yaml +``` + +### Step 4: Verify Deployment +- [ ] Run status check: +```bash +./status.sh +``` +- [ ] Check all PersistentVolumes are Bound: +```bash +kubectl get pv +``` +- [ ] Check all PersistentVolumeClaims are Bound: +```bash 
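+# Every claim should report STATUS=Bound; a Pending claim usually means no PV with a
+# matching storageClassName, size, and node affinity is available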
+kubectl get pvc -n observability +``` +- [ ] Check all pods are Running: +```bash +kubectl get pods -n observability +``` +Expected pods: + - [x] prometheus-0 + - [x] loki-0 + - [x] tempo-0 + - [x] grafana-0 + - [x] alloy-xxxxx (one per node) + - [x] kube-state-metrics-xxxxx + - [x] node-exporter-xxxxx (one per node) + +- [ ] Check services are created: +```bash +kubectl get svc -n observability +``` +- [ ] Check ingress is created: +```bash +kubectl get ingress -n observability +``` +- [ ] Verify TLS certificate is issued: +```bash +kubectl get certificate -n observability +kubectl describe certificate grafana-tls -n observability +``` + +### Step 5: Test Connectivity +- [ ] Test Prometheus endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://prometheus.observability.svc.cluster.local:9090/-/healthy +``` +- [ ] Test Loki endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://loki.observability.svc.cluster.local:3100/ready +``` +- [ ] Test Tempo endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://tempo.observability.svc.cluster.local:3200/ready +``` +- [ ] Test Grafana endpoint: +```bash +kubectl run -it --rm test --image=curlimages/curl --restart=Never -- \ + curl http://grafana.observability.svc.cluster.local:3000/api/health +``` + +## Post-Deployment Configuration + +### Step 6: Access Grafana +- [ ] Open browser to: https://grafana.betelgeusebytes.io +- [ ] Login with default credentials: + - Username: `admin` + - Password: `admin` +- [ ] **CRITICAL**: Change admin password immediately +- [ ] Verify datasources are configured: + - Go to Configuration → Data Sources + - Should see: Prometheus (default), Loki, Tempo + - Click "Test" on each datasource + +### Step 7: Verify Data Collection +- [ ] Check Prometheus has targets: + - In Grafana, Explore → Prometheus + - Query: `up` + - Should see multiple targets with value=1 +- [ ] Check Loki is receiving logs: + - In Grafana, Explore → Loki + - Query: `{namespace="observability"}` + - Should see logs from observability stack +- [ ] Check kube-state-metrics: + - In Grafana, Explore → Prometheus + - Query: `kube_pod_status_phase` + - Should see pod status metrics + +### Step 8: Import Dashboards (Optional) +- [ ] Import Kubernetes cluster dashboard: + - Dashboards → Import → ID: 315 +- [ ] Import Node Exporter dashboard: + - Dashboards → Import → ID: 1860 +- [ ] Import Loki dashboard: + - Dashboards → Import → ID: 13639 + +### Step 9: Test with Demo App (Optional) +- [ ] Deploy demo application: +```bash +kubectl apply -f demo-app.yaml +``` +- [ ] Wait for pod to be ready: +```bash +kubectl wait --for=condition=ready pod -l app=demo-app -n observability --timeout=300s +``` +- [ ] Test the endpoints: +```bash +kubectl port-forward -n observability svc/demo-app 8080:8080 +# In another terminal: +curl http://localhost:8080/ +curl http://localhost:8080/items +curl http://localhost:8080/slow +curl http://localhost:8080/error +``` +- [ ] Verify in Grafana: + - Logs: `{app="demo-app"}` + - Metrics: `flask_http_request_total` + - Traces: Search for "demo-app" service in Tempo + +## Monitoring and Maintenance + +### Daily Checks +- [ ] Check pod status: `kubectl get pods -n observability` +- [ ] Check resource usage: `kubectl top pods -n observability` +- [ ] Check disk usage on hetzner-2: `df -h /mnt/local-ssd/` + +### Weekly Checks +- [ ] Review Grafana for any alerts or anomalies +- [ ] Verify 
TLS certificate is valid +- [ ] Check logs for any errors: +```bash +kubectl logs -n observability -l app=prometheus --tail=100 +kubectl logs -n observability -l app=loki --tail=100 +kubectl logs -n observability -l app=tempo --tail=100 +kubectl logs -n observability -l app=grafana --tail=100 +``` + +### Monthly Checks +- [ ] Review retention policies (7 days is appropriate) +- [ ] Check storage growth trends +- [ ] Review and update dashboards +- [ ] Backup Grafana dashboards and configs + +## Troubleshooting Guide + +### Pod Won't Start +1. Check events: `kubectl describe pod -n observability` +2. Check logs: `kubectl logs -n observability` +3. Check storage: `kubectl get pv` and `kubectl get pvc -n observability` +4. Verify node has space: SSH to hetzner-2 and run `df -h` + +### No Logs Appearing +1. Check Alloy pods: `kubectl get pods -n observability -l app=alloy` +2. Check Alloy logs: `kubectl logs -n observability -l app=alloy` +3. Check Loki is running: `kubectl get pods -n observability -l app=loki` +4. Test Loki endpoint from Alloy pod + +### No Metrics Appearing +1. Check Prometheus targets: Port-forward and visit http://localhost:9090/targets +2. Check service discovery: Look for "kubernetes-*" targets +3. Verify RBAC: `kubectl get clusterrolebinding prometheus` +4. Check kube-state-metrics: `kubectl get pods -n observability -l app=kube-state-metrics` + +### Grafana Can't Connect to Datasources +1. Test from Grafana pod: +```bash +kubectl exec -it grafana-0 -n observability -- wget -O- http://prometheus.observability.svc.cluster.local:9090/-/healthy +``` +2. Check datasource configuration in Grafana UI +3. Verify services exist: `kubectl get svc -n observability` + +### High Resource Usage +1. Check actual usage: `kubectl top pods -n observability` +2. Check node capacity: `kubectl top nodes` +3. Consider reducing retention periods +4. Review and adjust resource limits + +## Rollback Procedure + +If something goes wrong: + +1. Remove the deployment: +```bash +./cleanup.sh +``` + +2. Fix the issue in configuration files + +3. Redeploy: +```bash +./deploy.sh +``` + +## Success Criteria + +All checked items below indicate successful deployment: + +- [x] All pods are in Running state +- [x] All PVCs are Bound +- [x] Grafana is accessible at https://grafana.betelgeusebytes.io +- [x] All three datasources (Prometheus, Loki, Tempo) test successfully +- [x] Prometheus shows targets as "up" +- [x] Loki shows logs from observability namespace +- [x] TLS certificate is valid and auto-renewing +- [x] Admin password has been changed +- [x] Resource usage is within acceptable limits + +## Documentation References + +- **README.md**: Comprehensive documentation +- **QUICKREF.md**: Quick reference for common operations +- **demo-app.yaml**: Example instrumented application +- **deploy.sh**: Automated deployment script +- **cleanup.sh**: Removal script +- **status.sh**: Status checking script + +## Next Steps After Deployment + +1. Import useful dashboards from Grafana.com +2. Configure alerts (requires Alertmanager - not included) +3. Instrument your applications to send logs/metrics/traces +4. Create custom dashboards for your specific needs +5. Set up backup procedures for Grafana dashboards +6. 
Document your team's observability practices + +## Notes + +- Default retention: 7 days for all components +- Default resources are optimized for single-node cluster +- Scale up resources if monitoring high-traffic applications +- Always backup before making configuration changes +- Test changes in a non-production environment first + +--- + +**Deployment Date**: _______________ +**Deployed By**: _______________ +**Grafana Version**: 11.4.0 +**Stack Version**: January 2025 diff --git a/k8s/observability-stack/DNS-SETUP.md b/k8s/observability-stack/DNS-SETUP.md new file mode 100644 index 0000000..97fa4fa --- /dev/null +++ b/k8s/observability-stack/DNS-SETUP.md @@ -0,0 +1,146 @@ +# DNS Configuration Guide + +## Required DNS Records + +### Minimum Setup (Recommended) + +Only **one** DNS record is required for basic operation: + +``` +grafana.betelgeusebytes.io A/CNAME +``` + +This gives you access to the complete observability stack through Grafana's unified interface. + +## Optional DNS Records + +If you want direct access to individual components, add these DNS records: + +``` +prometheus.betelgeusebytes.io A/CNAME +loki.betelgeusebytes.io A/CNAME +tempo.betelgeusebytes.io A/CNAME +``` + +Then deploy the optional ingresses: +```bash +kubectl apply -f 21-optional-ingresses.yaml +``` + +## DNS Record Types + +**Option 1: A Record (Direct IP)** +``` +Type: A +Name: grafana.betelgeusebytes.io +Value: 1.2.3.4 (your cluster's public IP) +TTL: 300 +``` + +**Option 2: CNAME (Alias to another domain)** +``` +Type: CNAME +Name: grafana.betelgeusebytes.io +Value: your-server.example.com +TTL: 300 +``` + +## Access URLs Summary + +### After DNS Setup + +| Service | URL | Purpose | DNS Required? | +|---------|-----|---------|---------------| +| **Grafana** | https://grafana.betelgeusebytes.io | Main dashboard (logs/metrics/traces) | ✅ Yes | +| **Prometheus** | https://prometheus.betelgeusebytes.io | Metrics UI (optional) | ⚠️ Optional | +| **Loki** | https://loki.betelgeusebytes.io | Logs API (optional) | ⚠️ Optional | +| **Tempo** | https://tempo.betelgeusebytes.io | Traces API (optional) | ⚠️ Optional | + +### Internal (No DNS Needed) + +These services are accessible from within your cluster only: + +``` +# Metrics +http://prometheus.observability.svc.cluster.local:9090 + +# Logs +http://loki.observability.svc.cluster.local:3100 + +# Traces (OTLP endpoints for your apps) +http://tempo.observability.svc.cluster.local:4317 # gRPC +http://tempo.observability.svc.cluster.local:4318 # HTTP + +# Grafana (internal) +http://grafana.observability.svc.cluster.local:3000 +``` + +## Verification + +After setting up DNS, verify it's working: + +```bash +# Check DNS resolution +dig grafana.betelgeusebytes.io +nslookup grafana.betelgeusebytes.io + +# Should return your cluster IP + +# Test HTTPS access +curl -I https://grafana.betelgeusebytes.io +# Should return 200 OK or 302 redirect +``` + +## TLS Certificate + +Let's Encrypt will automatically issue certificates for: +- grafana.betelgeusebytes.io (required) +- prometheus.betelgeusebytes.io (if optional ingress deployed) +- loki.betelgeusebytes.io (if optional ingress deployed) +- tempo.betelgeusebytes.io (if optional ingress deployed) + +Check certificate status: +```bash +kubectl get certificate -n observability +kubectl describe certificate grafana-tls -n observability +``` + +## Recommendation + +**For most users:** Just configure `grafana.betelgeusebytes.io` + +Why? 
+- Single DNS record to manage +- Grafana provides unified access to all components +- Simpler certificate management +- All functionality available through one interface + +**For advanced users:** Add optional DNS records if you need: +- Direct Prometheus UI access for debugging +- External log/trace ingestion +- API integrations +- Programmatic queries outside Grafana + +## Troubleshooting + +**DNS not resolving:** +- Check DNS propagation: https://dnschecker.org/ +- Wait 5-15 minutes for DNS to propagate +- Verify your DNS provider settings + +**Certificate not issued:** +```bash +# Check cert-manager +kubectl get pods -n cert-manager + +# Check certificate request +kubectl describe certificate grafana-tls -n observability + +# Check challenges +kubectl get challenges -n observability +``` + +**403/404 errors:** +- Verify ingress is created: `kubectl get ingress -n observability` +- Check NGINX ingress controller: `kubectl get pods -n ingress-nginx` +- Check ingress logs: `kubectl logs -n ingress-nginx ` diff --git a/k8s/observability-stack/MONITORING-GUIDE.md b/k8s/observability-stack/MONITORING-GUIDE.md new file mode 100644 index 0000000..15e3b6f --- /dev/null +++ b/k8s/observability-stack/MONITORING-GUIDE.md @@ -0,0 +1,572 @@ +# Access URLs & Monitoring New Applications Guide + +## 🌐 Access URLs + +### Required (Already Configured) + +**Grafana - Main Dashboard** +- **URL**: https://grafana.betelgeusebytes.io +- **DNS Required**: Yes - `grafana.betelgeusebytes.io` → your cluster IP +- **Login**: admin / admin (change on first login!) +- **Purpose**: Unified interface for logs, metrics, and traces +- **Ingress**: Already included in deployment (20-grafana-ingress.yaml) + +### Optional (Direct Component Access) + +You can optionally expose these components directly: + +**Prometheus - Metrics UI** +- **URL**: https://prometheus.betelgeusebytes.io +- **DNS Required**: Yes - `prometheus.betelgeusebytes.io` → your cluster IP +- **Purpose**: Direct access to Prometheus UI, query metrics, check targets +- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml` +- **Use Case**: Debugging metric collection, advanced PromQL queries + +**Loki - Logs API** +- **URL**: https://loki.betelgeusebytes.io +- **DNS Required**: Yes - `loki.betelgeusebytes.io` → your cluster IP +- **Purpose**: Direct API access for log queries +- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml` +- **Use Case**: External log forwarding, API integration + +**Tempo - Traces API** +- **URL**: https://tempo.betelgeusebytes.io +- **DNS Required**: Yes - `tempo.betelgeusebytes.io` → your cluster IP +- **Purpose**: Direct API access for trace queries +- **Deploy**: `kubectl apply -f 21-optional-ingresses.yaml` +- **Use Case**: External trace ingestion, API integration + +### Internal Only (No DNS Required) + +These are ClusterIP services accessible only from within the cluster: + +``` +http://prometheus.observability.svc.cluster.local:9090 +http://loki.observability.svc.cluster.local:3100 +http://tempo.observability.svc.cluster.local:3200 +http://tempo.observability.svc.cluster.local:4317 # OTLP gRPC +http://tempo.observability.svc.cluster.local:4318 # OTLP HTTP +``` + +## 🎯 Recommendation + +**For most users**: Just use Grafana (grafana.betelgeusebytes.io) +- Grafana provides unified access to all components +- No need to expose Prometheus, Loki, or Tempo directly +- Simpler DNS configuration (only one subdomain) + +**For power users**: Add optional ingresses +- Direct Prometheus access is useful for debugging +- 
Helps verify targets and scrape configs +- Deploy with: `kubectl apply -f 21-optional-ingresses.yaml` + +## 📊 Monitoring New Applications + +### Automatic: Kubernetes Logs + +**All pod logs are automatically collected!** No configuration needed. + +Alloy runs as a DaemonSet and automatically: +1. Discovers all pods in the cluster +2. Reads logs from `/var/log/pods/` +3. Sends them to Loki with labels: + - `namespace` + - `pod` + - `container` + - `node` + - All pod labels + +**View in Grafana:** +```logql +# All logs from your app +{namespace="your-namespace", pod=~"your-app.*"} + +# Error logs only +{namespace="your-namespace"} |= "error" + +# JSON logs parsed +{namespace="your-namespace"} | json | level="error" +``` + +**Best Practice for Logs:** +Emit structured JSON logs from your application: + +```python +import json +import logging + +# Python example +logging.basicConfig( + format='%(message)s', + level=logging.INFO +) + +logger = logging.getLogger(__name__) + +# Log as JSON +logger.info(json.dumps({ + "level": "info", + "message": "User login successful", + "user_id": "123", + "ip": "1.2.3.4", + "duration_ms": 42 +})) +``` + +### Manual: Application Metrics + +#### Step 1: Expose Metrics Endpoint + +Your application needs to expose metrics at `/metrics` in Prometheus format. + +**Python (Flask) Example:** +```python +from prometheus_flask_exporter import PrometheusMetrics + +app = Flask(__name__) +metrics = PrometheusMetrics(app) + +# Now /metrics endpoint is available +# Automatic metrics: request count, duration, etc. +``` + +**Python (FastAPI) Example:** +```python +from prometheus_fastapi_instrumentator import Instrumentator + +app = FastAPI() +Instrumentator().instrument(app).expose(app) + +# /metrics endpoint is now available +``` + +**Go Example:** +```go +import ( + "github.com/prometheus/client_golang/prometheus/promhttp" + "net/http" +) + +http.Handle("/metrics", promhttp.Handler()) +``` + +**Node.js Example:** +```javascript +const promClient = require('prom-client'); + +// Create default metrics +const register = new promClient.Registry(); +promClient.collectDefaultMetrics({ register }); + +// Expose /metrics endpoint +app.get('/metrics', async (req, res) => { + res.set('Content-Type', register.contentType); + res.end(await register.metrics()); +}); +``` + +#### Step 2: Add Prometheus Annotations to Your Deployment + +Add these annotations to your pod template: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app + namespace: my-namespace +spec: + template: + metadata: + annotations: + prometheus.io/scrape: "true" # Enable scraping + prometheus.io/port: "8080" # Port where metrics are exposed + prometheus.io/path: "/metrics" # Path to metrics (optional, /metrics is default) + spec: + containers: + - name: my-app + image: my-app:latest + ports: + - name: http + containerPort: 8080 +``` + +#### Step 3: Verify Metrics Collection + +**Check in Prometheus:** +1. Access Prometheus UI (if exposed): https://prometheus.betelgeusebytes.io +2. Go to Status → Targets +3. Look for your pod under "kubernetes-pods" +4. Should show as "UP" + +**Or via Grafana:** +1. Go to Explore → Prometheus +2. Query: `up{pod=~"my-app.*"}` +3. 
Should return value=1 + +**Query your metrics:** +```promql +# Request rate +rate(http_requests_total{namespace="my-namespace"}[5m]) + +# Request duration 95th percentile +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# Error rate +rate(http_requests_total{namespace="my-namespace", status=~"5.."}[5m]) +``` + +### Manual: Application Traces + +#### Step 1: Add OpenTelemetry to Your Application + +**Python Example:** +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.sdk.resources import Resource + +# Configure resource +resource = Resource.create({"service.name": "my-app"}) + +# Setup tracer +trace_provider = TracerProvider(resource=resource) +trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) +) +trace.set_tracer_provider(trace_provider) + +# Auto-instrument Flask +app = Flask(__name__) +FlaskInstrumentor().instrument_app(app) + +# Manual spans +tracer = trace.get_tracer(__name__) + +@app.route('/api/data') +def get_data(): + with tracer.start_as_current_span("fetch_data") as span: + # Your code here + span.set_attribute("rows", 100) + return {"data": "..."} +``` + +**Install dependencies:** +```bash +pip install opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc +``` + +**Go Example:** +```go +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/trace" +) + +exporter, _ := otlptracegrpc.New( + context.Background(), + otlptracegrpc.WithEndpoint("tempo.observability.svc.cluster.local:4317"), + otlptracegrpc.WithInsecure(), +) + +tp := trace.NewTracerProvider( + trace.WithBatcher(exporter), +) +otel.SetTracerProvider(tp) +``` + +**Node.js Example:** +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); +const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base'); + +const provider = new NodeTracerProvider(); +const exporter = new OTLPTraceExporter({ + url: 'http://tempo.observability.svc.cluster.local:4317' +}); +provider.addSpanProcessor(new BatchSpanProcessor(exporter)); +provider.register(); +``` + +#### Step 2: Add Trace IDs to Logs (Optional but Recommended) + +This enables clicking from logs to traces in Grafana! + +**Python Example:** +```python +import json +from opentelemetry import trace + +def log_with_trace(message): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + log_entry = { + "message": message, + "trace_id": trace_id, + "level": "info" + } + print(json.dumps(log_entry)) +``` + +#### Step 3: Verify Traces + +**In Grafana:** +1. Go to Explore → Tempo +2. Search for service: "my-app" +3. Click on a trace to view details +4. 
Click "Logs for this span" to see correlated logs + +## 📋 Complete Example: Monitoring a New App + +Here's a complete deployment with all monitoring configured: + +```yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: my-app-config + namespace: my-namespace +data: + app.py: | + from flask import Flask + import logging + import json + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.sdk.resources import Resource + from prometheus_flask_exporter import PrometheusMetrics + + # Setup logging + logging.basicConfig(level=logging.INFO, format='%(message)s') + logger = logging.getLogger(__name__) + + # Setup tracing + resource = Resource.create({"service.name": "my-app"}) + trace_provider = TracerProvider(resource=resource) + trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) + ) + trace.set_tracer_provider(trace_provider) + + app = Flask(__name__) + + # Setup metrics + metrics = PrometheusMetrics(app) + + # Auto-instrument with traces + FlaskInstrumentor().instrument_app(app) + + @app.route('/') + def index(): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info(json.dumps({ + "level": "info", + "message": "Request received", + "trace_id": trace_id, + "endpoint": "/" + })) + + return {"status": "ok", "trace_id": trace_id} + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app + namespace: my-namespace + labels: + app: my-app +spec: + replicas: 2 + selector: + matchLabels: + app: my-app + template: + metadata: + labels: + app: my-app + annotations: + # Enable Prometheus scraping + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + containers: + - name: my-app + image: python:3.11-slim + command: + - /bin/bash + - -c + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc \ + prometheus-flask-exporter && \ + python /app/app.py + ports: + - name: http + containerPort: 8080 + volumeMounts: + - name: app-code + mountPath: /app + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: app-code + configMap: + name: my-app-config + +--- +apiVersion: v1 +kind: Service +metadata: + name: my-app + namespace: my-namespace + labels: + app: my-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app: my-app +``` + +## 🔍 Verification Checklist + +After deploying a new app with monitoring: + +### Logs ✓ (Automatic) +```bash +# Check logs appear in Grafana +# Explore → Loki → {namespace="my-namespace", pod=~"my-app.*"} +``` + +### Metrics ✓ (If configured) +```bash +# Check Prometheus is scraping +# Explore → Prometheus → up{pod=~"my-app.*"} +# Should return 1 + +# Check your custom metrics +# Explore → Prometheus → flask_http_request_total{namespace="my-namespace"} +``` + +### Traces ✓ (If configured) +```bash +# Check traces appear in Tempo +# Explore → 
Tempo → Search for service "my-app" +# Should see traces + +# Verify log-trace correlation +# Click on a log line with trace_id → should jump to trace +``` + +## 🎓 Quick Start for Common Frameworks + +### Python Flask/FastAPI +```bash +pip install opentelemetry-distro opentelemetry-exporter-otlp prometheus-flask-exporter +opentelemetry-bootstrap -a install +``` + +```python +# Set environment variables in your deployment: +OTEL_SERVICE_NAME=my-app +OTEL_EXPORTER_OTLP_ENDPOINT=http://tempo.observability.svc.cluster.local:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc + +# Then run with auto-instrumentation: +opentelemetry-instrument python app.py +``` + +### Go +```bash +go get go.opentelemetry.io/otel +go get go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp +go get go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc +``` + +### Node.js +```bash +npm install @opentelemetry/sdk-node @opentelemetry/auto-instrumentations-node \ + @opentelemetry/exporter-trace-otlp-grpc prom-client +``` + +## 📚 Summary + +| Component | Automatic? | Configuration Needed | +|-----------|-----------|---------------------| +| **Logs** | ✅ Yes | None - just deploy your app | +| **Metrics** | ❌ No | Add /metrics endpoint + annotations | +| **Traces** | ❌ No | Add OpenTelemetry SDK + configure endpoint | + +**Recommended Approach:** +1. **Start simple**: Deploy app, logs work automatically +2. **Add metrics**: Expose /metrics, add annotations +3. **Add traces**: Instrument with OpenTelemetry +4. **Correlate**: Add trace IDs to logs for full observability + +## 🔗 Useful Links + +- OpenTelemetry Python: https://opentelemetry.io/docs/instrumentation/python/ +- OpenTelemetry Go: https://opentelemetry.io/docs/instrumentation/go/ +- OpenTelemetry Node.js: https://opentelemetry.io/docs/instrumentation/js/ +- Prometheus Client Libraries: https://prometheus.io/docs/instrumenting/clientlibs/ +- Grafana Docs: https://grafana.com/docs/ + +## 🆘 Troubleshooting + +**Logs not appearing:** +- Check Alloy is running: `kubectl get pods -n observability -l app=alloy` +- Check pod logs are being written to stdout/stderr +- View in real-time: `kubectl logs -f -n ` + +**Metrics not being scraped:** +- Verify annotations are present: `kubectl get pod -o yaml | grep prometheus` +- Check /metrics endpoint: `kubectl port-forward pod/ 8080:8080` then `curl localhost:8080/metrics` +- Check Prometheus targets: https://prometheus.betelgeusebytes.io/targets + +**Traces not appearing:** +- Verify endpoint: `tempo.observability.svc.cluster.local:4317` +- Check Tempo logs: `kubectl logs -n observability tempo-0` +- Verify OTLP exporter is configured correctly in your app +- Check network policies allow traffic to observability namespace diff --git a/k8s/observability-stack/QUICKREF.md b/k8s/observability-stack/QUICKREF.md new file mode 100644 index 0000000..b30bb17 --- /dev/null +++ b/k8s/observability-stack/QUICKREF.md @@ -0,0 +1,398 @@ +# Observability Stack Quick Reference + +## Before You Start + +### Remove Old Monitoring Stack +If you have existing monitoring components, remove them first: +```bash +./remove-old-monitoring.sh +``` + +This will safely remove: +- Prometheus, Grafana, Loki, Tempo deployments +- Fluent Bit, Vector, or other log collectors +- Helm releases +- ConfigMaps, PVCs, RBAC resources +- Prometheus Operator CRDs + +## Quick Access + +- **Grafana UI**: https://grafana.betelgeusebytes.io +- **Default Login**: admin / admin (change immediately!) 
+ +## Essential Commands + +### Check Status +```bash +# Quick status check +./status.sh + +# View all pods +kubectl get pods -n observability -o wide + +# Check specific component +kubectl get pods -n observability -l app=prometheus +kubectl get pods -n observability -l app=loki +kubectl get pods -n observability -l app=tempo +kubectl get pods -n observability -l app=grafana + +# Check storage +kubectl get pv +kubectl get pvc -n observability +``` + +### View Logs +```bash +# Grafana +kubectl logs -n observability -l app=grafana -f + +# Prometheus +kubectl logs -n observability -l app=prometheus -f + +# Loki +kubectl logs -n observability -l app=loki -f + +# Tempo +kubectl logs -n observability -l app=tempo -f + +# Alloy (log collector) +kubectl logs -n observability -l app=alloy -f +``` + +### Restart Components +```bash +# Restart Prometheus +kubectl rollout restart statefulset/prometheus -n observability + +# Restart Loki +kubectl rollout restart statefulset/loki -n observability + +# Restart Tempo +kubectl rollout restart statefulset/tempo -n observability + +# Restart Grafana +kubectl rollout restart statefulset/grafana -n observability + +# Restart Alloy +kubectl rollout restart daemonset/alloy -n observability +``` + +### Update Configurations +```bash +# Edit Prometheus config +kubectl edit configmap prometheus-config -n observability +kubectl rollout restart statefulset/prometheus -n observability + +# Edit Loki config +kubectl edit configmap loki-config -n observability +kubectl rollout restart statefulset/loki -n observability + +# Edit Tempo config +kubectl edit configmap tempo-config -n observability +kubectl rollout restart statefulset/tempo -n observability + +# Edit Alloy config +kubectl edit configmap alloy-config -n observability +kubectl rollout restart daemonset/alloy -n observability + +# Edit Grafana datasources +kubectl edit configmap grafana-datasources -n observability +kubectl rollout restart statefulset/grafana -n observability +``` + +## Common LogQL Queries (Loki) + +### Basic Queries +```logql +# All logs from observability namespace +{namespace="observability"} + +# Logs from specific app +{namespace="observability", app="prometheus"} + +# Filter by log level +{namespace="default"} |= "error" +{namespace="default"} | json | level="error" + +# Exclude certain logs +{namespace="default"} != "health check" + +# Multiple filters +{namespace="default"} |= "error" != "ignore" +``` + +### Advanced Queries +```logql +# Rate of errors +rate({namespace="default"} |= "error" [5m]) + +# Count logs by level +sum by (level) (count_over_time({namespace="default"} | json [5m])) + +# Top 10 error messages +topk(10, count by (message) ( + {namespace="default"} | json | level="error" +)) +``` + +## Common PromQL Queries (Prometheus) + +### Cluster Health +```promql +# All targets up/down +up + +# Pods by phase +kube_pod_status_phase{namespace="observability"} + +# Node memory available +node_memory_MemAvailable_bytes + +# Node CPU usage +rate(node_cpu_seconds_total{mode="user"}[5m]) +``` + +### Container Metrics +```promql +# CPU usage by container +rate(container_cpu_usage_seconds_total[5m]) + +# Memory usage by container +container_memory_usage_bytes + +# Network traffic +rate(container_network_transmit_bytes_total[5m]) +rate(container_network_receive_bytes_total[5m]) +``` + +### Application Metrics +```promql +# HTTP request rate +rate(http_requests_total[5m]) + +# Request duration +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# Error rate 
+rate(http_requests_total{status=~"5.."}[5m]) +``` + +## Trace Search (Tempo) + +In Grafana Explore with Tempo datasource: + +- **Search by service**: Select from dropdown +- **Search by duration**: "> 1s", "< 100ms" +- **Search by tag**: `http.status_code=500` +- **TraceQL**: `{span.http.method="POST" && span.http.status_code>=400}` + +## Correlations + +### From Logs to Traces +1. View logs in Loki +2. Click on a log line with a trace ID +3. Click the "Tempo" link +4. Trace opens in Tempo + +### From Traces to Logs +1. View trace in Tempo +2. Click on a span +3. Click "Logs for this span" +4. Related logs appear + +### From Traces to Metrics +1. View trace in Tempo +2. Service graph shows metrics +3. Click service to see metrics + +## Demo Application + +Deploy the demo app to test the stack: + +```bash +kubectl apply -f demo-app.yaml + +# Wait for it to start +kubectl wait --for=condition=ready pod -l app=demo-app -n observability --timeout=300s + +# Test it +kubectl port-forward -n observability svc/demo-app 8080:8080 + +# In another terminal +curl http://localhost:8080/ +curl http://localhost:8080/items +curl http://localhost:8080/item/0 +curl http://localhost:8080/slow +curl http://localhost:8080/error +``` + +Now view in Grafana: +- **Logs**: Search `{app="demo-app"}` in Loki +- **Traces**: Search "demo-app" service in Tempo +- **Metrics**: Query `flask_http_request_total` in Prometheus + +## Storage Management + +### Check Disk Usage +```bash +# On hetzner-2 node +df -h /mnt/local-ssd/ + +# Detailed usage +du -sh /mnt/local-ssd/* +``` + +### Cleanup Old Data +Data is automatically deleted after 7 days. To manually adjust retention: + +**Prometheus** (in 03-prometheus-config.yaml): +```yaml +args: + - '--storage.tsdb.retention.time=7d' +``` + +**Loki** (in 04-loki-config.yaml): +```yaml +limits_config: + retention_period: 168h # 7 days +``` + +**Tempo** (in 05-tempo-config.yaml): +```yaml +compactor: + compaction: + block_retention: 168h # 7 days +``` + +## Troubleshooting + +### No Logs Appearing +```bash +# Check Alloy is running +kubectl get pods -n observability -l app=alloy + +# Check Alloy logs +kubectl logs -n observability -l app=alloy + +# Check Loki +kubectl logs -n observability -l app=loki + +# Test Loki endpoint +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://loki.observability.svc.cluster.local:3100/ready +``` + +### No Traces Appearing +```bash +# Check Tempo is running +kubectl get pods -n observability -l app=tempo + +# Check Tempo logs +kubectl logs -n observability -l app=tempo + +# Test Tempo endpoint +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://tempo.observability.svc.cluster.local:3200/ready + +# Verify your app sends to correct endpoint +# Should be: tempo.observability.svc.cluster.local:4317 (gRPC) +# or: tempo.observability.svc.cluster.local:4318 (HTTP) +``` + +### Grafana Can't Connect to Datasources +```bash +# Check all services are running +kubectl get svc -n observability + +# Test from Grafana pod +kubectl exec -it -n observability grafana-0 -- \ + wget -O- http://prometheus.observability.svc.cluster.local:9090/-/healthy + +kubectl exec -it -n observability grafana-0 -- \ + wget -O- http://loki.observability.svc.cluster.local:3100/ready + +kubectl exec -it -n observability grafana-0 -- \ + wget -O- http://tempo.observability.svc.cluster.local:3200/ready +``` + +### High Resource Usage +```bash +# Check resource usage +kubectl top pods -n observability +kubectl top nodes + 
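+# Per-container breakdown, to spot which container inside a pod is the heavy one
+kubectl top pods -n observability --containers
+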
+# Scale down if needed (for testing) +kubectl scale statefulset/prometheus -n observability --replicas=0 +kubectl scale statefulset/loki -n observability --replicas=0 +``` + +## Backup and Restore + +### Backup Grafana Dashboards +```bash +# Export all dashboards via API +kubectl port-forward -n observability svc/grafana 3000:3000 + +# In another terminal +curl -H "Authorization: Bearer " \ + http://localhost:3000/api/search?type=dash-db | jq +``` + +### Backup Configurations +```bash +# Backup all ConfigMaps +kubectl get configmap -n observability -o yaml > configmaps-backup.yaml + +# Backup specific config +kubectl get configmap prometheus-config -n observability -o yaml > prometheus-config-backup.yaml +``` + +## Useful Dashboards in Grafana + +After login, import these dashboard IDs: + +- **315**: Kubernetes cluster monitoring +- **7249**: Kubernetes cluster +- **13639**: Loki dashboard +- **12611**: Tempo dashboard +- **3662**: Prometheus 2.0 stats +- **1860**: Node Exporter Full + +Go to: Dashboards → Import → Enter ID → Load + +## Performance Tuning + +### For Higher Load +Increase resources in respective YAML files: + +```yaml +resources: + requests: + cpu: 1000m # from 500m + memory: 4Gi # from 2Gi + limits: + cpu: 4000m # from 2000m + memory: 8Gi # from 4Gi +``` + +### For Lower Resource Usage +- Reduce scrape intervals in Prometheus config +- Reduce log retention periods +- Reduce trace sampling rate + +## Security Checklist + +- [ ] Change Grafana admin password +- [ ] Review RBAC permissions +- [ ] Enable audit logging +- [ ] Consider adding NetworkPolicies +- [ ] Review ingress TLS configuration +- [ ] Backup configurations regularly + +## Getting Help + +1. Check component logs first +2. Review configurations +3. Test network connectivity +4. Check resource availability +5. Review Grafana datasource settings diff --git a/k8s/observability-stack/README.md b/k8s/observability-stack/README.md new file mode 100644 index 0000000..bd56a15 --- /dev/null +++ b/k8s/observability-stack/README.md @@ -0,0 +1,385 @@ +# State-of-the-Art Observability Stack for Kubernetes + +This deployment provides a comprehensive, production-ready observability solution using the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir/Prometheus) with unified collection through Grafana Alloy. + +## Architecture Overview + +### Core Components + +1. **Grafana** (v11.4.0) - Unified visualization platform + - Pre-configured datasources for Prometheus, Loki, and Tempo + - Automatic correlation between logs, metrics, and traces + - Modern UI with TraceQL editor support + +2. **Prometheus** (v2.54.1) - Metrics collection and storage + - 7-day retention + - Comprehensive Kubernetes service discovery + - Scrapes: API server, nodes, cadvisor, pods, services + +3. **Grafana Loki** (v3.2.1) - Log aggregation + - 7-day retention with compaction + - TSDB index for efficient queries + - Automatic correlation with traces + +4. **Grafana Tempo** (v2.6.1) - Distributed tracing + - 7-day retention + - Multiple protocol support: OTLP, Jaeger, Zipkin + - Metrics generation from traces + - Automatic correlation with logs and metrics + +5. **Grafana Alloy** (v1.5.1) - Unified observability agent + - Replaces Promtail, Vector, Fluent Bit + - Collects logs from all pods + - OTLP receiver for traces + - Runs as DaemonSet on all nodes + +6. **kube-state-metrics** (v2.13.0) - Kubernetes object metrics + - Deployment, Pod, Service, Node metrics + - Essential for cluster monitoring + +7. 
**node-exporter** (v1.8.2) - Node-level system metrics + - CPU, memory, disk, network metrics + - Runs on all nodes via DaemonSet + +## Key Features + +- **Unified Observability**: Logs, metrics, and traces in one platform +- **Automatic Correlation**: Click from logs to traces to metrics seamlessly +- **7-Day Retention**: Optimized for single-node cluster +- **Local SSD Storage**: Fast, persistent storage on hetzner-2 node +- **OTLP Support**: Modern OpenTelemetry protocol support +- **TLS Enabled**: Secure access via NGINX Ingress with Let's Encrypt +- **Low Resource Footprint**: Optimized for single-node deployment + +## Storage Layout + +All data stored on local SSD at `/mnt/local-ssd/`: + +``` +/mnt/local-ssd/ +├── prometheus/ (50Gi) - Metrics data +├── loki/ (100Gi) - Log data +├── tempo/ (50Gi) - Trace data +└── grafana/ (10Gi) - Dashboards and settings +``` + +## Deployment Instructions + +### Prerequisites + +1. Kubernetes cluster with NGINX Ingress Controller +2. cert-manager installed with Let's Encrypt issuer +3. DNS record: `grafana.betelgeusebytes.io` → your cluster IP +4. Node labeled: `kubernetes.io/hostname=hetzner-2` + +### Step 0: Remove Existing Monitoring (If Applicable) + +If you have an existing monitoring stack (Prometheus, Grafana, Loki, Fluent Bit, etc.), remove it first to avoid conflicts: + +```bash +./remove-old-monitoring.sh +``` + +This interactive script will help you safely remove: +- Existing Prometheus/Grafana/Loki/Tempo deployments +- Helm releases for monitoring components +- Fluent Bit, Vector, or other log collectors +- Related ConfigMaps, PVCs, and RBAC resources +- Prometheus Operator CRDs (if applicable) + +**Note**: The main deployment script (`deploy.sh`) will also prompt you to run cleanup if needed. + +### Step 1: Prepare Storage Directories + +SSH into the hetzner-2 node and create directories: + +```bash +sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana} +sudo chown -R 65534:65534 /mnt/local-ssd/prometheus +sudo chown -R 10001:10001 /mnt/local-ssd/loki +sudo chown -R root:root /mnt/local-ssd/tempo +sudo chown -R 472:472 /mnt/local-ssd/grafana +``` + +### Step 2: Deploy the Stack + +```bash +chmod +x deploy.sh +./deploy.sh +``` + +Or deploy manually: + +```bash +kubectl apply -f 00-namespace.yaml +kubectl apply -f 01-persistent-volumes.yaml +kubectl apply -f 02-persistent-volume-claims.yaml +kubectl apply -f 03-prometheus-config.yaml +kubectl apply -f 04-loki-config.yaml +kubectl apply -f 05-tempo-config.yaml +kubectl apply -f 06-alloy-config.yaml +kubectl apply -f 07-grafana-datasources.yaml +kubectl apply -f 08-rbac.yaml +kubectl apply -f 10-prometheus.yaml +kubectl apply -f 11-loki.yaml +kubectl apply -f 12-tempo.yaml +kubectl apply -f 13-grafana.yaml +kubectl apply -f 14-alloy.yaml +kubectl apply -f 15-kube-state-metrics.yaml +kubectl apply -f 16-node-exporter.yaml +kubectl apply -f 20-grafana-ingress.yaml +``` + +### Step 3: Verify Deployment + +```bash +kubectl get pods -n observability +kubectl get pv +kubectl get pvc -n observability +``` + +All pods should be in `Running` state: +- grafana-0 +- loki-0 +- prometheus-0 +- tempo-0 +- alloy-xxxxx (one per node) +- kube-state-metrics-xxxxx +- node-exporter-xxxxx (one per node) + +### Step 4: Access Grafana + +1. Open: https://grafana.betelgeusebytes.io +2. Login with default credentials: + - Username: `admin` + - Password: `admin` +3. **IMPORTANT**: Change the password on first login! + +## Using the Stack + +### Exploring Logs (Loki) + +1. In Grafana, go to **Explore** +2. 
Select **Loki** datasource +3. Example queries: + ``` + {namespace="observability"} + {namespace="observability", app="prometheus"} + {namespace="default"} |= "error" + {pod="my-app-xxx"} | json | level="error" + ``` + +### Exploring Metrics (Prometheus) + +1. In Grafana, go to **Explore** +2. Select **Prometheus** datasource +3. Example queries: + ``` + up + node_memory_MemAvailable_bytes + rate(container_cpu_usage_seconds_total[5m]) + kube_pod_status_phase{namespace="observability"} + ``` + +### Exploring Traces (Tempo) + +1. In Grafana, go to **Explore** +2. Select **Tempo** datasource +3. Search by: + - Service name + - Duration + - Tags +4. Click on a trace to see detailed span timeline + +### Correlations + +The stack automatically correlates: +- **Logs → Traces**: Click traceID in logs to view trace +- **Traces → Logs**: Click on trace to see related logs +- **Traces → Metrics**: Tempo generates metrics from traces + +### Instrumenting Your Applications + +#### For Logs +Logs are automatically collected from all pods by Alloy. Emit structured JSON logs: + +```json +{"level":"info","message":"Request processed","duration_ms":42} +``` + +#### For Traces +Send traces to Tempo using OTLP: + +```python +# Python with OpenTelemetry +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +provider = TracerProvider() +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter(endpoint="http://tempo.observability.svc.cluster.local:4317") + ) +) +trace.set_tracer_provider(provider) +``` + +#### For Metrics +Expose metrics in Prometheus format and add annotations to your pod: + +```yaml +metadata: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" +``` + +## Monitoring Endpoints + +Internal service endpoints: + +- **Prometheus**: `http://prometheus.observability.svc.cluster.local:9090` +- **Loki**: `http://loki.observability.svc.cluster.local:3100` +- **Tempo**: + - HTTP: `http://tempo.observability.svc.cluster.local:3200` + - OTLP gRPC: `tempo.observability.svc.cluster.local:4317` + - OTLP HTTP: `tempo.observability.svc.cluster.local:4318` +- **Grafana**: `http://grafana.observability.svc.cluster.local:3000` + +## Troubleshooting + +### Check Pod Status +```bash +kubectl get pods -n observability +kubectl describe pod -n observability +``` + +### View Logs +```bash +kubectl logs -n observability -l app=grafana +kubectl logs -n observability -l app=prometheus +kubectl logs -n observability -l app=loki +kubectl logs -n observability -l app=tempo +kubectl logs -n observability -l app=alloy +``` + +### Check Storage +```bash +kubectl get pv +kubectl get pvc -n observability +``` + +### Test Connectivity +```bash +# From inside cluster +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://prometheus.observability.svc.cluster.local:9090/-/healthy +``` + +### Common Issues + +**Pods stuck in Pending** +- Check if storage directories exist on hetzner-2 +- Verify PV/PVC bindings: `kubectl describe pvc -n observability` + +**Loki won't start** +- Check permissions on `/mnt/local-ssd/loki` (should be 10001:10001) +- View logs: `kubectl logs -n observability loki-0` + +**No logs appearing** +- Check Alloy pods are running: `kubectl get pods -n observability -l app=alloy` +- View Alloy logs: `kubectl logs -n observability -l app=alloy` + +**Grafana can't reach datasources** +- Verify services: 
`kubectl get svc -n observability` +- Check datasource URLs in Grafana UI + +## Updating Configuration + +### Update Prometheus Scrape Config +```bash +kubectl edit configmap prometheus-config -n observability +kubectl rollout restart statefulset/prometheus -n observability +``` + +### Update Loki Retention +```bash +kubectl edit configmap loki-config -n observability +kubectl rollout restart statefulset/loki -n observability +``` + +### Update Alloy Collection Rules +```bash +kubectl edit configmap alloy-config -n observability +kubectl rollout restart daemonset/alloy -n observability +``` + +## Resource Usage + +Expected resource consumption: + +| Component | CPU Request | CPU Limit | Memory Request | Memory Limit | +|-----------|-------------|-----------|----------------|--------------| +| Prometheus | 500m | 2000m | 2Gi | 4Gi | +| Loki | 500m | 2000m | 1Gi | 2Gi | +| Tempo | 500m | 2000m | 1Gi | 2Gi | +| Grafana | 250m | 1000m | 512Mi | 1Gi | +| Alloy (per node) | 100m | 500m | 256Mi | 512Mi | +| kube-state-metrics | 100m | 200m | 128Mi | 256Mi | +| node-exporter (per node) | 100m | 200m | 128Mi | 256Mi | + +**Total (single node)**: ~2.1 CPU cores, ~7.5Gi memory + +## Security Considerations + +1. **Change default Grafana password** immediately after deployment +2. Consider adding authentication for internal services if exposed +3. Review and restrict RBAC permissions as needed +4. Enable audit logging in Loki for sensitive namespaces +5. Consider adding NetworkPolicies to restrict traffic + +## Documentation + +This deployment includes comprehensive guides: + +- **README.md**: Complete deployment and configuration guide (this file) +- **MONITORING-GUIDE.md**: URLs, access, and how to monitor new applications +- **DEPLOYMENT-CHECKLIST.md**: Step-by-step deployment checklist +- **QUICKREF.md**: Quick reference for daily operations +- **demo-app.yaml**: Example fully instrumented application +- **deploy.sh**: Automated deployment script +- **status.sh**: Health check script +- **cleanup.sh**: Complete stack removal +- **remove-old-monitoring.sh**: Remove existing monitoring before deployment +- **21-optional-ingresses.yaml**: Optional external access to Prometheus/Loki/Tempo + +## Future Enhancements + +- Add Alertmanager for alerting +- Configure Grafana SMTP for email notifications +- Add custom dashboards for your applications +- Implement Grafana RBAC for team access +- Consider Mimir for long-term metrics storage +- Add backup/restore procedures + +## Support + +For issues or questions: +1. Check pod logs first +2. Review Grafana datasource configuration +3. Verify network connectivity between components +4. Check storage and resource availability + +## Version Information + +- Grafana: 11.4.0 +- Prometheus: 2.54.1 +- Loki: 3.2.1 +- Tempo: 2.6.1 +- Alloy: 1.5.1 +- kube-state-metrics: 2.13.0 +- node-exporter: 1.8.2 + +Last updated: January 2025 diff --git a/k8s/observability-stack/cleanup.sh b/k8s/observability-stack/cleanup.sh new file mode 100755 index 0000000..8c071f3 --- /dev/null +++ b/k8s/observability-stack/cleanup.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +set -e + +echo "==================================================" +echo "Removing Observability Stack from Kubernetes" +echo "==================================================" +echo "" + +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${RED}WARNING: This will delete all observability data!${NC}" +echo "" +read -p "Are you sure you want to continue? 
(yes/no): " confirm + +if [ "$confirm" != "yes" ]; then + echo "Cleanup cancelled." + exit 0 +fi + +echo -e "${YELLOW}Removing Ingress...${NC}" +kubectl delete -f 20-grafana-ingress.yaml --ignore-not-found + +echo -e "${YELLOW}Removing Deployments and DaemonSets...${NC}" +kubectl delete -f 16-node-exporter.yaml --ignore-not-found +kubectl delete -f 15-kube-state-metrics.yaml --ignore-not-found +kubectl delete -f 14-alloy.yaml --ignore-not-found +kubectl delete -f 13-grafana.yaml --ignore-not-found +kubectl delete -f 12-tempo.yaml --ignore-not-found +kubectl delete -f 11-loki.yaml --ignore-not-found +kubectl delete -f 10-prometheus.yaml --ignore-not-found + +echo -e "${YELLOW}Removing RBAC...${NC}" +kubectl delete -f 08-rbac.yaml --ignore-not-found + +echo -e "${YELLOW}Removing ConfigMaps...${NC}" +kubectl delete -f 07-grafana-datasources.yaml --ignore-not-found +kubectl delete -f 06-alloy-config.yaml --ignore-not-found +kubectl delete -f 05-tempo-config.yaml --ignore-not-found +kubectl delete -f 04-loki-config.yaml --ignore-not-found +kubectl delete -f 03-prometheus-config.yaml --ignore-not-found + +echo -e "${YELLOW}Removing PVCs...${NC}" +kubectl delete -f 02-persistent-volume-claims.yaml --ignore-not-found + +echo -e "${YELLOW}Removing PVs...${NC}" +kubectl delete -f 01-persistent-volumes.yaml --ignore-not-found + +echo -e "${YELLOW}Removing Namespace...${NC}" +kubectl delete -f 00-namespace.yaml --ignore-not-found + +echo "" +echo -e "${RED}==================================================" +echo "Cleanup Complete!" +echo "==================================================${NC}" +echo "" +echo "Data directories on hetzner-2 node are preserved." +echo "To remove them, run on the node:" +echo " sudo rm -rf /mnt/local-ssd/{prometheus,loki,tempo,grafana}" +echo "" diff --git a/k8s/observability-stack/demo-app.yaml b/k8s/observability-stack/demo-app.yaml new file mode 100644 index 0000000..5a4e9f5 --- /dev/null +++ b/k8s/observability-stack/demo-app.yaml @@ -0,0 +1,253 @@ +--- +# Example instrumented application to test the observability stack +# This is a simple Python Flask app with OpenTelemetry instrumentation + +apiVersion: v1 +kind: ConfigMap +metadata: + name: demo-app + namespace: observability +data: + app.py: | + from flask import Flask, jsonify + import logging + import json + import time + import random + + # OpenTelemetry imports + from opentelemetry import trace, metrics + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.sdk.resources import Resource + from prometheus_flask_exporter import PrometheusMetrics + + # Configure structured logging + logging.basicConfig( + level=logging.INFO, + format='%(message)s' + ) + + class JSONFormatter(logging.Formatter): + def format(self, record): + log_obj = { + 'timestamp': self.formatTime(record, self.datefmt), + 'level': record.levelname, + 'message': record.getMessage(), + 'logger': record.name, + } + if hasattr(record, 'trace_id'): + log_obj['trace_id'] = record.trace_id + log_obj['span_id'] = record.span_id + return json.dumps(log_obj) + + handler = logging.StreamHandler() + 
handler.setFormatter(JSONFormatter()) + logger = logging.getLogger(__name__) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "demo-app"}) + + # Tracing + trace_provider = TracerProvider(resource=resource) + trace_provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + endpoint="http://tempo.observability.svc.cluster.local:4317", + insecure=True + ) + ) + ) + trace.set_tracer_provider(trace_provider) + tracer = trace.get_tracer(__name__) + + # Create Flask app + app = Flask(__name__) + + # Prometheus metrics + metrics = PrometheusMetrics(app) + + # Auto-instrument Flask + FlaskInstrumentor().instrument_app(app) + + # Sample data + ITEMS = ["apple", "banana", "orange", "grape", "mango"] + + @app.route('/') + def index(): + span = trace.get_current_span() + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Index page accessed", extra={ + 'trace_id': trace_id, + 'endpoint': '/' + }) + + return jsonify({ + 'service': 'demo-app', + 'status': 'healthy', + 'trace_id': trace_id + }) + + @app.route('/items') + def get_items(): + with tracer.start_as_current_span("fetch_items") as span: + # Simulate database query + time.sleep(random.uniform(0.01, 0.1)) + + span.set_attribute("items.count", len(ITEMS)) + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Items fetched", extra={ + 'trace_id': trace_id, + 'count': len(ITEMS) + }) + + return jsonify({ + 'items': ITEMS, + 'count': len(ITEMS), + 'trace_id': trace_id + }) + + @app.route('/item/') + def get_item(item_id): + with tracer.start_as_current_span("fetch_item") as span: + span.set_attribute("item.id", item_id) + trace_id = format(span.get_span_context().trace_id, '032x') + + # Simulate processing + time.sleep(random.uniform(0.01, 0.05)) + + if item_id < 0 or item_id >= len(ITEMS): + logger.warning("Item not found", extra={ + 'trace_id': trace_id, + 'item_id': item_id + }) + return jsonify({'error': 'Item not found', 'trace_id': trace_id}), 404 + + item = ITEMS[item_id] + logger.info("Item fetched", extra={ + 'trace_id': trace_id, + 'item_id': item_id, + 'item': item + }) + + return jsonify({ + 'id': item_id, + 'name': item, + 'trace_id': trace_id + }) + + @app.route('/slow') + def slow_endpoint(): + with tracer.start_as_current_span("slow_operation") as span: + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.info("Slow operation started", extra={'trace_id': trace_id}) + + # Simulate slow operation + time.sleep(random.uniform(1, 3)) + + logger.info("Slow operation completed", extra={'trace_id': trace_id}) + + return jsonify({ + 'message': 'Operation completed', + 'trace_id': trace_id + }) + + @app.route('/error') + def error_endpoint(): + with tracer.start_as_current_span("error_operation") as span: + trace_id = format(span.get_span_context().trace_id, '032x') + + logger.error("Intentional error triggered", extra={'trace_id': trace_id}) + span.set_attribute("error", True) + + return jsonify({ + 'error': 'This is an intentional error', + 'trace_id': trace_id + }), 500 + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: demo-app + namespace: observability + labels: + app: demo-app +spec: + replicas: 1 + selector: + matchLabels: + app: demo-app + template: + metadata: + labels: + app: demo-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: 
"/metrics" + spec: + containers: + - name: demo-app + image: python:3.11-slim + command: + - /bin/bash + - -c + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc \ + prometheus-flask-exporter && \ + python /app/app.py + ports: + - name: http + containerPort: 8080 + volumeMounts: + - name: app-code + mountPath: /app + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: app-code + configMap: + name: demo-app + +--- +apiVersion: v1 +kind: Service +metadata: + name: demo-app + namespace: observability + labels: + app: demo-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app: demo-app diff --git a/k8s/observability-stack/deploy.sh b/k8s/observability-stack/deploy.sh new file mode 100755 index 0000000..fcd22ef --- /dev/null +++ b/k8s/observability-stack/deploy.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +set -e + +echo "==================================================" +echo "Deploying Observability Stack to Kubernetes" +echo "==================================================" +echo "" + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}Pre-deployment Check: Existing Monitoring Stack${NC}" +echo "" +echo "If you have an existing monitoring/prometheus/grafana deployment," +echo "you should remove it first to avoid conflicts." +echo "" +read -p "Do you want to run the cleanup script now? (yes/no): " run_cleanup +if [ "$run_cleanup" = "yes" ]; then + if [ -f "./remove-old-monitoring.sh" ]; then + echo "Running cleanup script..." + ./remove-old-monitoring.sh + echo "" + echo "Cleanup complete. Continuing with deployment..." + echo "" + else + echo -e "${RED}Error: remove-old-monitoring.sh not found${NC}" + echo "Please run it manually before deploying." + exit 1 + fi +fi + +echo -e "${YELLOW}Step 1: Creating storage directories on node...${NC}" +echo "Please run this on the hetzner-2 node:" +echo " sudo mkdir -p /mnt/local-ssd/{prometheus,loki,tempo,grafana}" +echo " sudo chown -R 65534:65534 /mnt/local-ssd/prometheus" +echo " sudo chown -R 10001:10001 /mnt/local-ssd/loki" +echo " sudo chown -R root:root /mnt/local-ssd/tempo" +echo " sudo chown -R 472:472 /mnt/local-ssd/grafana" +echo "" +read -p "Press Enter once directories are created..." 
+ +echo -e "${GREEN}Step 2: Creating namespace...${NC}" +kubectl apply -f 00-namespace.yaml + +echo -e "${GREEN}Step 3: Creating PersistentVolumes...${NC}" +kubectl apply -f 01-persistent-volumes.yaml + +echo -e "${GREEN}Step 4: Creating PersistentVolumeClaims...${NC}" +kubectl apply -f 02-persistent-volume-claims.yaml + +echo -e "${GREEN}Step 5: Creating ConfigMaps...${NC}" +kubectl apply -f 03-prometheus-config.yaml +kubectl apply -f 04-loki-config.yaml +kubectl apply -f 05-tempo-config.yaml +kubectl apply -f 06-alloy-config.yaml +kubectl apply -f 07-grafana-datasources.yaml + +echo -e "${GREEN}Step 6: Creating RBAC resources...${NC}" +kubectl apply -f 08-rbac.yaml + +echo -e "${GREEN}Step 7: Deploying Prometheus...${NC}" +kubectl apply -f 10-prometheus.yaml + +echo -e "${GREEN}Step 8: Deploying Loki...${NC}" +kubectl apply -f 11-loki.yaml + +echo -e "${GREEN}Step 9: Deploying Tempo...${NC}" +kubectl apply -f 12-tempo.yaml + +echo -e "${GREEN}Step 10: Deploying Grafana...${NC}" +kubectl apply -f 13-grafana.yaml + +echo -e "${GREEN}Step 11: Deploying Grafana Alloy...${NC}" +kubectl apply -f 14-alloy.yaml + +echo -e "${GREEN}Step 12: Deploying kube-state-metrics...${NC}" +kubectl apply -f 15-kube-state-metrics.yaml + +echo -e "${GREEN}Step 13: Deploying node-exporter...${NC}" +kubectl apply -f 16-node-exporter.yaml + +echo -e "${GREEN}Step 14: Creating Grafana Ingress...${NC}" +kubectl apply -f 20-grafana-ingress.yaml + +echo "" +echo -e "${GREEN}==================================================" +echo "Deployment Complete!" +echo "==================================================${NC}" +echo "" +echo "Waiting for pods to be ready..." +kubectl wait --for=condition=ready pod -l app=prometheus -n observability --timeout=300s +kubectl wait --for=condition=ready pod -l app=loki -n observability --timeout=300s +kubectl wait --for=condition=ready pod -l app=tempo -n observability --timeout=300s +kubectl wait --for=condition=ready pod -l app=grafana -n observability --timeout=300s + +echo "" +echo -e "${GREEN}All pods are ready!${NC}" +echo "" +echo "Access Grafana at: https://grafana.betelgeusebytes.io" +echo "Default credentials: admin / admin" +echo "" +echo "To check status:" +echo " kubectl get pods -n observability" +echo "" +echo "To view logs:" +echo " kubectl logs -n observability -l app=grafana" +echo " kubectl logs -n observability -l app=prometheus" +echo " kubectl logs -n observability -l app=loki" +echo " kubectl logs -n observability -l app=tempo" +echo "" diff --git a/k8s/observability-stack/remove-old-monitoring.sh b/k8s/observability-stack/remove-old-monitoring.sh new file mode 100755 index 0000000..f617225 --- /dev/null +++ b/k8s/observability-stack/remove-old-monitoring.sh @@ -0,0 +1,319 @@ +#!/bin/bash + +set -e + +echo "==========================================================" +echo "Removing Existing Monitoring Stack" +echo "==========================================================" +echo "" + +RED='\033[0;31m' +YELLOW='\033[1;33m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}" +echo " - Prometheus (standalone or operator)" +echo " - Grafana" +echo " - Fluent Bit" +echo " - Vector" +echo " - Loki" +echo " - Tempo" +echo " - Node exporters" +echo " - kube-state-metrics" +echo " - Any monitoring/prometheus/grafana namespaces" +echo "" +echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}" +echo "" +read -p "Are you sure you want to continue? 
(yes/no): " confirm + +if [ "$confirm" != "yes" ]; then + echo "Cleanup cancelled." + exit 0 +fi + +echo "" +echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}" + +# Common namespace names for monitoring +NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging") + +for ns in "${NAMESPACES[@]}"; do + if kubectl get namespace "$ns" &> /dev/null; then + echo -e "${GREEN}Found namespace: $ns${NC}" + + # Show what's in the namespace + echo " Resources in $ns:" + kubectl get all -n "$ns" 2>/dev/null | head -20 || true + echo "" + + read -p " Delete namespace '$ns'? (yes/no): " delete_ns + if [ "$delete_ns" = "yes" ]; then + echo " Deleting namespace $ns..." + kubectl delete namespace "$ns" --timeout=120s || { + echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}" + kubectl delete namespace "$ns" --grace-period=0 --force & + } + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}" + +# Check if helm is available +if command -v helm &> /dev/null; then + echo "Checking for Helm releases..." + + # Common Helm release names + RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack") + + for release in "${RELEASES[@]}"; do + # Check all namespaces for the release + if helm list -A | grep -q "$release"; then + ns=$(helm list -A | grep "$release" | awk '{print $2}') + echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}" + read -p " Uninstall Helm release '$release'? (yes/no): " uninstall + if [ "$uninstall" = "yes" ]; then + echo " Uninstalling $release..." + helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}" + fi + fi + done +else + echo "Helm not found, skipping Helm releases check" +fi + +echo "" +echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}" + +# Remove common DaemonSets in kube-system or default +echo "Checking for monitoring DaemonSets..." +for ns in kube-system default; do + if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then + echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}" + kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector" + read -p " Delete these DaemonSets? (yes/no): " delete_ds + if [ "$delete_ds" = "yes" ]; then + kubectl delete daemonset -n "$ns" -l app=node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=vector --ignore-not-found + kubectl delete daemonset -n "$ns" node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" vector --ignore-not-found + fi + fi +done + +# Remove common Deployments +echo "" +echo "Checking for monitoring Deployments..." +for ns in kube-system default; do + if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring Deployments in $ns${NC}" + kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo" + read -p " Delete these Deployments? 
(yes/no): " delete_deploy + if [ "$delete_deploy" = "yes" ]; then + kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found + kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found + kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found + kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found + kubectl delete deployment -n "$ns" prometheus --ignore-not-found + kubectl delete deployment -n "$ns" grafana --ignore-not-found + kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found + fi + fi +done + +# Remove common StatefulSets +echo "" +echo "Checking for monitoring StatefulSets..." +for ns in kube-system default; do + if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}" + kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + read -p " Delete these StatefulSets? (yes/no): " delete_sts + if [ "$delete_sts" = "yes" ]; then + kubectl delete statefulset -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=grafana --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=loki --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=tempo --ignore-not-found + kubectl delete statefulset -n "$ns" prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" grafana --ignore-not-found + kubectl delete statefulset -n "$ns" loki --ignore-not-found + kubectl delete statefulset -n "$ns" tempo --ignore-not-found + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}" + +# Ask before removing ConfigMaps (they might contain important configs) +echo "Checking for monitoring ConfigMaps..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then + echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}" + kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent" + read -p " Delete these ConfigMaps? (yes/no): " delete_cm + if [ "$delete_cm" = "yes" ]; then + kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found + kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found + kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}" + +# Remove monitoring-related RBAC +echo "Checking for monitoring ClusterRoles..." +if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoles${NC}" + kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoles? 
(yes/no): " delete_cr + if [ "$delete_cr" = "yes" ]; then + kubectl delete clusterrole prometheus --ignore-not-found + kubectl delete clusterrole grafana --ignore-not-found + kubectl delete clusterrole kube-state-metrics --ignore-not-found + kubectl delete clusterrole fluent-bit --ignore-not-found + kubectl delete clusterrole node-exporter --ignore-not-found + fi +fi + +echo "Checking for monitoring ClusterRoleBindings..." +if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}" + kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb + if [ "$delete_crb" = "yes" ]; then + kubectl delete clusterrolebinding prometheus --ignore-not-found + kubectl delete clusterrolebinding grafana --ignore-not-found + kubectl delete clusterrolebinding kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding fluent-bit --ignore-not-found + kubectl delete clusterrolebinding node-exporter --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}" + +# Check for monitoring PVCs +echo "Checking for monitoring PersistentVolumeClaims..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring PVCs in $ns${NC}" + kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}" + read -p " Delete these PVCs? (yes/no): " delete_pvc + if [ "$delete_pvc" = "yes" ]; then + kubectl delete pvc -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete pvc -n "$ns" -l app=grafana --ignore-not-found + kubectl delete pvc -n "$ns" -l app=loki --ignore-not-found + kubectl delete pvc -n "$ns" -l app=tempo --ignore-not-found + # Also try by name patterns + kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true + fi + fi + fi +done + +# Check for monitoring PVs +echo "" +echo "Checking for monitoring PersistentVolumes..." +if kubectl get pv 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|monitoring"; then + echo -e "${GREEN}Found monitoring PVs${NC}" + kubectl get pv | grep -E "prometheus|grafana|loki|tempo|monitoring" + echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}" + read -p " Delete these PVs? (yes/no): " delete_pv + if [ "$delete_pv" = "yes" ]; then + kubectl get pv -o name | grep -E "prometheus|grafana|loki|tempo|monitoring" | xargs -r kubectl delete || true + fi +fi + +echo "" +echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}" + +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then + echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}" + kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki" + read -p " Delete these Ingresses? 
(yes/no): " delete_ing + if [ "$delete_ing" = "yes" ]; then + kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found + kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found + kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}" + +# Check for Prometheus Operator CRDs +if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then + echo -e "${GREEN}Found Prometheus Operator CRDs${NC}" + kubectl get crd | grep "monitoring.coreos.com" + echo "" + echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}" + read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd + if [ "$delete_crd" = "yes" ]; then + kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found + kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found + kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found + kubectl delete crd probes.monitoring.coreos.com --ignore-not-found + kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}" +echo "" +echo "You may have monitoring data stored on your nodes at:" +echo " - /mnt/local-ssd/prometheus" +echo " - /mnt/local-ssd/grafana" +echo " - /mnt/local-ssd/loki" +echo " - /mnt/local-ssd/tempo" +echo " - /var/lib/prometheus" +echo " - /var/lib/grafana" +echo "" +echo "To remove these, SSH to each node and run:" +echo " sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}" +echo " sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}" +echo "" +read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs + +echo "" +echo -e "${GREEN}==========================================================" +echo "Existing Monitoring Stack Cleanup Complete!" +echo "==========================================================${NC}" +echo "" +echo "Summary of actions taken:" +echo " - Removed monitoring namespaces (if confirmed)" +echo " - Uninstalled Helm releases (if found and confirmed)" +echo " - Removed standalone monitoring components" +echo " - Removed monitoring ConfigMaps" +echo " - Removed RBAC resources" +echo " - Removed PVCs and PVs (if confirmed)" +echo " - Removed Ingresses" +echo " - Removed Prometheus Operator CRDs (if confirmed)" +echo "" +echo -e "${YELLOW}Next Steps:${NC}" +echo "1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'" +echo "2. Clean up node data directories (see above)" +echo "3. 
Deploy new observability stack: ./deploy.sh" +echo "" diff --git a/k8s/observability-stack/status.sh b/k8s/observability-stack/status.sh new file mode 100755 index 0000000..d1895ee --- /dev/null +++ b/k8s/observability-stack/status.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}==================================================" +echo "Observability Stack Status Check" +echo "==================================================${NC}" +echo "" + +# Check namespace +echo -e "${YELLOW}Checking namespace...${NC}" +if kubectl get namespace observability &> /dev/null; then + echo -e "${GREEN}✓ Namespace 'observability' exists${NC}" +else + echo -e "${RED}✗ Namespace 'observability' not found${NC}" + exit 1 +fi +echo "" + +# Check PVs +echo -e "${YELLOW}Checking PersistentVolumes...${NC}" +pvs=$(kubectl get pv 2>/dev/null | grep -E "(prometheus|loki|tempo|grafana)-data-pv" | wc -l) +if [ "$pvs" -eq 4 ]; then + echo -e "${GREEN}✓ All 4 PersistentVolumes found${NC}" + kubectl get pv | grep -E "(prometheus|loki|tempo|grafana)-data-pv" +else + echo -e "${RED}✗ Expected 4 PVs, found $pvs${NC}" +fi +echo "" + +# Check PVCs +echo -e "${YELLOW}Checking PersistentVolumeClaims...${NC}" +pvcs=$(kubectl get pvc -n observability 2>/dev/null | grep -v NAME | wc -l) +if [ "$pvcs" -eq 4 ]; then + echo -e "${GREEN}✓ All 4 PersistentVolumeClaims found${NC}" + kubectl get pvc -n observability +else + echo -e "${RED}✗ Expected 4 PVCs, found $pvcs${NC}" +fi +echo "" + +# Check Pods +echo -e "${YELLOW}Checking Pods...${NC}" +kubectl get pods -n observability -o wide +echo "" + +# Count running pods +total_pods=$(kubectl get pods -n observability --no-headers 2>/dev/null | wc -l) +running_pods=$(kubectl get pods -n observability --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + +if [ "$total_pods" -eq 0 ]; then + echo -e "${RED}✗ No pods found in observability namespace${NC}" +else + if [ "$running_pods" -eq "$total_pods" ]; then + echo -e "${GREEN}✓ All $total_pods pods are running${NC}" + else + echo -e "${YELLOW}⚠ $running_pods/$total_pods pods are running${NC}" + fi +fi +echo "" + +# Check Services +echo -e "${YELLOW}Checking Services...${NC}" +kubectl get svc -n observability +echo "" + +# Check Ingress +echo -e "${YELLOW}Checking Ingress...${NC}" +if kubectl get ingress -n observability grafana-ingress &> /dev/null; then + echo -e "${GREEN}✓ Grafana Ingress found${NC}" + kubectl get ingress -n observability grafana-ingress +else + echo -e "${RED}✗ Grafana Ingress not found${NC}" +fi +echo "" + +# Check ConfigMaps +echo -e "${YELLOW}Checking ConfigMaps...${NC}" +configmaps=$(kubectl get configmap -n observability 2>/dev/null | grep -v NAME | wc -l) +echo "Found $configmaps ConfigMaps:" +kubectl get configmap -n observability --no-headers | awk '{print " - " $1}' +echo "" + +# Test endpoints +echo -e "${YELLOW}Testing service endpoints...${NC}" + +check_endpoint() { + local name=$1 + local url=$2 + + if kubectl run -it --rm test-$RANDOM --image=curlimages/curl --restart=Never -- \ + curl -s -o /dev/null -w "%{http_code}" --max-time 5 $url 2>/dev/null | grep -q "200\|302\|401"; then + echo -e "${GREEN}✓ $name is responding${NC}" + else + echo -e "${RED}✗ $name is not responding${NC}" + fi +} + +check_endpoint "Prometheus" "http://prometheus.observability.svc.cluster.local:9090/-/healthy" +check_endpoint "Loki" "http://loki.observability.svc.cluster.local:3100/ready" +check_endpoint 
"Tempo" "http://tempo.observability.svc.cluster.local:3200/ready" +check_endpoint "Grafana" "http://grafana.observability.svc.cluster.local:3000/api/health" + +echo "" +echo -e "${BLUE}==================================================" +echo "Status Check Complete" +echo "==================================================${NC}" +echo "" +echo "Access Grafana at: https://grafana.betelgeusebytes.io" +echo "Default credentials: admin / admin" +echo "" diff --git a/k8s/observability/fluent-bit.yaml b/k8s/observability/fluent-bit.yaml new file mode 100644 index 0000000..e6b726f --- /dev/null +++ b/k8s/observability/fluent-bit.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: { name: fluent-bit, namespace: observability } +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: { name: fluent-bit-read } +rules: + - apiGroups: [""] + resources: ["pods", "namespaces"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: { name: fluent-bit-read } +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluent-bit-read +subjects: + - kind: ServiceAccount + name: fluent-bit + namespace: observability +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: fluent-bit, namespace: observability } +spec: + selector: { matchLabels: { app: fluent-bit } } + template: + metadata: { labels: { app: fluent-bit } } + spec: + serviceAccountName: fluent-bit + containers: + - name: fluent-bit + image: cr.fluentbit.io/fluent/fluent-bit:2.2.2 + volumeMounts: + - { name: varlog, mountPath: /var/log } + - { name: containers, mountPath: /var/lib/docker/containers, readOnly: true } + env: + - { name: FLUENT_ELASTICSEARCH_HOST, value: elasticsearch.elastic.svc.cluster.local } + - { name: FLUENT_ELASTICSEARCH_PORT, value: "9200" } + args: ["-i","tail","-p","path=/var/log/containers/*.log","-F","kubernetes","-o","es","-p","host=${FLUENT_ELASTICSEARCH_HOST}","-p","port=${FLUENT_ELASTICSEARCH_PORT}","-p","logstash_format=On","-p","logstash_prefix=k8s-logs"] + volumes: + - { name: varlog, hostPath: { path: /var/log } } + - { name: containers, hostPath: { path: /var/lib/docker/containers, type: DirectoryOrCreate } } diff --git a/k8s/otlp/otel-collector.yaml b/k8s/otlp/otel-collector.yaml new file mode 100644 index 0000000..924761d --- /dev/null +++ b/k8s/otlp/otel-collector.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: Service +metadata: { name: otel-collector, namespace: observability } +spec: + selector: { app: otel-collector } + ports: + - { name: otlp-http, port: 4318, targetPort: 4318 } + - { name: otlp-grpc, port: 4317, targetPort: 4317 } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: { name: otel-collector, namespace: observability } +spec: + replicas: 2 + selector: { matchLabels: { app: otel-collector } } + template: + metadata: { labels: { app: otel-collector } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.102.0 + args: ["--config=/etc/otel/config.yaml"] + ports: + - { containerPort: 4318 } + - { containerPort: 4317 } + volumeMounts: + - { name: cfg, mountPath: /etc/otel } + volumes: + - { name: cfg, configMap: { name: otel-config } } +--- +apiVersion: v1 +kind: ConfigMap +metadata: { name: otel-config, namespace: observability } +data: + config.yaml: | + receivers: + otlp: + protocols: { http: {}, grpc: {} } + processors: { batch: {} } + exporters: + logging: {} + elasticsearch: + endpoints: 
["http://elasticsearch.elastic.svc.cluster.local:9200"] + logs_index: "k8s-logs" + service: + pipelines: + logs: { receivers: [otlp], processors: [batch], exporters: [elasticsearch, logging] } + traces: { receivers: [otlp], processors: [batch], exporters: [logging] } + metrics: { receivers: [otlp], processors: [batch], exporters: [logging] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: otlp + namespace: observability + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["otlp.betelgeusebytes.io"], secretName: otlp-tls }] + rules: + - host: otlp.betelgeusebytes.io + http: + paths: + - path: /v1/traces + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + - path: /v1/metrics + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } + - path: /v1/logs + pathType: Prefix + backend: { service: { name: otel-collector, port: { number: 4318 } } } diff --git a/k8s/postgres/.DS_Store b/k8s/postgres/.DS_Store new file mode 100644 index 0000000..2ce156a Binary files /dev/null and b/k8s/postgres/.DS_Store differ diff --git a/k8s/postgres/pg.yaml b/k8s/postgres/pg.yaml new file mode 100644 index 0000000..5b7c02e --- /dev/null +++ b/k8s/postgres/pg.yaml @@ -0,0 +1,217 @@ +# k8s/postgres/pg-init-sql-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: pg-init-sql + namespace: db +data: + 00_extensions.sql: | + \connect gitea + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS postgis_topology; + CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; + CREATE EXTENSION IF NOT EXISTS pg_trgm; + CREATE EXTENSION IF NOT EXISTS hstore; + CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + CREATE EXTENSION IF NOT EXISTS citext; + CREATE EXTENSION IF NOT EXISTS unaccent; + CREATE EXTENSION IF NOT EXISTS pgcrypto; + DO $$ BEGIN + CREATE EXTENSION IF NOT EXISTS plpython3u; + EXCEPTION WHEN undefined_file THEN + RAISE NOTICE 'plpython3u not available in this image'; + END $$; + 01_tune.sql: | + ALTER SYSTEM SET shared_buffers = '1GB'; + ALTER SYSTEM SET work_mem = '32MB'; + ALTER SYSTEM SET maintenance_work_mem = '512MB'; + ALTER SYSTEM SET max_connections = 200; + SELECT pg_reload_conf(); +--- +# k8s/postgres/pg-conf.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: pg-conf + namespace: db +data: + pg_hba.conf: | + # Local connections + local all all trust + host all all 127.0.0.1/32 trust + host all all ::1/128 trust + # TLS-only access from ANY external IP (harden as needed) + hostssl all all 0.0.0.0/0 md5 + hostssl all all ::/0 md5 +--- +# k8s/postgres/pg-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: pg18-secret + namespace: db +type: Opaque +stringData: + POSTGRES_PASSWORD: "pa$$word" +--- +# k8s/postgres/pg-certificate.yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: pg-tls + namespace: db +spec: + secretName: pg-tls + dnsNames: + - pg.betelgeusebytes.io + issuerRef: + kind: ClusterIssuer + name: letsencrypt-prod +--- +# k8s/postgres/postgres-svc.yaml +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: db +spec: + selector: + app: postgres + ports: + - name: postgres + port: 5432 + targetPort: 5432 +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-hl + namespace: db +spec: + clusterIP: None + selector: + app: postgres + ports: + - name: postgres + port: 5432 + targetPort: 5432 +--- +# k8s/postgres/postgres.yaml +apiVersion: apps/v1 +kind: 
StatefulSet +metadata: + name: postgres + namespace: db +spec: + serviceName: postgres-hl + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + securityContext: + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + fsGroupChangePolicy: "Always" + initContainers: + - name: install-certs + image: busybox:1.36 + command: + - sh + - -c + - | + cp /in/tls.crt /out/server.crt + cp /in/tls.key /out/server.key + chown 999:999 /out/* || true + chmod 600 /out/server.key + securityContext: + runAsUser: 0 + volumeMounts: + - { name: pg-tls, mountPath: /in, readOnly: true } + - { name: pg-certs, mountPath: /out } + containers: + - name: postgres + image: axxs/postgres:18-postgis-vector + imagePullPolicy: IfNotPresent + args: + - -c + - ssl=on + - -c + - ssl_cert_file=/certs/server.crt + - -c + - ssl_key_file=/certs/server.key + - -c + - hba_file=/etc/postgresql-custom/pg_hba.conf + env: + - name: POSTGRES_USER + value: "app" + - name: POSTGRES_DB + value: "gitea" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: pg18-secret + key: POSTGRES_PASSWORD + - name: TZ + value: "Europe/Paris" + ports: + - name: postgres + containerPort: 5432 + volumeMounts: + - { name: data, mountPath: /var/lib/postgresql } # PG18 expects parent, creates /var/lib/postgresql/18/main + - { name: init, mountPath: /docker-entrypoint-initdb.d, readOnly: true } + - { name: pg-certs, mountPath: /certs } + - { name: pg-conf, mountPath: /etc/postgresql-custom } + readinessProbe: + exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] } + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] } + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + requests: { cpu: "250m", memory: "512Mi" } + limits: { cpu: "1", memory: "2Gi" } + volumes: + - name: init + configMap: + name: pg-init-sql + defaultMode: 0444 + - name: pg-tls + secret: + secretName: pg-tls + - name: pg-certs + emptyDir: {} + - name: pg-conf + configMap: + name: pg-conf + defaultMode: 0444 + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: + requests: + storage: 80Gi + + +# kubectl -n ingress-nginx create configmap tcp-services \ +# --from-literal="5432=db/postgres:5432" \ +# -o yaml --dry-run=client | kubectl apply -f - +# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ +# --type='json' -p='[ +# {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"} +# ]' +# # controller must listen on hostPort:5432 (we already patched earlier) \ No newline at end of file diff --git a/k8s/postgres/postgres-ha.yaml b/k8s/postgres/postgres-ha.yaml new file mode 100644 index 0000000..d74e306 --- /dev/null +++ b/k8s/postgres/postgres-ha.yaml @@ -0,0 +1,275 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: db +--- +# Password secret (replace with your own or generate one) +apiVersion: v1 +kind: Secret +metadata: + name: pg18-secret + namespace: db +type: Opaque +stringData: + POSTGRES_PASSWORD: "pa$$word" +--- +# Init SQL: keeps your original name and keeps enabling PostGIS + vector +apiVersion: v1 +kind: ConfigMap +metadata: + name: pg-init-sql + namespace: db +data: + 00_extensions.sql: | + -- 
enable common extensions in the default DB and template1 so future DBs inherit them
+    \connect gitea
+    CREATE EXTENSION IF NOT EXISTS postgis;
+    CREATE EXTENSION IF NOT EXISTS vector;
+    CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false);
+    CREATE EXTENSION IF NOT EXISTS tablefunc;
+    -- postpone pg_stat_statements CREATE to postStart (needs preload)
+    CREATE EXTENSION IF NOT EXISTS postgis_topology;
+    CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
+    CREATE EXTENSION IF NOT EXISTS pg_trgm;
+    CREATE EXTENSION IF NOT EXISTS hstore;
+    CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+    CREATE EXTENSION IF NOT EXISTS citext;
+    CREATE EXTENSION IF NOT EXISTS unaccent;
+    CREATE EXTENSION IF NOT EXISTS pgcrypto;
+
+    -- PL/Python (available in your image)
+    DO $$ BEGIN
+      CREATE EXTENSION IF NOT EXISTS plpython3u;
+    EXCEPTION WHEN undefined_file THEN
+      RAISE NOTICE 'plpython3u not available in this image';
+    END $$;
+
+    -- Also on template1 for new DBs (heavier, but intentional)
+    \connect template1
+    CREATE EXTENSION IF NOT EXISTS postgis;
+    CREATE EXTENSION IF NOT EXISTS pg_trgm;
+    CREATE EXTENSION IF NOT EXISTS hstore;
+    CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+    CREATE EXTENSION IF NOT EXISTS citext;
+    CREATE EXTENSION IF NOT EXISTS unaccent;
+    CREATE EXTENSION IF NOT EXISTS pgcrypto;
+
+    -- Arabic-friendly ICU collation, non-deterministic for case/diacritics
+    DO $$
+    BEGIN
+      PERFORM 1 FROM pg_collation WHERE collname='arabic';
+      IF NOT FOUND THEN
+        CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false);
+      END IF;
+    END$$;
+
+  01_tune.sql: |
+    -- Enable pg_stat_statements on next server start.
+    -- initdb starts with an empty shared_preload_libraries, so it is safe to set it
+    -- directly here; the setting takes effect after the post-init restart, and the
+    -- StatefulSet's postStart hook then runs CREATE EXTENSION pg_stat_statements.
+    ALTER SYSTEM SET shared_preload_libraries = 'pg_stat_statements';
+
+    -- Optional tuning (adjust to your limits)
+    ALTER SYSTEM SET shared_buffers = '1GB';
+    ALTER SYSTEM SET work_mem = '32MB';
+    ALTER SYSTEM SET maintenance_work_mem = '512MB';
+    ALTER SYSTEM SET max_connections = 200;
+
+    -- Reload applies some settings immediately; others need restart (OK after init completes)
+    SELECT pg_reload_conf();
+    ALTER SYSTEM SET pg_stat_statements.max = 10000;
+    ALTER SYSTEM SET pg_stat_statements.track = 'all';
+    ALTER SYSTEM SET pg_stat_statements.save = on;
+  pg_hba.conf: |
+    # Allow loopback
+    local   all all              trust
+    host    all all 127.0.0.1/32 trust
+    host    all all ::1/128      trust
+    # Allow TLS connections from your IP(s) only
+    hostssl all all YOUR_PUBLIC_IP/32 md5
+    # (Optional) Add more CIDRs or a private network range here:
+    # hostssl all all 10.0.0.0/8 md5
+---
+# Headless service required by StatefulSet for stable network IDs
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres-hl
+  namespace: db
+spec:
+  clusterIP: None
+  selector:
+    app: postgres
+  ports:
+    - name: postgres
+      port: 5432
+      targetPort: 5432
+---
+# Regular ClusterIP service for clients (keeps your original name)
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres
+  namespace: db
+spec:
+  selector:
+    app: postgres
+  ports:
+    - name: postgres
+      port: 5432
+      targetPort: 5432
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: postgres
+  namespace: db
+spec:
+  serviceName: postgres-hl
+  replicas: 1
+  selector:
+    matchLabels:
app: postgres + template: + metadata: + labels: + app: postgres + spec: + securityContext: + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + fsGroupChangePolicy: "Always" + initContainers: + # Copy cert-manager certs to a writable path with correct perms for Postgres + - name: install-certs + image: busybox:1.36 + command: + - sh + - -c + - | + cp /in/tls.crt /out/server.crt + cp /in/tls.key /out/server.key + cp /in/ca.crt /out/ca.crt || true + chown 999:999 /out/* || true + chmod 600 /out/server.key + securityContext: + runAsUser: 0 + volumeMounts: + - { name: pg-tls, mountPath: /in, readOnly: true } + - { name: pg-certs, mountPath: /out } + containers: + - name: postgres + image: axxs/postgres:18-postgis-vector + imagePullPolicy: IfNotPresent + args: + - -c + - ssl=on + - -c + - ssl_cert_file=/certs/server.crt + - -c + - ssl_key_file=/certs/server.key + - -c + - ssl_ca_file=/certs/ca.crt + - -c + - hba_file=/etc/postgresql-custom/pg_hba.conf + lifecycle: + postStart: + exec: + command: + - /bin/sh + - -c + - | + set -e + # Wait until server accepts connections + for i in $(seq 1 30); do + pg_isready -h 127.0.0.1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" && break + sleep 1 + done + psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" + env: + - name: POSTGRES_USER + value: "app" + - name: POSTGRES_DB + value: "gitea" # matches your \connect gitea + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: pg18-secret + key: POSTGRES_PASSWORD + - name: TZ + value: "Europe/Paris" + ports: + - name: postgres + containerPort: 5432 + volumeMounts: + # ✅ PG 18 requires this parent path; it will create /var/lib/postgresql/18/main + - name: data + mountPath: /var/lib/postgresql + # your init scripts ConfigMap + - name: init + mountPath: /docker-entrypoint-initdb.d + readOnly: true + - name: pg-certs + mountPath: /certs + # pg_hba.conf + - name: pg-conf + mountPath: /etc/postgresql-custom + readinessProbe: + exec: + command: + - /bin/sh + - -c + - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: + command: + - /bin/sh + - -c + - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1 + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1" + memory: "2Gi" + volumes: + - name: init + configMap: + name: pg-init-sql + defaultMode: 0444 + - name: pg-tls + secret: + secretName: pg-tls + - name: pg-certs + emptyDir: {} + - name: pg-conf + configMap: + name: pg-conf + defaultMode: 0444 + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + # storageClassName: # optionally pin this diff --git a/k8s/postgres/postgres.yaml b/k8s/postgres/postgres.yaml new file mode 100644 index 0000000..3db44c7 --- /dev/null +++ b/k8s/postgres/postgres.yaml @@ -0,0 +1,122 @@ +apiVersion: v1 +kind: Service +metadata: { name: postgres, namespace: db } +spec: + ports: [{ port: 5432, targetPort: 5432 }] + selector: { app: postgres } +--- +apiVersion: v1 +kind: ConfigMap +metadata: { name: pg-init-sql, namespace: db } +data: + 00_extensions.sql: | + -- enable common extensions in the default DB and template1 so future DBs inherit them + \connect gitea + CREATE EXTENSION IF NOT EXISTS postgis; + CREATE EXTENSION IF NOT EXISTS vector; + CREATE 
COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false);
+    CREATE EXTENSION IF NOT EXISTS tablefunc;
+    CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+
+    CREATE EXTENSION IF NOT EXISTS postgis_topology;
+    CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
+    CREATE EXTENSION IF NOT EXISTS pg_trgm;
+    CREATE EXTENSION IF NOT EXISTS hstore;
+    CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+    CREATE EXTENSION IF NOT EXISTS citext;
+    CREATE EXTENSION IF NOT EXISTS unaccent;
+    CREATE EXTENSION IF NOT EXISTS pgcrypto;
+    -- PL/Python (optional; requires image with plpython3u, postgis image has it)
+    DO $$ BEGIN
+      CREATE EXTENSION IF NOT EXISTS plpython3u;
+    EXCEPTION WHEN undefined_file THEN
+      RAISE NOTICE 'plpython3u not available in this image';
+    END $$;
+
+    -- Also on template1 for new DBs:
+    \connect template1
+    CREATE EXTENSION IF NOT EXISTS postgis;
+    CREATE EXTENSION IF NOT EXISTS pg_trgm;
+    CREATE EXTENSION IF NOT EXISTS hstore;
+    CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+    CREATE EXTENSION IF NOT EXISTS citext;
+    CREATE EXTENSION IF NOT EXISTS unaccent;
+    CREATE EXTENSION IF NOT EXISTS pgcrypto;
+
+    -- Arabic-friendly ICU collation (PostgreSQL >= 13)
+    -- Non-deterministic collation helps proper case/diacritics comparisons
+    DO $$
+    BEGIN
+      PERFORM 1 FROM pg_collation WHERE collname='arabic';
+      IF NOT FOUND THEN
+        CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false);
+      END IF;
+    END$$;
+
+    -- Example: ensure gitea DB uses UTF8; Arabic text search often needs unaccent + custom dictionaries.
+    -- You can create additional DBs with: CREATE DATABASE mydb TEMPLATE template1 ENCODING 'UTF8';
+
+  01_tune.sql: |
+    -- small safe defaults; adjust later
+    ALTER SYSTEM SET shared_buffers = '1GB';
+    ALTER SYSTEM SET work_mem = '32MB';
+    ALTER SYSTEM SET maintenance_work_mem = '512MB';
+    ALTER SYSTEM SET max_connections = 200;
+    SELECT pg_reload_conf();
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata: { name: postgres, namespace: db }
+spec:
+  serviceName: postgres
+  replicas: 1
+  selector: { matchLabels: { app: postgres } }
+  template:
+    metadata: { labels: { app: postgres } }
+    spec:
+      nodeSelector:
+        node: hetzner-2
+      securityContext:
+        fsGroup: 999   # Debian postgres user/group in postgis image
+        fsGroupChangePolicy: OnRootMismatch
+      initContainers:
+        - name: fix-perms
+          image: busybox:1.36
+          command: ["sh","-c","chown -R 999:999 /var/lib/postgresql/data || true"]
+          securityContext: { runAsUser: 0 }
+          volumeMounts: [{ name: data, mountPath: /var/lib/postgresql/data }]
+      containers:
+        - name: postgres
+          image: postgis/postgis:16-3.4   # PostGIS-enabled PostgreSQL 16 image
+          env:
+            - name: POSTGRES_PASSWORD
+              valueFrom: { secretKeyRef: { name: postgres-auth, key: POSTGRES_PASSWORD } }
+            - { name: POSTGRES_USER, value: gitea }
+            - { name: POSTGRES_DB, value: gitea }
+            - name: POSTGRES_INITDB_ARGS
+              value: "--encoding=UTF8 --locale=C.UTF-8"
+          ports: [{ containerPort: 5432 }]
+          volumeMounts:
+            - { name: data, mountPath: /var/lib/postgresql/data }
+            - { name: init, mountPath: /docker-entrypoint-initdb.d }
+      # Mount the init scripts
+      volumes:
+        - name: init
+          configMap:
+            name: pg-init-sql
+            defaultMode: 0444
+  volumeClaimTemplates:
+    - metadata: { name: data }
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        storageClassName: local-ssd-hetzner
+        resources: { requests: { storage: 80Gi } }
diff --git a/k8s/postgres/secret.yaml b/k8s/postgres/secret.yaml new file mode 100644 index
0000000..cefb8de --- /dev/null +++ b/k8s/postgres/secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: { name: postgres-auth, namespace: db } +type: Opaque +stringData: + POSTGRES_PASSWORD: "PG-ADM1N" + GITEA_DB_PASSWORD: "G1TEA" diff --git a/k8s/prometheus/prometheus-config.yaml b/k8s/prometheus/prometheus-config.yaml new file mode 100644 index 0000000..9b7eabd --- /dev/null +++ b/k8s/prometheus/prometheus-config.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: { name: prometheus-config, namespace: monitoring } +data: + prometheus.yml: | + global: { scrape_interval: 15s } + scrape_configs: + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: [ { role: pod } ] + relabel_configs: + - action: keep + source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + regex: 'true' diff --git a/k8s/prometheus/prometheus.yaml b/k8s/prometheus/prometheus.yaml new file mode 100644 index 0000000..4a8e43b --- /dev/null +++ b/k8s/prometheus/prometheus.yaml @@ -0,0 +1,55 @@ +apiVersion: v1 +kind: Service +metadata: { name: prometheus, namespace: monitoring } +spec: + ports: [{ port: 9090, targetPort: 9090 }] + selector: { app: prometheus } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: prometheus, namespace: monitoring } +spec: + serviceName: prometheus + replicas: 1 + selector: { matchLabels: { app: prometheus } } + template: + metadata: { labels: { app: prometheus } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: prometheus + image: prom/prometheus:v2.53.0 + args: ["--config.file=/etc/prometheus/prometheus.yml","--storage.tsdb.path=/prometheus"] + ports: [{ containerPort: 9090 }] + volumeMounts: + - { name: data, mountPath: /prometheus } + - { name: config, mountPath: /etc/prometheus } + volumes: + - { name: config, configMap: { name: prometheus-config } } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + namespace: monitoring + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth-prometheus + nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" +spec: + ingressClassName: nginx + tls: [{ hosts: ["prometheus.betelgeusebytes.io"], secretName: prometheus-tls }] + rules: + - host: prometheus.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: prometheus, port: { number: 9090 } } } diff --git a/k8s/redis/redis-pv.yaml b/k8s/redis/redis-pv.yaml new file mode 100644 index 0000000..7ccc1da --- /dev/null +++ b/k8s/redis/redis-pv.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-redis +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/redis + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 \ No newline at end of file diff --git a/k8s/redis/redis.yaml b/k8s/redis/redis.yaml new file mode 100644 index 0000000..0ac5cff --- /dev/null +++ b/k8s/redis/redis.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: Service +metadata: { name: redis, namespace: db } +spec: + ports: [{ port: 6379, targetPort: 6379 }] + 
selector: { app: redis } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: { name: redis, namespace: db } +spec: + serviceName: redis + replicas: 1 + selector: { matchLabels: { app: redis } } + template: + metadata: { labels: { app: redis } } + spec: + nodeSelector: { node: hetzner-2 } + containers: + - name: redis + image: redis:7 + args: ["--requirepass", "$(REDIS_PASSWORD)"] + env: + - name: REDIS_PASSWORD + valueFrom: { secretKeyRef: { name: redis-auth, key: REDIS_PASSWORD } } + ports: [{ containerPort: 6379 }] + volumeMounts: + - { name: data, mountPath: /data } + volumeClaimTemplates: + - metadata: { name: data } + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 10Gi } } +--- +apiVersion: v1 +kind: Secret +metadata: { name: redis-auth, namespace: db } +type: Opaque +stringData: { REDIS_PASSWORD: "RED1S" } diff --git a/k8s/scripts/cleanup.sh b/k8s/scripts/cleanup.sh new file mode 100755 index 0000000..1fe758c --- /dev/null +++ b/k8s/scripts/cleanup.sh @@ -0,0 +1,319 @@ +#!/bin/bash + +set -e + +echo "==========================================================" +echo "Removing Existing Monitoring Stack" +echo "==========================================================" +echo "" + +RED='\033[0;31m' +YELLOW='\033[1;33m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}" +echo " - Prometheus (standalone or operator)" +echo " - Grafana" +echo " - Fluent Bit" +echo " - Vector" +echo " - Loki" +echo " - Tempo" +echo " - Node exporters" +echo " - kube-state-metrics" +echo " - Any monitoring/prometheus/grafana namespaces" +echo "" +echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}" +echo "" +read -p "Are you sure you want to continue? (yes/no): " confirm + +if [ "$confirm" != "yes" ]; then + echo "Cleanup cancelled." + exit 0 +fi + +echo "" +echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}" + +# Common namespace names for monitoring +NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging") + +for ns in "${NAMESPACES[@]}"; do + if kubectl get namespace "$ns" &> /dev/null; then + echo -e "${GREEN}Found namespace: $ns${NC}" + + # Show what's in the namespace + echo " Resources in $ns:" + kubectl get all -n "$ns" 2>/dev/null | head -20 || true + echo "" + + read -p " Delete namespace '$ns'? (yes/no): " delete_ns + if [ "$delete_ns" = "yes" ]; then + echo " Deleting namespace $ns..." + kubectl delete namespace "$ns" --timeout=120s || { + echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}" + kubectl delete namespace "$ns" --grace-period=0 --force & + } + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}" + +# Check if helm is available +if command -v helm &> /dev/null; then + echo "Checking for Helm releases..." + + # Common Helm release names + RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack") + + for release in "${RELEASES[@]}"; do + # Check all namespaces for the release + if helm list -A | grep -q "$release"; then + ns=$(helm list -A | grep "$release" | awk '{print $2}') + echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}" + read -p " Uninstall Helm release '$release'? (yes/no): " uninstall + if [ "$uninstall" = "yes" ]; then + echo " Uninstalling $release..." 
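+        # NOTE: "$ns" comes from `helm list -A | grep "$release" | awk '{print $2}'` above.
+        # grep matches substrings, so a pattern like "prometheus" also matches
+        # "kube-prometheus-stack"; in that case "$ns" may hold several namespaces and the
+        # uninstall below should be repeated once per matching release/namespace pair.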
+ helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}" + fi + fi + done +else + echo "Helm not found, skipping Helm releases check" +fi + +echo "" +echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}" + +# Remove common DaemonSets in kube-system or default +echo "Checking for monitoring DaemonSets..." +for ns in kube-system default; do + if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then + echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}" + kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector" + read -p " Delete these DaemonSets? (yes/no): " delete_ds + if [ "$delete_ds" = "yes" ]; then + kubectl delete daemonset -n "$ns" -l app=node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" -l app=vector --ignore-not-found + kubectl delete daemonset -n "$ns" node-exporter --ignore-not-found + kubectl delete daemonset -n "$ns" fluent-bit --ignore-not-found + kubectl delete daemonset -n "$ns" fluentd --ignore-not-found + kubectl delete daemonset -n "$ns" vector --ignore-not-found + fi + fi +done + +# Remove common Deployments +echo "" +echo "Checking for monitoring Deployments..." +for ns in kube-system default; do + if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring Deployments in $ns${NC}" + kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo" + read -p " Delete these Deployments? (yes/no): " delete_deploy + if [ "$delete_deploy" = "yes" ]; then + kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found + kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found + kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found + kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found + kubectl delete deployment -n "$ns" prometheus --ignore-not-found + kubectl delete deployment -n "$ns" grafana --ignore-not-found + kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found + fi + fi +done + +# Remove common StatefulSets +echo "" +echo "Checking for monitoring StatefulSets..." +for ns in kube-system default; do + if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}" + kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + read -p " Delete these StatefulSets? 
(yes/no): " delete_sts + if [ "$delete_sts" = "yes" ]; then + kubectl delete statefulset -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=grafana --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=loki --ignore-not-found + kubectl delete statefulset -n "$ns" -l app=tempo --ignore-not-found + kubectl delete statefulset -n "$ns" prometheus --ignore-not-found + kubectl delete statefulset -n "$ns" grafana --ignore-not-found + kubectl delete statefulset -n "$ns" loki --ignore-not-found + kubectl delete statefulset -n "$ns" tempo --ignore-not-found + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}" + +# Ask before removing ConfigMaps (they might contain important configs) +echo "Checking for monitoring ConfigMaps..." +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then + echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}" + kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent" + read -p " Delete these ConfigMaps? (yes/no): " delete_cm + if [ "$delete_cm" = "yes" ]; then + kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found + kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found + kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}" + +# Remove monitoring-related RBAC +echo "Checking for monitoring ClusterRoles..." +if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoles${NC}" + kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoles? (yes/no): " delete_cr + if [ "$delete_cr" = "yes" ]; then + kubectl delete clusterrole prometheus --ignore-not-found + kubectl delete clusterrole grafana --ignore-not-found + kubectl delete clusterrole kube-state-metrics --ignore-not-found + kubectl delete clusterrole fluent-bit --ignore-not-found + kubectl delete clusterrole node-exporter --ignore-not-found + fi +fi + +echo "Checking for monitoring ClusterRoleBindings..." +if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then + echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}" + kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter" + read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb + if [ "$delete_crb" = "yes" ]; then + kubectl delete clusterrolebinding prometheus --ignore-not-found + kubectl delete clusterrolebinding grafana --ignore-not-found + kubectl delete clusterrolebinding kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding fluent-bit --ignore-not-found + kubectl delete clusterrolebinding node-exporter --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}" + +# Check for monitoring PVCs +echo "Checking for monitoring PersistentVolumeClaims..." 
+for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then + echo -e "${GREEN}Found monitoring PVCs in $ns${NC}" + kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo" + echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}" + read -p " Delete these PVCs? (yes/no): " delete_pvc + if [ "$delete_pvc" = "yes" ]; then + kubectl delete pvc -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete pvc -n "$ns" -l app=grafana --ignore-not-found + kubectl delete pvc -n "$ns" -l app=loki --ignore-not-found + kubectl delete pvc -n "$ns" -l app=tempo --ignore-not-found + # Also try by name patterns + kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true + fi + fi + fi +done + +# Check for monitoring PVs +echo "" +echo "Checking for monitoring PersistentVolumes..." +if kubectl get pv 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|monitoring"; then + echo -e "${GREEN}Found monitoring PVs${NC}" + kubectl get pv | grep -E "prometheus|grafana|loki|tempo|monitoring" + echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}" + read -p " Delete these PVs? (yes/no): " delete_pv + if [ "$delete_pv" = "yes" ]; then + kubectl get pv -o name | grep -E "prometheus|grafana|loki|tempo|monitoring" | xargs -r kubectl delete || true + fi +fi + +echo "" +echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}" + +for ns in kube-system default monitoring prometheus grafana; do + if kubectl get namespace "$ns" &> /dev/null; then + if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then + echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}" + kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki" + read -p " Delete these Ingresses? (yes/no): " delete_ing + if [ "$delete_ing" = "yes" ]; then + kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found + kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found + kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found + kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found + fi + fi + fi +done + +echo "" +echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}" + +# Check for Prometheus Operator CRDs +if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then + echo -e "${GREEN}Found Prometheus Operator CRDs${NC}" + kubectl get crd | grep "monitoring.coreos.com" + echo "" + echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}" + read -p " Delete Prometheus Operator CRDs? 
(yes/no): " delete_crd + if [ "$delete_crd" = "yes" ]; then + kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found + kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found + kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found + kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found + kubectl delete crd probes.monitoring.coreos.com --ignore-not-found + kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found + fi +fi + +echo "" +echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}" +echo "" +echo "You may have monitoring data stored on your nodes at:" +echo " - /mnt/local-ssd/prometheus" +echo " - /mnt/local-ssd/grafana" +echo " - /mnt/local-ssd/loki" +echo " - /mnt/local-ssd/tempo" +echo " - /var/lib/prometheus" +echo " - /var/lib/grafana" +echo "" +echo "To remove these, SSH to each node and run:" +echo " sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}" +echo " sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}" +echo "" +read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs + +echo "" +echo -e "${GREEN}==========================================================" +echo "Existing Monitoring Stack Cleanup Complete!" +echo "==========================================================${NC}" +echo "" +echo "Summary of actions taken:" +echo " - Removed monitoring namespaces (if confirmed)" +echo " - Uninstalled Helm releases (if found and confirmed)" +echo " - Removed standalone monitoring components" +echo " - Removed monitoring ConfigMaps" +echo " - Removed RBAC resources" +echo " - Removed PVCs and PVs (if confirmed)" +echo " - Removed Ingresses" +echo " - Removed Prometheus Operator CRDs (if confirmed)" +echo "" +echo -e "${YELLOW}Next Steps:${NC}" +echo "1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'" +echo "2. Clean up node data directories (see above)" +echo "3. 
Deploy new observability stack: ./deploy.sh" +echo "" \ No newline at end of file diff --git a/k8s/sso/sso.yaml b/k8s/sso/sso.yaml new file mode 100644 index 0000000..6311842 --- /dev/null +++ b/k8s/sso/sso.yaml @@ -0,0 +1,98 @@ +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-auth +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/auth + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +# k8s/auth/keycloak/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: keycloak-admin, namespace: db } +type: Opaque +stringData: { KEYCLOAK_ADMIN: "admin", KEYCLOAK_ADMIN_PASSWORD: "admin" } + +--- +# k8s/auth/keycloak/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: keycloak-data, namespace: db } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 10Gi } } + +--- +# k8s/auth/keycloak/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: keycloak, namespace: db } +spec: + replicas: 1 + selector: { matchLabels: { app: keycloak } } + template: + metadata: { labels: { app: keycloak } } + spec: + # Ensure the PV is owned by the Keycloak UID/GID + securityContext: + fsGroup: 1000 + initContainers: + - name: fix-permissions + image: busybox + command: ['sh', '-c', 'chown -R 1000:1000 /opt/keycloak/data && chmod -R 755 /opt/keycloak/data'] + volumeMounts: + - name: data + mountPath: /opt/keycloak/data + containers: + - name: keycloak + image: quay.io/keycloak/keycloak:latest + args: ["start","--http-enabled=true","--proxy-headers=xforwarded","--hostname-strict=false"] + env: + - { name: KEYCLOAK_ADMIN, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN } } } + - { name: KEYCLOAK_ADMIN_PASSWORD, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN_PASSWORD } } } + ports: [{ containerPort: 8080 }] + volumeMounts: [{ name: data, mountPath: /opt/keycloak/data }] + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + volumes: + - name: data + persistentVolumeClaim: { claimName: keycloak-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: keycloak, namespace: db } +spec: { selector: { app: keycloak }, ports: [ { port: 80, targetPort: 8080 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: keycloak + namespace: db + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["auth.betelgeusebytes.io"], secretName: keycloak-tls }] + rules: + - host: auth.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: keycloak, port: { number: 80 } } } diff --git a/k8s/storage/persistent-volumes.yaml b/k8s/storage/persistent-volumes.yaml new file mode 100644 index 0000000..fa0db43 --- /dev/null +++ b/k8s/storage/persistent-volumes.yaml @@ -0,0 +1,175 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-postgres +spec: + capacity: + storage: 80Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/postgres + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: 
pv-elasticsearch +spec: + capacity: + storage: 300Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/elasticsearch + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-gitea +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/gitea + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-jupyter +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/jupyter + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-kafka +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/kafka + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-data +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-data + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-zookeeper-log +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/zookeeper-log + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-prometheus +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/k8s/storage/storageclass.yaml b/k8s/storage/storageclass.yaml new file mode 100644 index 0000000..ed7d4e3 --- /dev/null +++ b/k8s/storage/storageclass.yaml @@ -0,0 +1,6 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: local-ssd-hetzner +provisioner: kubernetes.io/no-provisioner +volumeBindingMode: WaitForFirstConsumer diff --git a/k8s/tei/tei.yaml b/k8s/tei/tei.yaml new file mode 100644 index 0000000..ae1549e --- /dev/null +++ b/k8s/tei/tei.yaml @@ -0,0 +1,37 @@ +# k8s/ai/tei/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: tei, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: tei } } + template: + metadata: { labels: { app: tei } } + spec: + containers: + - name: tei + 
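+          # CPU build of Text Embeddings Inference; no model cache volume is mounted, so the
+          # model referenced by MODEL_ID is downloaded from Hugging Face at startup and the
+          # first requests after a pod restart can be slow.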
image: ghcr.io/huggingface/text-embeddings-inference:cpu-latest + env: [{ name: MODEL_ID, value: "mixedbread-ai/mxbai-embed-large-v1" }] + ports: [{ containerPort: 80 }] +--- +apiVersion: v1 +kind: Service +metadata: { name: tei, namespace: ml } +spec: { selector: { app: tei }, ports: [ { port: 80, targetPort: 80 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tei + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["embeddings.betelgeusebytes.io"], secretName: tei-tls }] + rules: + - host: embeddings.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: tei, port: { number: 80 } } } diff --git a/k8s/trading/ib-gateway.yaml b/k8s/trading/ib-gateway.yaml new file mode 100644 index 0000000..31bbaaa --- /dev/null +++ b/k8s/trading/ib-gateway.yaml @@ -0,0 +1,541 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: trading + labels: + name: trading + environment: production +--- +# OPTIONAL: Use this if you want to persist IB Gateway settings/logs +# across pod restarts. For most use cases, this is NOT needed since +# IB Gateway is mostly stateless and credentials are in Secrets. +# +# Only create this PV/PVC if you need to persist: +# - TWS session data +# - Custom workspace layouts +# - Historical API usage logs + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: ib-gateway-data + labels: + type: local + app: ib-gateway +spec: + capacity: + storage: 5Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/local-ssd/ib-gateway # Adjust to your local SSD path + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ib-gateway-data + namespace: trading +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: local-storage + selector: + matchLabels: + app: ib-gateway + +# To use this PVC, add to Deployment volumeMounts: +# - name: data +# mountPath: /root/Jts +# And to volumes: +# - name: data +# persistentVolumeClaim: +# claimName: ib-gateway-data +--- +apiVersion: v1 +kind: Secret +metadata: + name: ib-credentials + namespace: trading +type: Opaque +stringData: + # IMPORTANT: Replace these with your actual IB credentials + # For paper trading, use your paper trading account + username: "saladin85" + password: "3Lcd@05041985" + # Trading mode: "paper" or "live" + trading-mode: "paper" + + # IB Gateway config (jts.ini equivalent) + # This enables headless mode and configures ports + ibgateway.conf: | + [IBGateway] + TradingMode=paper + ApiOnly=true + ReadOnlyApi=false + TrustedIPs=127.0.0.1 + + [IBGatewayAPI] + ApiPortNumber=4002 + + [Logon] + UseRemoteSettings=no + Locale=en + ColorPaletteName=dark + + [Display] + ShowSplashScreen=no +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ib-gateway-config + namespace: trading +data: + # Startup script to configure IB Gateway for headless operation + startup.sh: | + #!/bin/bash + set -e + + echo "Starting IB Gateway in headless mode..." 
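+    # TWS_USERID, TWS_PASSWORD and TRADING_MODE are expected to be injected as environment
+    # variables by the Deployment from the ib-credentials Secret; TWS_PORT defaults to 4002
+    # there and is overridden below when TRADING_MODE=live.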
+ echo "Trading Mode: ${TRADING_MODE}" + echo "Port: ${TWS_PORT}" + + # Configure based on trading mode + if [ "${TRADING_MODE}" == "live" ]; then + export TWS_PORT=4001 + echo "⚠️ LIVE TRADING MODE - USE WITH CAUTION ⚠️" + else + export TWS_PORT=4002 + echo "📝 Paper Trading Mode (Safe)" + fi + # IMPORTANT: use the env vars provided by the Deployment + export IB_USERNAME="${TWS_USERID}" + export IB_PASSWORD="${TWS_PASSWORD}" + + # Start IB Gateway + exec /opt/ibgateway/ibgateway-latest-standalone-linux-x64.sh \ + --tws-path=/root/Jts \ + --tws-settings-path=/root \ + --user="${IB_USERNAME}" \ + --pw="${IB_PASSWORD}" \ + --mode="${TRADING_MODE}" \ + --port="${TWS_PORT}" + + # Health check script + healthcheck.sh: | + #!/bin/bash + # Check if TWS API port is listening + # PORT=${TWS_PORT:-4002} + # nc -z localhost $PORT + # exit $? + #!/bin/sh + # Pure-python TCP check (no nc required) + PORT="${TWS_PORT:-4002}" + python - <<'PY' + import os, socket, sys + port = int(os.environ.get("TWS_PORT", os.environ.get("PORT", "4002"))) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(2) + try: + s.connect(("127.0.0.1", port)) + sys.exit(0) + except Exception: + sys.exit(1) + finally: + s.close() + PY +--- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: ib-gateway +# namespace: trading +# labels: +# app: ib-gateway +# component: trading-infrastructure +# spec: +# replicas: 1 # IB Gateway should only have 1 instance per account +# strategy: +# type: Recreate # Avoid multiple simultaneous logins +# selector: +# matchLabels: +# app: ib-gateway +# template: +# metadata: +# labels: +# app: ib-gateway +# annotations: +# prometheus.io/scrape: "false" # No metrics endpoint by default +# spec: +# # Pin to hetzner-2 (matches your existing pattern) +# nodeSelector: +# kubernetes.io/hostname: hetzner-2 + +# # Security context +# securityContext: +# runAsNonRoot: false # IB Gateway requires root for VNC (even if unused) +# fsGroup: 1000 + +# containers: +# - name: ib-gateway +# # Using community-maintained IB Gateway image +# # Alternative: waytrade/ib-gateway:latest +# image: ghcr.io/gnzsnz/ib-gateway:stable +# imagePullPolicy: IfNotPresent + +# env: +# - name: TWS_USERID +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: username +# - name: TWS_PASSWORD +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: password +# - name: TRADING_MODE +# valueFrom: +# secretKeyRef: +# name: ib-credentials +# key: trading-mode +# - name: TWS_PORT +# value: "4002" # Default to paper trading +# - name: READ_ONLY_API +# value: "no" + +# # Ports +# ports: +# - name: paper-trading +# containerPort: 4002 +# protocol: TCP +# - name: live-trading +# containerPort: 4001 +# protocol: TCP +# - name: vnc +# containerPort: 5900 +# protocol: TCP # VNC (not exposed externally) + +# # Resource limits +# resources: +# requests: +# memory: "1Gi" +# cpu: "500m" +# limits: +# memory: "2Gi" +# cpu: "1000m" + +# # Liveness probe (check if API port is responsive) +# startupProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 60 # Wait 60s before first check +# periodSeconds: 10 # Check every 10s +# timeoutSeconds: 5 +# failureThreshold: 18 # 60s + (10s * 18) = 240s total startup time + +# livenessProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 0 # IB Gateway takes time to start +# periodSeconds: 60 +# timeoutSeconds: 5 +# failureThreshold: 3 + +# # Readiness probe +# readinessProbe: +# tcpSocket: +# port: 4002 +# initialDelaySeconds: 0 +# periodSeconds: 10 +# 
timeoutSeconds: 5 +# failureThreshold: 2 + +# # Volume mounts for config +# volumeMounts: +# - name: ib-config +# mountPath: /root/Jts/jts.ini +# subPath: ibgateway.conf +# - name: startup-script +# mountPath: /startup.sh +# subPath: startup.sh +# - name: data +# mountPath: /root/Jts + +# # Logging to stdout (Fluent Bit will collect) +# # IB Gateway logs go to /root/Jts/log by default +# lifecycle: +# postStart: +# exec: +# command: +# - /bin/sh +# - -c +# - | +# mkdir -p /root/Jts/log +# ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true + +# volumes: +# - name: ib-config +# secret: +# secretName: ib-credentials +# defaultMode: 0644 +# - name: startup-script +# configMap: +# name: ib-gateway-config +# defaultMode: 0755 +# - name: data +# persistentVolumeClaim: +# claimName: ib-gateway-data + +# # Restart policy +# restartPolicy: Always + +# # DNS policy for internal cluster resolution +# dnsPolicy: ClusterFirst +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway + component: trading-infrastructure +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ib-gateway + template: + metadata: + labels: + app: ib-gateway + annotations: + prometheus.io/scrape: "false" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + + securityContext: + runAsNonRoot: false + fsGroup: 1000 + + # Seed writable jts.ini into the PVC once + initContainers: + - name: seed-jts-config + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /data + if [ ! -f /data/jts.ini ]; then + echo "Seeding jts.ini into PVC" + cp /config/ibgateway.conf /data/jts.ini + chmod 644 /data/jts.ini + else + echo "jts.ini already exists in PVC" + fi + volumeMounts: + - name: ib-config + mountPath: /config + readOnly: true + - name: data + mountPath: /data + + containers: + # ------------------------------------------------------------------ + # IB Gateway + # ------------------------------------------------------------------ + - name: ib-gateway + image: ghcr.io/gnzsnz/ib-gateway:stable + imagePullPolicy: IfNotPresent + + env: + - name: TWS_USERID + valueFrom: + secretKeyRef: + name: ib-credentials + key: username + - name: TWS_PASSWORD + valueFrom: + secretKeyRef: + name: ib-credentials + key: password + - name: TRADING_MODE + valueFrom: + secretKeyRef: + name: ib-credentials + key: trading-mode + - name: TWS_PORT + value: "4002" + - name: READ_ONLY_API + value: "no" + + ports: + - name: ib-api-local + containerPort: 4002 + protocol: TCP + - name: live-trading + containerPort: 4001 + protocol: TCP + - name: vnc + containerPort: 5900 + protocol: TCP + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + + # IMPORTANT: Probes should check the local IB port (4002) + startupProbe: + tcpSocket: + port: 4002 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + + livenessProbe: + tcpSocket: + port: 4002 + periodSeconds: 60 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + tcpSocket: + port: 4002 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 2 + + volumeMounts: + - name: data + mountPath: /root/Jts + + lifecycle: + postStart: + exec: + command: + - sh + - -c + - | + mkdir -p /root/Jts/log + ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true + + # ------------------------------------------------------------------ + # Sidecar TCP proxy: accepts cluster traffic, forwards to localhost:4002 + # 
------------------------------------------------------------------ + - name: ib-api-proxy + image: alpine/socat:1.8.0.0 + imagePullPolicy: IfNotPresent + args: + - "-d" + - "-d" + - "TCP-LISTEN:4003,fork,reuseaddr" + - "TCP:127.0.0.1:4002" + ports: + - name: ib-api + containerPort: 4003 + protocol: TCP + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "128Mi" + cpu: "100m" + # basic probe: is proxy listening + readinessProbe: + tcpSocket: + port: 4003 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + + volumes: + - name: ib-config + secret: + secretName: ib-credentials + defaultMode: 0644 + + - name: data + persistentVolumeClaim: + claimName: ib-gateway-data + + restartPolicy: Always + dnsPolicy: ClusterFirst + + +--- +# apiVersion: v1 +# kind: Service +# metadata: +# name: ib-gateway +# namespace: trading +# labels: +# app: ib-gateway +# spec: +# type: ClusterIP # Internal-only, not exposed publicly +# clusterIP: None # Headless service (optional, remove if you want a stable ClusterIP) +# selector: +# app: ib-gateway +# ports: +# - name: paper-trading +# port: 4002 +# targetPort: 4002 +# protocol: TCP +# - name: live-trading +# port: 4001 +# targetPort: 4001 +# protocol: TCP +# sessionAffinity: ClientIP # Stick to same pod (important for stateful TWS sessions) +# sessionAffinityConfig: +# clientIP: +# timeoutSeconds: 3600 # 1 hour session stickiness + +apiVersion: v1 +kind: Service +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway +spec: + type: ClusterIP + selector: + app: ib-gateway + ports: + - name: paper-trading + port: 4002 + targetPort: 4003 # <-- proxy sidecar, not the gateway directly + protocol: TCP + - name: live-trading + port: 4001 + targetPort: 4001 + protocol: TCP + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 diff --git a/k8s/trading/ib-gateway2.yaml b/k8s/trading/ib-gateway2.yaml new file mode 100644 index 0000000..81fc23b --- /dev/null +++ b/k8s/trading/ib-gateway2.yaml @@ -0,0 +1,169 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: trading + labels: + name: trading + environment: production +--- +apiVersion: v1 +kind: Secret +metadata: + name: ib-credentials + namespace: trading +type: Opaque +stringData: + # Rotate your creds (you pasted them earlier). 
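+  # Alternative (sketch): rather than committing plaintext values, the same
+  # Secret can be created directly with kubectl and this stringData block
+  # dropped from the repo:
+  #   kubectl create secret generic ib-credentials -n trading \
+  #     --from-literal=username='<ib-user>' \
+  #     --from-literal=password='<ib-password>' \
+  #     --from-literal=trading-mode='paper'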
+ username: "saladin85" + password: "3Lcd@05041985" + trading-mode: "paper" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway + component: trading-infrastructure +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ib-gateway + template: + metadata: + labels: + app: ib-gateway + annotations: + prometheus.io/scrape: "false" + spec: + nodeSelector: + kubernetes.io/hostname: hetzner-2 + + # Keep your original security context + securityContext: + runAsNonRoot: false + fsGroup: 1000 + + containers: + - name: ib-gateway + image: ghcr.io/gnzsnz/ib-gateway:stable + imagePullPolicy: IfNotPresent + + # IMPORTANT: use env vars this image expects + env: + - name: TWS_USERID + valueFrom: + secretKeyRef: + name: ib-credentials + key: username + - name: TWS_PASSWORD + valueFrom: + secretKeyRef: + name: ib-credentials + key: password + - name: TRADING_MODE + valueFrom: + secretKeyRef: + name: ib-credentials + key: trading-mode + - name: READ_ONLY_API + value: "no" + + # These two match what your log shows the image uses + - name: API_PORT + value: "4002" + - name: SOCAT_PORT + value: "4004" + + # optional but nice + - name: TIME_ZONE + value: "Etc/UTC" + - name: TWOFA_TIMEOUT_ACTION + value: "exit" + + ports: + # IB API ports (inside container / localhost use) + - name: api-paper + containerPort: 4002 + protocol: TCP + - name: api-live + containerPort: 4001 + protocol: TCP + + # socat relay port for non-localhost clients (what we expose via Service) + - name: api-socat + containerPort: 4004 + protocol: TCP + + # optional UI/VNC + - name: vnc + containerPort: 5900 + protocol: TCP + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + + # Probe the socat port (represents remote connectivity) + startupProbe: + tcpSocket: + port: 4004 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + + readinessProbe: + tcpSocket: + port: 4004 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 2 + + livenessProbe: + tcpSocket: + port: 4004 + periodSeconds: 60 + timeoutSeconds: 5 + failureThreshold: 3 + + restartPolicy: Always + dnsPolicy: ClusterFirst +--- +apiVersion: v1 +kind: Service +metadata: + name: ib-gateway + namespace: trading + labels: + app: ib-gateway +spec: + type: ClusterIP + selector: + app: ib-gateway + ports: + # Clients connect to 4002, but we forward to SOCAT_PORT=4004 + - name: paper-trading + port: 4002 + targetPort: 4004 + protocol: TCP + + # If you truly need live, you should relay live via another socat port too. + # For now keep it direct (or remove it entirely for safety). 
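+  # Sketch, reusing the socat sidecar pattern from ib-gateway.yaml: add a
+  # second relay to the Deployment, e.g.
+  #   - name: ib-live-proxy
+  #     image: alpine/socat:1.8.0.0
+  #     args: ["TCP-LISTEN:4005,fork,reuseaddr", "TCP:127.0.0.1:4001"]
+  # (4005 is an arbitrary example port), then change targetPort below from
+  # 4001 to 4005 so cluster clients reach live through the relay as well.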
+ - name: live-trading + port: 4001 + targetPort: 4001 + protocol: TCP + + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 diff --git a/k8s/vector/qdrant.yaml b/k8s/vector/qdrant.yaml new file mode 100644 index 0000000..b035db9 --- /dev/null +++ b/k8s/vector/qdrant.yaml @@ -0,0 +1,80 @@ +# k8s/vec/qdrant/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: qdrant-data, namespace: db} +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 20Gi } } + +--- +# k8s/vec/qdrant/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: qdrant, namespace: db} +spec: + replicas: 1 + selector: { matchLabels: { app: qdrant } } + template: + metadata: { labels: { app: qdrant } } + spec: + containers: + - name: qdrant + image: qdrant/qdrant:latest + ports: + - { containerPort: 6333 } # HTTP + Web UI + - { containerPort: 6334 } # gRPC + volumeMounts: + - { name: data, mountPath: /qdrant/storage } + volumes: + - name: data + persistentVolumeClaim: { claimName: qdrant-data } +--- +apiVersion: v1 +kind: Service +metadata: { name: qdrant, namespace: db} +spec: + selector: { app: qdrant } + ports: + - { name: http, port: 80, targetPort: 6333 } + - { name: grpc, port: 6334, targetPort: 6334 } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: qdrant + namespace: db + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["vector.betelgeusebytes.io"], secretName: qdrant-tls }] + rules: + - host: vector.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: qdrant, port: { number: 80 } } } +--- +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-qdrant +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/qdrant + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 diff --git a/k8s/vllm/vllm.yaml b/k8s/vllm/vllm.yaml new file mode 100644 index 0000000..1d7fb6f --- /dev/null +++ b/k8s/vllm/vllm.yaml @@ -0,0 +1,142 @@ +# PV +apiVersion: v1 +kind: PersistentVolume +metadata: + name: pv-vllm +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-ssd-hetzner + local: + path: /mnt/local-ssd/vllm + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - hetzner-2 +--- +# k8s/ai/vllm/secret.yaml +apiVersion: v1 +kind: Secret +metadata: { name: vllm-auth, namespace: ml } +type: Opaque +stringData: { API_KEY: "replace_me" } + +--- +# k8s/ai/ollama/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: { name: ollama, namespace: ml } +spec: + replicas: 1 + selector: { matchLabels: { app: ollama } } + template: + metadata: { labels: { app: ollama } } + spec: + securityContext: + runAsUser: 0 # needed so the init can write into /root/.ollama + initContainers: + - name: warm-models + image: ollama/ollama:latest + command: ["/bin/sh","-c"] + args: + - | + ollama serve & # start a temp daemon + sleep 2 + # pull one or more small, quantized models for CPU + ollama pull qwen2.5:3b-instruct-q4_K_M || true + ollama pull llama3.2:3b-instruct-q4_K_M || true + pkill ollama || true + volumeMounts: + - { 
name: data, mountPath: /root/.ollama } + containers: + - name: ollama + image: ollama/ollama:latest + env: + - { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser + ports: + - { containerPort: 11434 } + volumeMounts: + - { name: data, mountPath: /root/.ollama } + resources: + requests: { cpu: "2", memory: "4Gi" } + limits: { cpu: "4", memory: "8Gi" } + volumes: + - name: data + persistentVolumeClaim: { claimName: ollama-data } + +--- +# k8s/ai/ollama/svc-ing.yaml +apiVersion: v1 +kind: Service +metadata: { name: ollama, namespace: ml } +spec: + selector: { app: ollama } + ports: [ { name: http, port: 80, targetPort: 11434 } ] + +# --- +# # old k8s/ai/vllm/deploy.yaml +# apiVersion: apps/v1 +# kind: Deployment +# metadata: { name: vllm, namespace: ml } +# spec: +# replicas: 1 +# selector: { matchLabels: { app: vllm } } +# template: +# metadata: { labels: { app: vllm } } +# spec: +# containers: +# - name: vllm +# image: vllm/vllm-openai:latest +# args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"] +# env: +# - name: VLLM_API_KEY +# valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } } +# ports: [{ containerPort: 8000 }] +# resources: +# limits: +# nvidia.com/gpu: 1 +# requests: +# nvidia.com/gpu: 1 +# volumeMounts: +# - { name: cache, mountPath: /root/.cache/huggingface } +# volumes: +# - name: cache +# persistentVolumeClaim: { claimName: vllm-cache-pvc } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: { name: ollama-data, namespace: ml } +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-ssd-hetzner + resources: { requests: { storage: 50Gi } } +# --- +#old k8s/ai/vllm/svc-ing.yaml +# apiVersion: v1 +# kind: Service +# metadata: { name: vllm, namespace: ml } +# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] } +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vllm + namespace: ml + annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } +spec: + ingressClassName: nginx + tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }] + rules: + - host: llm.betelgeusebytes.io + http: + paths: + - path: / + pathType: Prefix + backend: { service: { name: vllm, port: { number: 80 } } }
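+# Note: the vllm Deployment and Service above are commented out in favour of
+# Ollama, but this Ingress still routes llm.betelgeusebytes.io to a Service
+# named "vllm". Unless a vllm Service exists elsewhere, the backend likely
+# needs to target the ollama Service instead, e.g.:
+#   backend: { service: { name: ollama, port: { number: 80 } } }
+# Quick verification once DNS/TLS are in place (sketch):
+#   curl https://llm.betelgeusebytes.io/api/tags
+# /api/tags is Ollama's model-listing endpoint and should return the models
+# pulled by the warm-models init container.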