=== ./ansible/inventories/prod/group_vars/all.yml === cluster_name: prod k8s_version: "v1.30.3" control_plane_endpoint: "95.217.89.53:6443" # switch later to cp.k8s.betelgeusebytes.io:6443 pod_cidr: "10.244.0.0/16" service_cidr: "10.96.0.0/12" cilium_version: "1.15.7" local_path_dir: "/srv/k8s" local_sc_name: "local-ssd-hetzner" stateful_node_label_key: "node" stateful_node_label_val: "hetzner-2" === ./ansible/inventories/prod/hosts.ini === [k8s_control_plane] hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 [k8s_workers] hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11 hetzner-2 ansible_host=138.201.254.97 public_ip=138.201.254.97 wg_address=10.66.0.12 [k8s_nodes:children] k8s_control_plane k8s_workers # add tiny VPS control-planes here when ready [new_control_planes] # cp-a ansible_host= public_ip= wg_address=10.66.0.10 [all:vars] ansible_user=root ansible_password=3Lcd0504 ansible_become=true === ./ansible/playbooks/add-control-planes.yml === - hosts: k8s_control_plane[0] become: yes roles: - kubeadm_cp_discovery - hosts: new_control_planes become: yes roles: - common - wireguard - containerd - kubernetes - hosts: new_control_planes become: yes roles: - kubeadm_join_cp vars: kubeadm_cp_join_cmd: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_cp_join_cmd | default(kubeadm_cp_join_cmd) }}" === ./ansible/playbooks/site.yml === - hosts: k8s_nodes become: yes # serial: 1 roles: # - ../roles/common #- ../roles/wireguard #- ../roles/containerd #- ../roles/kubernetes - hosts: k8s_control_plane become: yes roles: - ../roles/kubeadm_init # - hosts: k8s_workers # become: yes # roles: # - ../roles/kubeadm_join - hosts: k8s_control_plane become: yes roles: # - ../roles/cilium # - ../roles/ingress #- ../roles/cert_manager - hosts: k8s_nodes become: yes roles: #- ../roles/storage_local_path - ../roles/labels === ./ansible/roles/cert_manager/tasks/main.yml === - name: Install cert-manager shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml - name: Wait for cert-manager pods to be ready shell: kubectl wait --for=condition=ready --timeout=300s pod -l app.kubernetes.io/instance=cert-manager -n cert-manager - name: Wait for webhook endpoint to be ready shell: | for i in {1..30}; do if kubectl get endpoints cert-manager-webhook -n cert-manager -o jsonpath='{.subsets[*].addresses[*].ip}' | grep -q .; then echo "Webhook endpoint is ready" exit 0 fi echo "Waiting for webhook endpoint... 
attempt $i/30" sleep 2 done exit 1 - name: Test webhook connectivity shell: kubectl run test-webhook --image=curlimages/curl:latest --rm -i --restart=Never -- curl -k https://cert-manager-webhook.cert-manager.svc:443/healthz register: webhook_test ignore_errors: yes - name: Display webhook test result debug: var: webhook_test - name: ClusterIssuer copy: dest: /root/cluster-issuer-prod.yaml content: | apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: letsencrypt-prod spec: acme: - name: ClusterIssuer copy: dest: /root/cluster-issuer-prod.yaml content: | apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: letsencrypt-prod spec: acme: email: admin@betelgeusebytes.io server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-prod-key solvers: - http01: ingress: class: nginx - name: Temporarily disable cert-manager webhook shell: | kubectl delete validatingwebhookconfiguration cert-manager-webhook || true ignore_errors: yes - name: Apply ClusterIssuer command: kubectl apply -f /root/cluster-issuer-prod.yaml - name: Reinstall cert-manager to restore webhook shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml === ./ansible/roles/cilium/tasks/main.yml === - name: Install cilium CLI shell: | curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz tar xzf cilium-linux-amd64.tar.gz -C /usr/local/bin args: { creates: /usr/local/bin/cilium } - name: Deploy cilium shell: | cilium install --version {{ cilium_version }} --set kubeProxyReplacement=strict --set bpf.masquerade=true === ./ansible/roles/common/tasks/main.yml === - name: Disable swap command: swapoff -a when: ansible_swaptotal_mb|int > 0 - name: Ensure swap disabled on boot replace: path: /etc/fstab regexp: '^([^#].*\sswap\s)' replace: '# \1' - name: Kernel modules copy: dest: /etc/modules-load.d/containerd.conf content: | overlay br_netfilter - name: Load modules command: modprobe {{ item }} loop: [overlay, br_netfilter] - name: Sysctl for k8s copy: dest: /etc/sysctl.d/99-kubernetes.conf content: | net.bridge.bridge-nf-call-iptables = 1 net.bridge.bridge-nf-call-ip6tables = 1 net.ipv4.ip_forward = 1 vm.max_map_count = 262144 - name: Apply sysctl command: sysctl --system === ./ansible/roles/containerd/tasks/main.yml === - name: Install containerd apt: name: containerd state: present update_cache: yes - name: Ensure containerd config directory file: path: /etc/containerd state: directory mode: '0755' - name: Generate default config shell: containerd config default > /etc/containerd/config.toml args: { creates: /etc/containerd/config.toml } - name: Ensure SystemdCgroup=true replace: path: /etc/containerd/config.toml regexp: 'SystemdCgroup = false' replace: 'SystemdCgroup = true' - name: Restart containerd service: name: containerd state: restarted enabled: yes === ./ansible/roles/ingress/tasks/main.yml === - name: Deploy ingress-nginx (baremetal) shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml === ./ansible/roles/kubeadm_cp_discovery/tasks/main.yml === - name: Upload certs and get certificate key shell: kubeadm init phase upload-certs --upload-certs | tail -n 1 register: cert_key - name: Compute CA cert hash shell: | openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}' register: 
=== ./ansible/roles/common/tasks/main.yml === - name: Disable swap command: swapoff -a when: ansible_swaptotal_mb|int > 0 - name: Ensure swap disabled on boot replace: path: /etc/fstab regexp: '^([^#].*\sswap\s)' replace: '# \1' - name: Kernel modules copy: dest: /etc/modules-load.d/containerd.conf content: | overlay br_netfilter - name: Load modules command: modprobe {{ item }} loop: [overlay, br_netfilter] - name: Sysctl for k8s copy: dest: /etc/sysctl.d/99-kubernetes.conf content: | net.bridge.bridge-nf-call-iptables = 1 net.bridge.bridge-nf-call-ip6tables = 1 net.ipv4.ip_forward = 1 vm.max_map_count = 262144 - name: Apply sysctl command: sysctl --system
=== ./ansible/roles/containerd/tasks/main.yml === - name: Install containerd apt: name: containerd state: present update_cache: yes - name: Ensure containerd config directory file: path: /etc/containerd state: directory mode: '0755' - name: Generate default config shell: containerd config default > /etc/containerd/config.toml args: { creates: /etc/containerd/config.toml } - name: Ensure SystemdCgroup=true replace: path: /etc/containerd/config.toml regexp: 'SystemdCgroup = false' replace: 'SystemdCgroup = true' - name: Restart containerd service: name: containerd state: restarted enabled: yes
=== ./ansible/roles/ingress/tasks/main.yml === - name: Deploy ingress-nginx (baremetal) shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml
=== ./ansible/roles/kubeadm_cp_discovery/tasks/main.yml === - name: Upload certs and get certificate key shell: kubeadm init phase upload-certs --upload-certs | tail -n 1 register: cert_key - name: Compute CA cert hash shell: | openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}' register: ca_hash - name: Create short-lived token shell: kubeadm token create --ttl 30m register: join_token - name: Determine control-plane endpoint set_fact: cp_endpoint: "{{ hostvars[inventory_hostname].control_plane_endpoint | default(ansible_host ~ ':6443') }}" - set_fact: kubeadm_cp_join_cmd: >- kubeadm join {{ cp_endpoint }} --token {{ join_token.stdout }} --discovery-token-ca-cert-hash sha256:{{ ca_hash.stdout }} --control-plane --certificate-key {{ cert_key.stdout }}
=== ./ansible/roles/kubeadm_init/tasks/main.yml === # - name: Write kubeadm config # template: # src: kubeadm-config.yaml.j2 # dest: /etc/kubernetes/kubeadm-config.yaml # - name: Pre-pull images # command: kubeadm config images pull # - name: Init control-plane # command: kubeadm init --config=/etc/kubernetes/kubeadm-config.yaml # args: { creates: /etc/kubernetes/admin.conf } # - name: Setup kubeconfig # shell: | # mkdir -p $HOME/.kube # cp -i /etc/kubernetes/admin.conf $HOME/.kube/config # chown $(id -u):$(id -g) $HOME/.kube/config - name: Save join command shell: kubeadm token create --print-join-command register: join_cmd - set_fact: kubeadm_join_command_all: "{{ join_cmd.stdout }}"
=== ./ansible/roles/kubeadm_join/tasks/main.yml === - name: Join node to cluster command: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_join_command_all }} --ignore-preflight-errors=FileAvailable--etc-kubernetes-kubelet.conf,FileAvailable--etc-kubernetes-pki-ca.crt,Port-10250"
=== ./ansible/roles/kubeadm_join_cp/tasks/main.yml === - name: Ensure join command provided fail: msg: "Set kubeadm_cp_join_cmd variable (string)" when: kubeadm_cp_join_cmd is not defined - name: Join node as control-plane command: "{{ kubeadm_cp_join_cmd }}" args: creates: /etc/kubernetes/kubelet.conf
=== ./ansible/roles/kubernetes/tasks/main.yml === - name: Install Kubernetes apt key shell: curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg args: { creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg } - name: Add Kubernetes repo apt_repository: repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /" state: present - name: Install kubeadm, kubelet, kubectl apt: name: [kubeadm, kubelet, kubectl] state: present update_cache: yes - name: Hold kube packages command: apt-mark hold kubeadm kubelet kubectl
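# Sketch (assumption, not in the original role): the install task above pulls the
# newest 1.30.x packages even though group_vars pins k8s_version to v1.30.3. If the
# nodes should stay on that exact release, the apt module accepts version wildcards:
# - name: Install kubeadm, kubelet, kubectl (pinned to k8s_version)
#   apt:
#     name:
#       - "kubeadm={{ k8s_version | regex_replace('^v', '') }}-*"
#       - "kubelet={{ k8s_version | regex_replace('^v', '') }}-*"
#       - "kubectl={{ k8s_version | regex_replace('^v', '') }}-*"
#     state: present
#     update_cache: yes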
=== ./ansible/roles/labels/tasks/main.yml === - name: Label hetzner-2 for stateful command: kubectl label node hetzner-2 {{ stateful_node_label_key }}={{ stateful_node_label_val }} --overwrite delegate_to: "{{ groups['k8s_control_plane'][0] }}" run_once: true
=== ./ansible/roles/storage_local_path/tasks/main.yml === - name: Ensure local path dir file: path: "{{ local_path_dir }}" state: directory mode: '0777' - name: StorageClass local-ssd-hetzner copy: dest: /root/local-sc.yaml content: | apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: name: {{ local_sc_name }} provisioner: kubernetes.io/no-provisioner volumeBindingMode: WaitForFirstConsumer when: inventory_hostname in groups['k8s_control_plane'] - name: Apply SC command: kubectl apply -f /root/local-sc.yaml environment: KUBECONFIG: /etc/kubernetes/admin.conf when: inventory_hostname in groups['k8s_control_plane'] - name: Create local-path directory file: path: /mnt/local-ssd state: directory mode: '0755' - name: Create subdirectories for each PV file: path: "/mnt/local-ssd/{{ item }}" state: directory mode: '0755' loop: - postgres - prometheus - elasticsearch - grafana - name: Copy PV manifest template: src: local-ssd-pv.yaml dest: /tmp/local-ssd-pv.yaml - name: Apply PV command: kubectl apply -f /tmp/local-ssd-pv.yaml run_once: true delegate_to: "{{ groups['k8s_control_plane'][0] }}"
=== ./ansible/roles/storage_local_path/templates/local-ssd-pv.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: local-ssd-postgres spec: capacity: storage: 100Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/postgres nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: local-ssd-prometheus spec: capacity: storage: 100Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/prometheus nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: local-ssd-elasticsearch spec: capacity: storage: 300Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/elasticsearch nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2
=== ./ansible/roles/wireguard/tasks/main.yml === - name: Install wireguard apt: name: [wireguard, qrencode] state: present update_cache: yes - name: Ensure key dir file: { path: /etc/wireguard/keys, state: directory, mode: '0700' } - name: Generate private key if missing shell: "[ -f /etc/wireguard/keys/privatekey ] || (umask 077 && wg genkey > /etc/wireguard/keys/privatekey)" args: { creates: /etc/wireguard/keys/privatekey } - name: Generate public key shell: "wg pubkey < /etc/wireguard/keys/privatekey > /etc/wireguard/keys/publickey" args: { creates: /etc/wireguard/keys/publickey } - name: Read pubkey slurp: { src: /etc/wireguard/keys/publickey } register: pubkey_raw - name: Read private key slurp: { src: /etc/wireguard/keys/privatekey } register: privkey_raw - set_fact: wg_public_key: "{{ pubkey_raw.content | b64decode | trim }}" wg_private_key: "{{ privkey_raw.content | b64decode | trim }}" - name: Gather facts from all hosts setup: delegate_to: "{{ item }}" delegate_facts: true loop: "{{ groups['k8s_nodes'] }}" run_once: true - name: Pretty print hostvars debug: msg: "{{ hostvars['hetzner-1']['wg_public_key'] }}" - name: Render config template: src: wg0.conf.j2 dest: /etc/wireguard/wg0.conf mode: '0600' - name: Enable IP forward sysctl: name: net.ipv4.ip_forward value: "1" sysctl_set: yes state: present reload: yes - name: Enable wg-quick service: name: wg-quick@wg0 enabled: yes state: started - name: Capture wireguard status command: wg show register: wg_show changed_when: false - debug: var: wg_show.stdout
=== ./ansible/roles/wireguard/vars/main.yml === wg_interface: wg0 wg_port: 51820 wg_cidr: 10.66.0.0/24 wg_nodes: hetzner-1: { address: 10.66.0.11, public_ip: "95.217.89.53" } hetzner-2: { address: 10.66.0.12, public_ip: "138.201.254.97" }
=== ./DNS_RECORDS.txt === apps.betelgeusebytes.io. 300 IN A 95.217.89.53 apps.betelgeusebytes.io. 300 IN A 138.201.254.97 gitea.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. kibana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. grafana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. prometheus.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. notebook.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. broker.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. neo4j.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io. otlp.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
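# Note (assumption): the Ingress manifests below also use argo., n8n., label.,
# minio. and mlflow.betelgeusebytes.io, which are not listed above. If those apps
# are meant to resolve the same way as the others, the matching records would be:
# argo.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
# n8n.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
# label.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
# minio.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
# mlflow.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.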
=== ./k8s/00-namespaces.yaml === apiVersion: v1 kind: Namespace metadata: { name: db } --- apiVersion: v1 kind: Namespace metadata: { name: scm } --- apiVersion: v1 kind: Namespace metadata: { name: ml } --- apiVersion: v1 kind: Namespace metadata: { name: monitoring } --- apiVersion: v1 kind: Namespace metadata: { name: elastic } --- apiVersion: v1 kind: Namespace metadata: { name: broker } --- apiVersion: v1 kind: Namespace metadata: { name: graph } --- apiVersion: v1 kind: Namespace metadata: { name: observability }
=== ./k8s/01-secrets/basic-auth.yaml === # Replace each 'auth' line with a real htpasswd pair: # htpasswd -nbBC 10 admin 'Str0ngP@ss' (copy 'admin:...' to value below) apiVersion: v1 kind: Secret metadata: { name: basic-auth-kibana, namespace: elastic } type: Opaque stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } --- apiVersion: v1 kind: Secret metadata: { name: basic-auth-grafana, namespace: monitoring } type: Opaque stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } --- apiVersion: v1 kind: Secret metadata: { name: basic-auth-prometheus, namespace: monitoring } type: Opaque stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } --- apiVersion: v1 kind: Secret metadata: { name: basic-auth-notebook, namespace: ml } type: Opaque stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } --- apiVersion: v1 kind: Secret metadata: { name: basic-auth-broker, namespace: broker } type: Opaque stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" } --- apiVersion: v1 kind: Secret metadata: { name: basic-auth-neo4j, namespace: graph } type: Opaque stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
=== ./k8s/argoflow/argo.yaml === apiVersion: v1 kind: Secret metadata: name: argo-artifacts namespace: ml type: Opaque stringData: accesskey: "minioadmin" # <-- change secretkey: "minioadmin" # <-- change --- apiVersion: v1 kind: ConfigMap metadata: name: workflow-controller-configmap namespace: ml data: config: | artifactRepository: s3: bucket: argo-artifacts endpoint: minio.betelgeusebytes.io # no scheme here insecure: false # https via Ingress accessKeySecret: name: argo-artifacts key: accesskey secretKeySecret: name: argo-artifacts key: secretkey keyFormat: "{{workflow.namespace}}/{{workflow.name}}/{{pod.name}}" --- # k8s/argo/workflows/ns-rbac.yaml apiVersion: v1 kind: ServiceAccount metadata: name: argo-server namespace: ml --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: argo-namespaced namespace: ml rules: - apiGroups: [""] resources: ["pods","pods/log","secrets","configmaps","events","persistentvolumeclaims","serviceaccounts"] verbs: ["get","list","watch","create","delete","patch","update"] - apiGroups: ["coordination.k8s.io"] resources: ["leases"] verbs: ["get","list","watch","create","delete","patch","update"] - apiGroups: ["argoproj.io"] resources:
["workflows","workflowtemplates","cronworkflows","workfloweventbindings","sensors","eventsources","workflowtasksets","workflowartifactgctasks","workflowtaskresults"] verbs: ["get","list","watch","create","delete","patch","update"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: argo-namespaced-binding namespace: ml subjects: - kind: ServiceAccount name: argo-server namespace: ml roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: argo-namespaced --- # k8s/argo/workflows/controller.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: workflow-controller, namespace: ml } spec: replicas: 1 selector: { matchLabels: { app: workflow-controller } } template: metadata: { labels: { app: workflow-controller } } spec: serviceAccountName: argo-server containers: - name: controller image: quay.io/argoproj/workflow-controller:latest args: ["--namespaced"] env: - name: LEADER_ELECTION_IDENTITY valueFrom: fieldRef: fieldPath: metadata.name ports: [{ containerPort: 9090 }] readinessProbe: httpGet: { path: /metrics, port: 9090, scheme: HTTPS } initialDelaySeconds: 5 periodSeconds: 10 livenessProbe: httpGet: { path: /metrics, port: 9090, scheme: HTTPS } initialDelaySeconds: 20 periodSeconds: 20 --- # k8s/argo/workflows/server.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: argo-server, namespace: ml } spec: replicas: 1 selector: { matchLabels: { app: argo-server } } template: metadata: { labels: { app: argo-server } } spec: serviceAccountName: argo-server containers: - name: server image: quay.io/argoproj/argocli:latest args: ["server","--auth-mode","server","--namespaced","--secure=false"] ports: [{ containerPort: 2746 }] readinessProbe: httpGet: { path: /, port: 2746, scheme: HTTP } initialDelaySeconds: 5 periodSeconds: 10 livenessProbe: httpGet: { path: /, port: 2746, scheme: HTTP } initialDelaySeconds: 20 periodSeconds: 20 --- apiVersion: v1 kind: Service metadata: { name: argo-server, namespace: ml } spec: { selector: { app: argo-server }, ports: [ { port: 80, targetPort: 2746 } ] } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: argo namespace: ml annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["argo.betelgeusebytes.io"], secretName: argo-tls }] rules: - host: argo.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: argo-server, port: { number: 80 } } } === ./k8s/automation/n8n.yaml === apiVersion: v1 kind: Namespace metadata: name: automation labels: name: automation --- apiVersion: v1 kind: PersistentVolume metadata: name: n8n-pv labels: app: n8n spec: capacity: storage: 20Gi volumeMode: Filesystem accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd local: path: /mnt/local-ssd/n8n nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: n8n-data namespace: automation labels: app: n8n spec: accessModes: - ReadWriteOnce storageClassName: local-ssd resources: requests: storage: 20Gi selector: matchLabels: app: n8n --- apiVersion: v1 kind: Secret metadata: name: n8n-secrets namespace: automation type: Opaque stringData: # Generate a strong encryption key with: openssl rand -base64 32 N8N_ENCRYPTION_KEY: "G/US0ePajEpWwRUjlchyOs6+6I/AT+0bisXmE2fugSU=" # Optional: Database connection if using PostgreSQL DB_TYPE: "postgresdb" DB_POSTGRESDB_HOST: 
"pg.betelgeusebytes.io" DB_POSTGRESDB_PORT: "5432" DB_POSTGRESDB_DATABASE: "n8n" DB_POSTGRESDB_USER: "app" DB_POSTGRESDB_PASSWORD: "pa$$word" --- apiVersion: apps/v1 kind: StatefulSet metadata: name: n8n namespace: automation spec: serviceName: n8n replicas: 1 selector: matchLabels: app: n8n template: metadata: labels: app: n8n spec: nodeSelector: kubernetes.io/hostname: hetzner-2 containers: - name: n8n image: n8nio/n8n:latest ports: - containerPort: 5678 name: http env: - name: N8N_HOST value: "n8n.betelgeusebytes.io" - name: N8N_PORT value: "5678" - name: N8N_PROTOCOL value: "https" - name: WEBHOOK_URL value: "https://n8n.betelgeusebytes.io/" - name: GENERIC_TIMEZONE value: "UTC" - name: N8N_ENCRYPTION_KEY valueFrom: secretKeyRef: name: n8n-secrets key: N8N_ENCRYPTION_KEY # Uncomment if using PostgreSQL - name: DB_TYPE valueFrom: secretKeyRef: name: n8n-secrets key: DB_TYPE - name: DB_POSTGRESDB_HOST valueFrom: secretKeyRef: name: n8n-secrets key: DB_POSTGRESDB_HOST - name: DB_POSTGRESDB_PORT valueFrom: secretKeyRef: name: n8n-secrets key: DB_POSTGRESDB_PORT - name: DB_POSTGRESDB_DATABASE valueFrom: secretKeyRef: name: n8n-secrets key: DB_POSTGRESDB_DATABASE - name: DB_POSTGRESDB_USER valueFrom: secretKeyRef: name: n8n-secrets key: DB_POSTGRESDB_USER - name: DB_POSTGRESDB_PASSWORD valueFrom: secretKeyRef: name: n8n-secrets key: DB_POSTGRESDB_PASSWORD volumeMounts: - name: n8n-data mountPath: /home/node/.n8n resources: requests: memory: "512Mi" cpu: "250m" limits: memory: "2Gi" cpu: "1000m" livenessProbe: httpGet: path: /healthz port: 5678 initialDelaySeconds: 60 periodSeconds: 30 timeoutSeconds: 10 failureThreshold: 5 readinessProbe: httpGet: path: /healthz port: 5678 initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 3 volumes: - name: n8n-data persistentVolumeClaim: claimName: n8n-data --- apiVersion: v1 kind: Service metadata: name: n8n namespace: automation labels: app: n8n spec: type: ClusterIP ports: - port: 5678 targetPort: 5678 protocol: TCP name: http selector: app: n8n --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: n8n namespace: automation annotations: cert-manager.io/cluster-issuer: "letsencrypt-prod" # nginx.ingress.kubernetes.io/proxy-body-size: "50m" # nginx.ingress.kubernetes.io/proxy-read-timeout: "300" # nginx.ingress.kubernetes.io/proxy-send-timeout: "300" # Uncomment below if you want basic auth protection in addition to n8n's auth # nginx.ingress.kubernetes.io/auth-type: basic # nginx.ingress.kubernetes.io/auth-secret: n8n-basic-auth # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' spec: ingressClassName: nginx tls: - hosts: - n8n.betelgeusebytes.io secretName: wildcard-betelgeusebytes-tls rules: - host: n8n.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: service: name: n8n port: number: 5678 === ./k8s/cert-manager/cluster-issuer.yaml === apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: { name: letsencrypt-prod } spec: acme: email: angal.salah@gmail.com server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: { name: letsencrypt-prod-key } solvers: - http01: { ingress: { class: nginx } } === ./k8s/elastic/elastic-pv.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: pv-elasticsearch spec: capacity: storage: 80Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/elasticsearch nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: 
kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/elastic/elasticsearch.yaml === apiVersion: v1 kind: Service metadata: { name: elasticsearch, namespace: elastic } spec: ports: - { name: http, port: 9200, targetPort: 9200 } - { name: transport, port: 9300, targetPort: 9300 } selector: { app: elasticsearch } --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: elasticsearch, namespace: elastic } spec: serviceName: elasticsearch replicas: 1 selector: { matchLabels: { app: elasticsearch } } template: metadata: { labels: { app: elasticsearch } } spec: nodeSelector: { node: hetzner-2 } containers: - name: es image: docker.elastic.co/elasticsearch/elasticsearch:8.14.0 env: - { name: discovery.type, value: single-node } - { name: xpack.security.enabled, value: "false" } - { name: ES_JAVA_OPTS, value: "-Xms2g -Xmx2g" } ports: - { containerPort: 9200 } - { containerPort: 9300 } volumeMounts: - { name: data, mountPath: /usr/share/elasticsearch/data } volumeClaimTemplates: - metadata: { name: data } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 80Gi } } === ./k8s/elastic/kibana.yaml === apiVersion: v1 kind: Service metadata: { name: kibana, namespace: elastic } spec: ports: [{ port: 5601, targetPort: 5601 }] selector: { app: kibana } --- apiVersion: apps/v1 kind: Deployment metadata: { name: kibana, namespace: elastic } spec: replicas: 1 selector: { matchLabels: { app: kibana } } template: metadata: { labels: { app: kibana } } spec: nodeSelector: { node: hetzner-2 } containers: - name: kibana image: docker.elastic.co/kibana/kibana:8.14.0 env: - { name: ELASTICSEARCH_HOSTS, value: "http://elasticsearch.elastic.svc.cluster.local:9200" } ports: [{ containerPort: 5601 }] --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: kibana namespace: elastic annotations: cert-manager.io/cluster-issuer: letsencrypt-prod # nginx.ingress.kubernetes.io/auth-type: basic # nginx.ingress.kubernetes.io/auth-secret: basic-auth-kibana # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" spec: ingressClassName: nginx tls: [{ hosts: ["kibana.betelgeusebytes.io"], secretName: kibana-tls }] rules: - host: kibana.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: kibana, port: { number: 5601 } } } === ./k8s/gitea/gitea-pv.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: pv-gitea spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/gitea nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/gitea/gitea.yaml === apiVersion: v1 kind: Service metadata: { name: gitea, namespace: scm } spec: ports: [{ port: 80, targetPort: 3000 }] selector: { app: gitea } --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: gitea, namespace: scm } spec: serviceName: gitea replicas: 1 selector: { matchLabels: { app: gitea } } template: metadata: { labels: { app: gitea } } spec: nodeSelector: { node: hetzner-2 } containers: - name: gitea image: gitea/gitea:1.21.11 env: - { name: GITEA__server__ROOT_URL, value: "https://gitea.betelgeusebytes.io" } - { name: GITEA__database__DB_TYPE, value: "postgres" } - { name: GITEA__database__HOST, value: "postgres.db.svc.cluster.local:5432" } - { name: GITEA__database__NAME, value: "gitea" } - { name: GITEA__database__USER, value: "app" } - { name: 
GITEA__database__PASSWD, value: "pa$$word" } ports: [{ containerPort: 3000 }] volumeMounts: - { name: data, mountPath: /data } volumeClaimTemplates: - metadata: { name: data } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 50Gi } } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: gitea namespace: scm annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["gitea.betelgeusebytes.io"], secretName: gitea-tls }] rules: - host: gitea.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: gitea, port: { number: 80 } } } === ./k8s/grafana/grafana.yaml === apiVersion: v1 kind: Service metadata: { name: grafana, namespace: monitoring } spec: ports: [{ port: 80, targetPort: 3000 }] selector: { app: grafana } --- apiVersion: apps/v1 kind: Deployment metadata: { name: grafana, namespace: monitoring } spec: replicas: 1 selector: { matchLabels: { app: grafana } } template: metadata: { labels: { app: grafana } } spec: nodeSelector: { node: hetzner-2 } containers: - name: grafana image: grafana/grafana:10.4.3 env: - { name: GF_SECURITY_ADMIN_USER, value: admin } - { name: GF_SECURITY_ADMIN_PASSWORD, value: "ADMINclaude-GRAFANA" } ports: [{ containerPort: 3000 }] --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: grafana namespace: monitoring annotations: cert-manager.io/cluster-issuer: letsencrypt-prod nginx.ingress.kubernetes.io/auth-type: basic nginx.ingress.kubernetes.io/auth-secret: basic-auth-grafana nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" spec: ingressClassName: nginx tls: [{ hosts: ["grafana.betelgeusebytes.io"], secretName: grafana-tls }] rules: - host: grafana.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: grafana, port: { number: 80 } } } === ./k8s/ingress-patch/kustomization.yaml === apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: ingress-nginx # Create the tcp-services ConfigMap from *quoted* literals configMapGenerator: - name: tcp-services literals: - "5432=db/postgres:5432" - "7687=graph/neo4j:7687" generatorOptions: disableNameSuffixHash: true # Inline JSON6902 patches patches: # 1) Add controller arg for tcp-services - target: group: apps version: v1 kind: Deployment name: ingress-nginx-controller namespace: ingress-nginx patch: |- - op: add path: /spec/template/spec/containers/0/args/- value: --tcp-services-configmap=$(POD_NAMESPACE)/tcp-services # 2) Expose Service ports 5432 and 7687 (keeps 80/443) - target: version: v1 kind: Service name: ingress-nginx-controller namespace: ingress-nginx patch: |- - op: add path: /spec/ports/- value: name: tcp-5432 port: 5432 protocol: TCP targetPort: 5432 - op: add path: /spec/ports/- value: name: tcp-7687 port: 7687 protocol: TCP targetPort: 7687 === ./k8s/jupyter/jupyter.yaml === apiVersion: v1 kind: Service metadata: { name: notebook, namespace: ml } spec: selector: { app: jupyterlab } ports: [{ port: 80, targetPort: 8888 }] --- apiVersion: apps/v1 kind: Deployment metadata: { name: jupyterlab, namespace: ml } spec: replicas: 1 selector: { matchLabels: { app: jupyterlab } } template: metadata: { labels: { app: jupyterlab } } spec: securityContext: runAsUser: 1000 fsGroup: 100 nodeSelector: { node: hetzner-2 } containers: - name: jupyter image: jupyter/base-notebook:latest args: ["start-notebook.sh", "--NotebookApp.token=$(PASSWORD)"] env: - name: PASSWORD valueFrom: { 
secretKeyRef: { name: jupyter-auth, key: PASSWORD } } ports: [{ containerPort: 8888 }] volumeMounts: - { name: work, mountPath: /home/jovyan/work } volumes: - name: work persistentVolumeClaim: { claimName: jupyter-pvc } --- apiVersion: v1 kind: PersistentVolumeClaim metadata: { name: jupyter-pvc, namespace: ml } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 20Gi } } --- apiVersion: v1 kind: Secret metadata: { name: jupyter-auth, namespace: ml } type: Opaque stringData: { PASSWORD: "notebook" } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: notebook namespace: ml annotations: cert-manager.io/cluster-issuer: letsencrypt-prod # nginx.ingress.kubernetes.io/auth-type: basic # nginx.ingress.kubernetes.io/auth-secret: basic-auth-notebook # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" spec: ingressClassName: nginx tls: [{ hosts: ["notebook.betelgeusebytes.io"], secretName: notebook-tls }] rules: - host: notebook.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: notebook, port: { number: 80 } } } === ./k8s/kafka/kafka-pv.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: pv-kafka spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/kafka nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-zookeeper-data spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/zookeeper-data nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-zookeeper-log spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/zookeeper-log nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/kafka/kafka-ui.yaml === apiVersion: v1 kind: Service metadata: { name: kafka-ui, namespace: broker } spec: ports: [{ port: 80, targetPort: 8080 }] selector: { app: kafka-ui } --- apiVersion: apps/v1 kind: Deployment metadata: { name: kafka-ui, namespace: broker } spec: replicas: 1 selector: { matchLabels: { app: kafka-ui } } template: metadata: { labels: { app: kafka-ui } } spec: containers: - name: ui image: provectuslabs/kafka-ui:latest env: - { name: KAFKA_CLUSTERS_0_NAME, value: "local" } - { name: KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS, value: "kafka.broker.svc.cluster.local:9092" } ports: [{ containerPort: 8080 }] --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: kafka-ui namespace: broker annotations: cert-manager.io/cluster-issuer: letsencrypt-prod # nginx.ingress.kubernetes.io/auth-type: basic # nginx.ingress.kubernetes.io/auth-secret: basic-auth-broker # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" spec: ingressClassName: nginx tls: [{ hosts: ["broker.betelgeusebytes.io"], secretName: broker-tls }] rules: - host: broker.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: kafka-ui, port: { number: 80 } } } === ./k8s/kafka/kafka.yaml === apiVersion: v1 kind: 
Service metadata: { name: kafka, namespace: broker } spec: ports: [{ name: kafka, port: 9092, targetPort: 9092 }] selector: { app: kafka } --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: kafka, namespace: broker } spec: serviceName: kafka replicas: 1 selector: { matchLabels: { app: kafka } } template: metadata: { labels: { app: kafka } } spec: nodeSelector: { node: hetzner-2 } containers: - name: kafka image: apache/kafka:latest env: - { name: KAFKA_NODE_ID, value: "1" } - { name: KAFKA_PROCESS_ROLES, value: "broker,controller" } - { name: KAFKA_LISTENERS, value: "PLAINTEXT://:9092,CONTROLLER://:9093" } - { name: KAFKA_ADVERTISED_LISTENERS, value: "PLAINTEXT://kafka.broker.svc.cluster.local:9092" } - { name: KAFKA_CONTROLLER_LISTENER_NAMES, value: "CONTROLLER" } - { name: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP, value: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" } - { name: KAFKA_CONTROLLER_QUORUM_VOTERS, value: "1@localhost:9093" } - { name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR, value: "1" } - { name: KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR, value: "1" } - { name: KAFKA_TRANSACTION_STATE_LOG_MIN_ISR, value: "1" } - { name: KAFKA_LOG_DIRS, value: "/var/lib/kafka/data" } - { name: CLUSTER_ID, value: "MkU3OEVBNTcwNTJENDM2Qk" } ports: - { containerPort: 9092 } - { containerPort: 9093 } volumeMounts: - { name: data, mountPath: /var/lib/kafka/data } volumeClaimTemplates: - metadata: { name: data } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 50Gi } } === ./k8s/label_studio/label.yaml === # k8s/ai/label-studio/secret-pg.yaml apiVersion: v1 kind: Secret metadata: { name: labelstudio-pg, namespace: ml } type: Opaque stringData: { POSTGRES_PASSWORD: "admin" } --- # k8s/ai/label-studio/secret-minio.yaml apiVersion: v1 kind: Secret metadata: { name: minio-label, namespace: ml } type: Opaque stringData: accesskey: "minioadmin" secretkey: "minioadmin" --- # k8s/ai/label-studio/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: label-studio, namespace: ml } spec: replicas: 1 selector: { matchLabels: { app: label-studio } } template: metadata: { labels: { app: label-studio } } spec: containers: - name: app image: heartexlabs/label-studio:latest env: - { name: POSTGRE_NAME, value: "labelstudio" } - { name: POSTGRE_USER, value: "admin" } - name: POSTGRE_PASSWORD valueFrom: { secretKeyRef: { name: labelstudio-pg, key: POSTGRES_PASSWORD } } - { name: POSTGRE_HOST, value: "postgres.db.svc.cluster.local" } - { name: POSTGRE_PORT, value: "5432" } - { name: S3_ENDPOINT, value: "https://minio.betelgeusebytes.io" } - name: AWS_ACCESS_KEY_ID valueFrom: { secretKeyRef: { name: minio-label, key: accesskey } } - name: AWS_SECRET_ACCESS_KEY valueFrom: { secretKeyRef: { name: minio-label, key: secretkey } } - name: ALLOWED_HOSTS value: "label.betelgeusebytes.io" - name: CSRF_TRUSTED_ORIGINS value: "https://label.betelgeusebytes.io" - name: CSRF_COOKIE_SECURE value: "1" - name: SESSION_COOKIE_SECURE value: "1" ports: [{ containerPort: 8080 }] --- apiVersion: v1 kind: Service metadata: { name: label-studio, namespace: ml } spec: { selector: { app: label-studio }, ports: [ { port: 80, targetPort: 8080 } ] } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: label-studio namespace: ml annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["label.betelgeusebytes.io"], secretName: label-tls }] rules: - host: label.betelgeusebytes.io http: paths: - path: / 
pathType: Prefix backend: { service: { name: label-studio, port: { number: 80 } } } === ./k8s/minio/minio.yaml === apiVersion: v1 kind: Namespace metadata: { name: storage } --- # k8s/storage/minio/secret.yaml apiVersion: v1 kind: Secret metadata: { name: minio-root, namespace: storage } type: Opaque stringData: MINIO_ROOT_USER: "minioadmin" MINIO_ROOT_PASSWORD: "minioadmin" --- # k8s/storage/minio/pvc.yaml apiVersion: v1 kind: PersistentVolumeClaim metadata: { name: minio-data, namespace: storage } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 20Gi } } --- # k8s/storage/minio/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: minio, namespace: storage } spec: replicas: 1 selector: { matchLabels: { app: minio } } template: metadata: { labels: { app: minio } } spec: containers: - name: minio image: minio/minio:latest args: ["server","/data","--console-address",":9001"] envFrom: [{ secretRef: { name: minio-root } }] ports: - { containerPort: 9000 } # S3 - { containerPort: 9001 } # Console volumeMounts: - { name: data, mountPath: /data } volumes: - name: data persistentVolumeClaim: { claimName: minio-data } --- apiVersion: v1 kind: Service metadata: { name: minio, namespace: storage } spec: selector: { app: minio } ports: - { name: s3, port: 9000, targetPort: 9000 } - { name: console, port: 9001, targetPort: 9001 } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: minio namespace: storage annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["minio.betelgeusebytes.io"], secretName: minio-tls }] rules: - host: minio.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: minio, port: { number: 9001 } } } --- # PV apiVersion: v1 kind: PersistentVolume metadata: name: pv-minio spec: capacity: storage: 20Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/minio nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/mlflow/mlflow.yaml === # k8s/mlops/mlflow/secret-pg.yaml apiVersion: v1 kind: Secret metadata: { name: mlflow-pg, namespace: ml } type: Opaque stringData: { POSTGRES_PASSWORD: "pa$$word" } --- # k8s/mlops/mlflow/secret-minio.yaml apiVersion: v1 kind: Secret metadata: { name: mlflow-minio, namespace: ml } type: Opaque stringData: accesskey: "minioadmin" secretkey: "minioadmin" --- # k8s/mlops/mlflow/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: mlflow, namespace: ml } spec: replicas: 1 selector: { matchLabels: { app: mlflow } } template: metadata: { labels: { app: mlflow } } spec: containers: - name: mlflow # image: ghcr.io/mlflow/mlflow:v3.6.0 image: axxs/mlflow-pg env: - { name: MLFLOW_BACKEND_STORE_URI, value: "postgresql://admin:admin@postgres.db.svc.cluster.local:5432/mlflow" } - { name: POSTGRES_PASSWORD, valueFrom: { secretKeyRef: { name: mlflow-pg, key: POSTGRES_PASSWORD } } } - { name: MLFLOW_S3_ENDPOINT_URL, value: "https://minio.betelgeusebytes.io" } - { name: AWS_ACCESS_KEY_ID, valueFrom: { secretKeyRef: { name: mlflow-minio, key: accesskey } } } - { name: AWS_SECRET_ACCESS_KEY, valueFrom: { secretKeyRef: { name: mlflow-minio, key: secretkey } } } args: ["mlflow","server","--host","0.0.0.0","--port","5000","--artifacts-destination","s3://mlflow", "--allowed-hosts", "*.betelgeusebytes.io"] ports: [{ containerPort: 
5000 }] --- apiVersion: v1 kind: Service metadata: { name: mlflow, namespace: ml } spec: { selector: { app: mlflow }, ports: [ { port: 80, targetPort: 5000 } ] } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: mlflow namespace: ml annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["mlflow.betelgeusebytes.io"], secretName: mlflow-tls }] rules: - host: mlflow.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: mlflow, port: { number: 80 } } } === ./k8s/neo4j/neo4j-pv.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: pv-neo4j spec: capacity: storage: 20Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/neo4j nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/neo4j/neo4j.yaml === apiVersion: v1 kind: Service metadata: { name: neo4j, namespace: graph } spec: selector: { app: neo4j } ports: - { name: http, port: 7474, targetPort: 7474 } - { name: bolt, port: 7687, targetPort: 7687 } --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: neo4j, namespace: graph } spec: serviceName: neo4j replicas: 1 selector: { matchLabels: { app: neo4j } } template: metadata: { labels: { app: neo4j } } spec: enableServiceLinks: false nodeSelector: { node: hetzner-2 } containers: - name: neo4j image: neo4j:5.20 env: - name: NEO4J_AUTH valueFrom: { secretKeyRef: { name: neo4j-auth, key: NEO4J_AUTH } } - name: NEO4J_dbms_ssl_policy_bolt_enabled value: "true" - name: NEO4J_dbms_ssl_policy_bolt_base__directory value: "/certs/bolt" - name: NEO4J_dbms_ssl_policy_bolt_private__key value: "tls.key" - name: NEO4J_dbms_ssl_policy_bolt_public__certificate value: "tls.crt" - name: NEO4J_dbms_connector_bolt_tls__level value: "REQUIRED" # Advertise public hostname so the Browser uses the external FQDN for Bolt - name: NEO4J_dbms_connector_bolt_advertised__address value: "neo4j.betelgeusebytes.io:7687" # also set a default advertised address (recommended) - name: NEO4J_dbms_default__advertised__address value: "neo4j.betelgeusebytes.io" ports: - { containerPort: 7474 } - { containerPort: 7687 } volumeMounts: - { name: data, mountPath: /data } - { name: bolt-certs, mountPath: /certs/bolt } volumes: - name: bolt-certs secret: secretName: neo4j-tls items: - key: tls.crt path: tls.crt - key: tls.key path: tls.key volumeClaimTemplates: - metadata: { name: data } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 20Gi } } --- apiVersion: v1 kind: Secret metadata: { name: neo4j-auth, namespace: graph } type: Opaque stringData: { NEO4J_AUTH: "neo4j/NEO4J-PASS" } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: neo4j-http namespace: graph annotations: cert-manager.io/cluster-issuer: letsencrypt-prod # nginx.ingress.kubernetes.io/auth-type: basic # nginx.ingress.kubernetes.io/auth-secret: basic-auth-neo4j # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" spec: ingressClassName: nginx tls: [{ hosts: ["neo4j.betelgeusebytes.io"], secretName: neo4j-tls }] rules: - host: neo4j.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: neo4j, port: { number: 7474 } } } # create or update the tcp-services configmap # kubectl -n ingress-nginx create configmap tcp-services \ # --from-literal="7687=graph/neo4j:7687" \ # -o yaml 
--dry-run=client | kubectl apply -f - # kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]' # kubectl -n ingress-nginx patch deployment ingress-nginx-controller \ --type='json' -p='[ # {"op":"add","path":"/spec/template/spec/containers/0/ports/-","value":{"name":"tcp-7687","containerPort":7687,"hostPort":7687,"protocol":"TCP"}} # ]'
=== ./k8s/observability/fluent-bit.yaml === apiVersion: v1 kind: ServiceAccount metadata: { name: fluent-bit, namespace: observability } --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: { name: fluent-bit-read } rules: - apiGroups: [""] resources: ["pods", "namespaces"] verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: { name: fluent-bit-read } roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: fluent-bit-read subjects: - kind: ServiceAccount name: fluent-bit namespace: observability --- apiVersion: apps/v1 kind: DaemonSet metadata: { name: fluent-bit, namespace: observability } spec: selector: { matchLabels: { app: fluent-bit } } template: metadata: { labels: { app: fluent-bit } } spec: serviceAccountName: fluent-bit containers: - name: fluent-bit image: cr.fluentbit.io/fluent/fluent-bit:2.2.2 volumeMounts: - { name: varlog, mountPath: /var/log } - { name: containers, mountPath: /var/lib/docker/containers, readOnly: true } env: - { name: FLUENT_ELASTICSEARCH_HOST, value: elasticsearch.elastic.svc.cluster.local } - { name: FLUENT_ELASTICSEARCH_PORT, value: "9200" } args: ["-i","tail","-p","path=/var/log/containers/*.log","-F","kubernetes","-o","es","-p","host=${FLUENT_ELASTICSEARCH_HOST}","-p","port=${FLUENT_ELASTICSEARCH_PORT}","-p","logstash_format=On","-p","logstash_prefix=k8s-logs"] volumes: - { name: varlog, hostPath: { path: /var/log } } - { name: containers, hostPath: { path: /var/lib/docker/containers, type: DirectoryOrCreate } }
=== ./k8s/observability-stack/00-namespace.yaml === apiVersion: v1 kind: Namespace metadata: name: observability labels: name: observability monitoring: "true"
=== ./k8s/observability-stack/01-persistent-volumes.yaml === --- # Prometheus PV apiVersion: v1 kind: PersistentVolume metadata: name: prometheus-data-pv spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-storage local: path: /mnt/local-ssd/prometheus nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- # Loki PV apiVersion: v1 kind: PersistentVolume metadata: name: loki-data-pv spec: capacity: storage: 100Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-storage local: path: /mnt/local-ssd/loki nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- # Tempo PV apiVersion: v1 kind: PersistentVolume metadata: name: tempo-data-pv spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-storage local: path: /mnt/local-ssd/tempo nodeAffinity: required: nodeSelectorTerms: -
matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- # Grafana PV apiVersion: v1 kind: PersistentVolume metadata: name: grafana-data-pv spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-storage local: path: /mnt/local-ssd/grafana nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/observability-stack/02-persistent-volume-claims.yaml === --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: prometheus-data namespace: observability spec: accessModes: - ReadWriteOnce storageClassName: local-storage resources: requests: storage: 50Gi --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: loki-data namespace: observability spec: accessModes: - ReadWriteOnce storageClassName: local-storage resources: requests: storage: 100Gi --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: tempo-data namespace: observability spec: accessModes: - ReadWriteOnce storageClassName: local-storage resources: requests: storage: 50Gi --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: grafana-data namespace: observability spec: accessModes: - ReadWriteOnce storageClassName: local-storage resources: requests: storage: 10Gi === ./k8s/observability-stack/03-prometheus-config.yaml === apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: observability data: prometheus.yml: | global: scrape_interval: 15s evaluation_interval: 15s external_labels: cluster: 'betelgeuse-k8s' environment: 'production' # Alerting configuration (optional - can add alertmanager later) alerting: alertmanagers: - static_configs: - targets: [] # Rule files rule_files: - /etc/prometheus/rules/*.yml scrape_configs: # Scrape Prometheus itself - job_name: 'prometheus' static_configs: - targets: ['localhost:9090'] # Kubernetes API server - job_name: 'kubernetes-apiservers' kubernetes_sd_configs: - role: endpoints scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https # Kubernetes nodes - job_name: 'kubernetes-nodes' kubernetes_sd_configs: - role: node scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics # Kubernetes nodes cadvisor - job_name: 'kubernetes-cadvisor' kubernetes_sd_configs: - role: node scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor # Kubernetes service endpoints - job_name: 'kubernetes-service-endpoints' kubernetes_sd_configs: - role: endpoints 
relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__ regex: (https?) - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace target_label: kubernetes_name - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name # Kubernetes pods - job_name: 'kubernetes-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name # kube-state-metrics - job_name: 'kube-state-metrics' static_configs: - targets: ['kube-state-metrics.observability.svc.cluster.local:8080'] # node-exporter - job_name: 'node-exporter' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_label_app] action: keep regex: node-exporter - source_labels: [__meta_kubernetes_pod_node_name] action: replace target_label: instance # Grafana Loki - job_name: 'loki' static_configs: - targets: ['loki.observability.svc.cluster.local:3100'] # Grafana Tempo - job_name: 'tempo' static_configs: - targets: ['tempo.observability.svc.cluster.local:3200'] # Grafana - job_name: 'grafana' static_configs: - targets: ['grafana.observability.svc.cluster.local:3000'] === ./k8s/observability-stack/04-loki-config.yaml === apiVersion: v1 kind: ConfigMap metadata: name: loki-config namespace: observability data: loki.yaml: | auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: path_prefix: /loki storage: filesystem: chunks_directory: /loki/chunks rules_directory: /loki/rules replication_factor: 1 ring: kvstore: store: inmemory schema_config: configs: - from: 2024-01-01 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /loki/tsdb-index cache_location: /loki/tsdb-cache filesystem: directory: /loki/chunks compactor: working_directory: /loki/compactor compaction_interval: 10m retention_enabled: false limits_config: reject_old_samples: true reject_old_samples_max_age: 168h # 7 days retention_period: 168h # 7 days max_query_length: 721h # 30 days for queries max_query_parallelism: 32 max_streams_per_user: 0 max_global_streams_per_user: 0 ingestion_rate_mb: 50 ingestion_burst_size_mb: 100 per_stream_rate_limit: 10MB per_stream_rate_limit_burst: 20MB 
split_queries_by_interval: 15m query_range: align_queries_with_step: true cache_results: true results_cache: cache: embedded_cache: enabled: true max_size_mb: 500 frontend: log_queries_longer_than: 5s compress_responses: true query_scheduler: max_outstanding_requests_per_tenant: 2048 ingester: chunk_idle_period: 30m chunk_block_size: 262144 chunk_encoding: snappy chunk_retain_period: 1m max_chunk_age: 2h wal: enabled: true dir: /loki/wal flush_on_shutdown: true replay_memory_ceiling: 1GB analytics: reporting_enabled: false === ./k8s/observability-stack/05-tempo-config.yaml === apiVersion: v1 kind: ConfigMap metadata: name: tempo-config namespace: observability data: tempo.yaml: | server: http_listen_port: 3200 log_level: info distributor: receivers: jaeger: protocols: thrift_http: endpoint: 0.0.0.0:14268 grpc: endpoint: 0.0.0.0:14250 zipkin: endpoint: 0.0.0.0:9411 otlp: protocols: http: endpoint: 0.0.0.0:4318 grpc: endpoint: 0.0.0.0:4317 ingester: max_block_duration: 5m compactor: compaction: block_retention: 168h # 7 days metrics_generator: registry: external_labels: source: tempo cluster: betelgeuse-k8s storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write send_exemplars: true storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks pool: max_workers: 100 queue_depth: 10000 # Single instance mode - no need for frontend/querier split query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 5s overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics] === ./k8s/observability-stack/06-alloy-config.yaml === apiVersion: v1 kind: ConfigMap metadata: name: alloy-config namespace: observability data: config.alloy: | // Logging configuration logging { level = "info" format = "logfmt" } // Discover Kubernetes pods for log collection discovery.kubernetes "pods" { role = "pod" } // Discover Kubernetes nodes discovery.kubernetes "nodes" { role = "node" } // Relabel pods for log collection discovery.relabel "pod_logs" { targets = discovery.kubernetes.pods.targets // Only scrape pods with logs rule { source_labels = ["__meta_kubernetes_pod_container_name"] action = "keep" regex = ".+" } // Set the log path rule { source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"] target_label = "__path__" separator = "/" replacement = "/var/log/pods/*$1/*.log" } // Set namespace label rule { source_labels = ["__meta_kubernetes_namespace"] target_label = "namespace" } // Set pod name label rule { source_labels = ["__meta_kubernetes_pod_name"] target_label = "pod" } // Set container name label rule { source_labels = ["__meta_kubernetes_pod_container_name"] target_label = "container" } // Set node name label rule { source_labels = ["__meta_kubernetes_pod_node_name"] target_label = "node" } // Copy all pod labels rule { action = "labelmap" regex = "__meta_kubernetes_pod_label_(.+)" } } // Read logs from discovered pods loki.source.kubernetes "pod_logs" { targets = discovery.relabel.pod_logs.output forward_to = [loki.process.pod_logs.receiver] } // Process and enrich logs loki.process "pod_logs" { forward_to = [loki.write.local.receiver] // Parse JSON logs stage.json { expressions = { level = "level", message = "message", timestamp = "timestamp", } } // Extract log level stage.labels { values = { level = "", } } // Add cluster label stage.static_labels { values = { cluster = "betelgeuse-k8s", } } } // Write logs 
to Loki loki.write "local" { endpoint { url = "http://loki.observability.svc.cluster.local:3100/loki/api/v1/push" } } // OpenTelemetry receiver for traces otelcol.receiver.otlp "default" { grpc { endpoint = "0.0.0.0:4317" } http { endpoint = "0.0.0.0:4318" } output { traces = [otelcol.exporter.otlp.tempo.input] } } // Export traces to Tempo otelcol.exporter.otlp "tempo" { client { endpoint = "tempo.observability.svc.cluster.local:4317" tls { insecure = true } } } // Scrape local metrics (Alloy's own metrics) // Prometheus will scrape these via service discovery prometheus.exporter.self "alloy" { } === ./k8s/observability-stack/07-grafana-datasources.yaml === apiVersion: v1 kind: ConfigMap metadata: name: grafana-datasources namespace: observability data: datasources.yaml: | apiVersion: 1 datasources: # Prometheus - name: Prometheus type: prometheus access: proxy url: http://prometheus.observability.svc.cluster.local:9090 isDefault: true editable: true jsonData: timeInterval: 15s queryTimeout: 60s httpMethod: POST # Loki - name: Loki type: loki access: proxy url: http://loki.observability.svc.cluster.local:3100 editable: true jsonData: maxLines: 1000 derivedFields: - datasourceUid: tempo matcherRegex: "traceID=(\\w+)" name: TraceID url: "$${__value.raw}" # Tempo - name: Tempo type: tempo access: proxy url: http://tempo.observability.svc.cluster.local:3200 editable: true uid: tempo jsonData: tracesToLogsV2: datasourceUid: loki spanStartTimeShift: -1h spanEndTimeShift: 1h filterByTraceID: true filterBySpanID: false customQuery: false tracesToMetrics: datasourceUid: prometheus spanStartTimeShift: -1h spanEndTimeShift: 1h serviceMap: datasourceUid: prometheus nodeGraph: enabled: true search: hide: false lokiSearch: datasourceUid: loki === ./k8s/observability-stack/08-rbac.yaml === --- # Prometheus ServiceAccount apiVersion: v1 kind: ServiceAccount metadata: name: prometheus namespace: observability --- # Prometheus ClusterRole apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: prometheus rules: - apiGroups: [""] resources: - nodes - nodes/proxy - services - endpoints - pods verbs: ["get", "list", "watch"] - apiGroups: - extensions resources: - ingresses verbs: ["get", "list", "watch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- # Prometheus ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: prometheus roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: prometheus subjects: - kind: ServiceAccount name: prometheus namespace: observability --- # Alloy ServiceAccount apiVersion: v1 kind: ServiceAccount metadata: name: alloy namespace: observability --- # Alloy ClusterRole apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: alloy rules: - apiGroups: [""] resources: - nodes - nodes/proxy - services - endpoints - pods - pods/log verbs: ["get", "list", "watch"] - apiGroups: - extensions resources: - ingresses verbs: ["get", "list", "watch"] --- # Alloy ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: alloy roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: alloy subjects: - kind: ServiceAccount name: alloy namespace: observability --- # kube-state-metrics ServiceAccount apiVersion: v1 kind: ServiceAccount metadata: name: kube-state-metrics namespace: observability --- # kube-state-metrics ClusterRole apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: kube-state-metrics rules: - apiGroups: 
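# Sanity check for the bindings in this file (example invocations; swap in the
# verbs/resources you actually care about):
#   kubectl auth can-i list pods --as=system:serviceaccount:observability:prometheus
#   kubectl auth can-i watch nodes --as=system:serviceaccount:observability:kube-state-metrics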
[""] resources: - configmaps - secrets - nodes - pods - services - resourcequotas - replicationcontrollers - limitranges - persistentvolumeclaims - persistentvolumes - namespaces - endpoints verbs: ["list", "watch"] - apiGroups: ["apps"] resources: - statefulsets - daemonsets - deployments - replicasets verbs: ["list", "watch"] - apiGroups: ["batch"] resources: - cronjobs - jobs verbs: ["list", "watch"] - apiGroups: ["autoscaling"] resources: - horizontalpodautoscalers verbs: ["list", "watch"] - apiGroups: ["policy"] resources: - poddisruptionbudgets verbs: ["list", "watch"] - apiGroups: ["certificates.k8s.io"] resources: - certificatesigningrequests verbs: ["list", "watch"] - apiGroups: ["storage.k8s.io"] resources: - storageclasses - volumeattachments verbs: ["list", "watch"] - apiGroups: ["admissionregistration.k8s.io"] resources: - mutatingwebhookconfigurations - validatingwebhookconfigurations verbs: ["list", "watch"] - apiGroups: ["networking.k8s.io"] resources: - networkpolicies - ingresses verbs: ["list", "watch"] - apiGroups: ["coordination.k8s.io"] resources: - leases verbs: ["list", "watch"] --- # kube-state-metrics ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: kube-state-metrics roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: kube-state-metrics subjects: - kind: ServiceAccount name: kube-state-metrics namespace: observability === ./k8s/observability-stack/10-prometheus.yaml === apiVersion: apps/v1 kind: StatefulSet metadata: name: prometheus namespace: observability labels: app: prometheus spec: serviceName: prometheus replicas: 1 selector: matchLabels: app: prometheus template: metadata: labels: app: prometheus annotations: prometheus.io/scrape: "true" prometheus.io/port: "9090" spec: serviceAccountName: prometheus nodeSelector: kubernetes.io/hostname: hetzner-2 containers: - name: prometheus image: prom/prometheus:v2.54.1 args: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' - '--storage.tsdb.retention.time=7d' - '--web.console.libraries=/usr/share/prometheus/console_libraries' - '--web.console.templates=/usr/share/prometheus/consoles' - '--web.enable-lifecycle' - '--web.enable-admin-api' ports: - name: http containerPort: 9090 protocol: TCP livenessProbe: httpGet: path: /-/healthy port: http initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 readinessProbe: httpGet: path: /-/ready port: http initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 resources: requests: cpu: 500m memory: 2Gi limits: cpu: 2000m memory: 4Gi volumeMounts: - name: prometheus-config mountPath: /etc/prometheus - name: prometheus-data mountPath: /prometheus volumes: - name: prometheus-config configMap: name: prometheus-config - name: prometheus-data persistentVolumeClaim: claimName: prometheus-data --- apiVersion: v1 kind: Service metadata: name: prometheus namespace: observability labels: app: prometheus spec: type: ClusterIP ports: - port: 9090 targetPort: http protocol: TCP name: http selector: app: prometheus === ./k8s/observability-stack/11-loki.yaml === apiVersion: apps/v1 kind: StatefulSet metadata: name: loki namespace: observability labels: app: loki spec: serviceName: loki replicas: 1 selector: matchLabels: app: loki template: metadata: labels: app: loki annotations: prometheus.io/scrape: "true" prometheus.io/port: "3100" spec: nodeSelector: kubernetes.io/hostname: hetzner-2 securityContext: fsGroup: 10001 runAsGroup: 10001 runAsNonRoot: true runAsUser: 10001 
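# 10001 is the uid/gid the grafana/loki image runs as, so fsGroup above makes
# the PVC mounted at /loki writable without an extra init container. A minimal
# readiness check once deployed (run from a workstation with kubectl access):
#   kubectl -n observability port-forward statefulset/loki 3100:3100 &
#   curl -s http://127.0.0.1:3100/ready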
containers: - name: loki image: grafana/loki:3.2.1 args: - '-config.file=/etc/loki/loki.yaml' - '-target=all' ports: - name: http containerPort: 3100 protocol: TCP - name: grpc containerPort: 9096 protocol: TCP livenessProbe: httpGet: path: /ready port: http initialDelaySeconds: 45 periodSeconds: 10 timeoutSeconds: 5 readinessProbe: httpGet: path: /ready port: http initialDelaySeconds: 45 periodSeconds: 10 timeoutSeconds: 5 resources: requests: cpu: 500m memory: 1Gi limits: cpu: 2000m memory: 2Gi volumeMounts: - name: loki-config mountPath: /etc/loki - name: loki-data mountPath: /loki volumes: - name: loki-config configMap: name: loki-config - name: loki-data persistentVolumeClaim: claimName: loki-data --- apiVersion: v1 kind: Service metadata: name: loki namespace: observability labels: app: loki spec: type: ClusterIP ports: - port: 3100 targetPort: http protocol: TCP name: http - port: 9096 targetPort: grpc protocol: TCP name: grpc selector: app: loki === ./k8s/observability-stack/12-tempo.yaml === apiVersion: apps/v1 kind: StatefulSet metadata: name: tempo namespace: observability labels: app: tempo spec: serviceName: tempo replicas: 1 selector: matchLabels: app: tempo template: metadata: labels: app: tempo annotations: prometheus.io/scrape: "true" prometheus.io/port: "3200" spec: nodeSelector: kubernetes.io/hostname: hetzner-2 securityContext: fsGroup: 10001 runAsGroup: 10001 runAsNonRoot: true runAsUser: 10001 containers: - name: tempo image: grafana/tempo:2.6.1 args: - '-config.file=/etc/tempo/tempo.yaml' ports: - name: http containerPort: 3200 protocol: TCP - name: otlp-grpc containerPort: 4317 protocol: TCP - name: otlp-http containerPort: 4318 protocol: TCP - name: jaeger-grpc containerPort: 14250 protocol: TCP - name: jaeger-http containerPort: 14268 protocol: TCP - name: zipkin containerPort: 9411 protocol: TCP livenessProbe: httpGet: path: /ready port: http initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 readinessProbe: httpGet: path: /ready port: http initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 resources: requests: cpu: 500m memory: 1Gi limits: cpu: 2000m memory: 2Gi volumeMounts: - name: tempo-config mountPath: /etc/tempo - name: tempo-data mountPath: /var/tempo volumes: - name: tempo-config configMap: name: tempo-config - name: tempo-data persistentVolumeClaim: claimName: tempo-data --- apiVersion: v1 kind: Service metadata: name: tempo namespace: observability labels: app: tempo spec: type: ClusterIP ports: - port: 3200 targetPort: http protocol: TCP name: http - port: 4317 targetPort: otlp-grpc protocol: TCP name: otlp-grpc - port: 4318 targetPort: otlp-http protocol: TCP name: otlp-http - port: 14250 targetPort: jaeger-grpc protocol: TCP name: jaeger-grpc - port: 14268 targetPort: jaeger-http protocol: TCP name: jaeger-http - port: 9411 targetPort: zipkin protocol: TCP name: zipkin selector: app: tempo === ./k8s/observability-stack/13-grafana.yaml === apiVersion: apps/v1 kind: StatefulSet metadata: name: grafana namespace: observability labels: app: grafana spec: serviceName: grafana replicas: 1 selector: matchLabels: app: grafana template: metadata: labels: app: grafana spec: nodeSelector: kubernetes.io/hostname: hetzner-2 securityContext: fsGroup: 472 runAsGroup: 472 runAsUser: 472 containers: - name: grafana image: grafana/grafana:11.4.0 ports: - name: http containerPort: 3000 protocol: TCP env: - name: GF_SECURITY_ADMIN_USER value: admin - name: GF_SECURITY_ADMIN_PASSWORD value: admin # Change this in production! 
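# Rather than a literal admin password, the value can be sourced from a Secret
# (sketch; assumes a Secret named grafana-admin with key admin-password exists):
# - name: GF_SECURITY_ADMIN_PASSWORD
#   valueFrom:
#     secretKeyRef:
#       name: grafana-admin
#       key: admin-password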
- name: GF_INSTALL_PLUGINS value: "" - name: GF_FEATURE_TOGGLES_ENABLE value: "traceqlEditor,correlations" - name: GF_AUTH_ANONYMOUS_ENABLED value: "false" - name: GF_ANALYTICS_REPORTING_ENABLED value: "false" - name: GF_ANALYTICS_CHECK_FOR_UPDATES value: "false" livenessProbe: httpGet: path: /api/health port: http initialDelaySeconds: 60 periodSeconds: 10 timeoutSeconds: 5 readinessProbe: httpGet: path: /api/health port: http initialDelaySeconds: 60 periodSeconds: 10 timeoutSeconds: 5 resources: requests: cpu: 250m memory: 512Mi limits: cpu: 1000m memory: 1Gi volumeMounts: - name: grafana-data mountPath: /var/lib/grafana - name: grafana-datasources mountPath: /etc/grafana/provisioning/datasources volumes: - name: grafana-data persistentVolumeClaim: claimName: grafana-data - name: grafana-datasources configMap: name: grafana-datasources --- apiVersion: v1 kind: Service metadata: name: grafana namespace: observability labels: app: grafana spec: type: ClusterIP ports: - port: 3000 targetPort: http protocol: TCP name: http selector: app: grafana === ./k8s/observability-stack/14-alloy.yaml === apiVersion: apps/v1 kind: DaemonSet metadata: name: alloy namespace: observability labels: app: alloy spec: selector: matchLabels: app: alloy template: metadata: labels: app: alloy spec: serviceAccountName: alloy hostNetwork: true hostPID: true dnsPolicy: ClusterFirstWithHostNet containers: - name: alloy image: grafana/alloy:v1.5.1 args: - run - /etc/alloy/config.alloy - --storage.path=/var/lib/alloy - --server.http.listen-addr=0.0.0.0:12345 ports: - name: http-metrics containerPort: 12345 protocol: TCP - name: otlp-grpc containerPort: 4317 protocol: TCP - name: otlp-http containerPort: 4318 protocol: TCP env: - name: HOSTNAME valueFrom: fieldRef: fieldPath: spec.nodeName securityContext: privileged: true runAsUser: 0 resources: requests: cpu: 100m memory: 256Mi limits: cpu: 500m memory: 512Mi volumeMounts: - name: config mountPath: /etc/alloy - name: varlog mountPath: /var/log readOnly: true - name: varlibdockercontainers mountPath: /var/lib/docker/containers readOnly: true - name: etcmachineid mountPath: /etc/machine-id readOnly: true tolerations: - effect: NoSchedule operator: Exists volumes: - name: config configMap: name: alloy-config - name: varlog hostPath: path: /var/log - name: varlibdockercontainers hostPath: path: /var/lib/docker/containers - name: etcmachineid hostPath: path: /etc/machine-id --- apiVersion: v1 kind: Service metadata: name: alloy namespace: observability labels: app: alloy spec: type: ClusterIP ports: - port: 12345 targetPort: http-metrics protocol: TCP name: http-metrics - port: 4317 targetPort: otlp-grpc protocol: TCP name: otlp-grpc - port: 4318 targetPort: otlp-http protocol: TCP name: otlp-http selector: app: alloy === ./k8s/observability-stack/15-kube-state-metrics.yaml === apiVersion: apps/v1 kind: Deployment metadata: name: kube-state-metrics namespace: observability labels: app: kube-state-metrics spec: replicas: 1 selector: matchLabels: app: kube-state-metrics template: metadata: labels: app: kube-state-metrics annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080" spec: serviceAccountName: kube-state-metrics containers: - name: kube-state-metrics image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - name: http-metrics containerPort: 8080 - name: telemetry containerPort: 8081 livenessProbe: httpGet: path: /healthz port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 readinessProbe: httpGet: path: / port: 8080 initialDelaySeconds: 
5 timeoutSeconds: 5 resources: requests: cpu: 100m memory: 128Mi limits: cpu: 200m memory: 256Mi --- apiVersion: v1 kind: Service metadata: name: kube-state-metrics namespace: observability labels: app: kube-state-metrics annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080" spec: type: ClusterIP ports: - name: http-metrics port: 8080 targetPort: http-metrics - name: telemetry port: 8081 targetPort: telemetry selector: app: kube-state-metrics === ./k8s/observability-stack/16-node-exporter.yaml === apiVersion: apps/v1 kind: DaemonSet metadata: name: node-exporter namespace: observability labels: app: node-exporter spec: selector: matchLabels: app: node-exporter template: metadata: labels: app: node-exporter annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" spec: hostNetwork: true hostPID: true containers: - name: node-exporter image: prom/node-exporter:v1.8.2 args: - --path.procfs=/host/proc - --path.sysfs=/host/sys - --path.rootfs=/host/root - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) ports: - name: metrics containerPort: 9100 protocol: TCP resources: requests: cpu: 100m memory: 128Mi limits: cpu: 200m memory: 256Mi volumeMounts: - name: proc mountPath: /host/proc readOnly: true - name: sys mountPath: /host/sys readOnly: true - name: root mountPath: /host/root mountPropagation: HostToContainer readOnly: true tolerations: - effect: NoSchedule operator: Exists volumes: - name: proc hostPath: path: /proc - name: sys hostPath: path: /sys - name: root hostPath: path: / --- apiVersion: v1 kind: Service metadata: name: node-exporter namespace: observability labels: app: node-exporter annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" spec: type: ClusterIP clusterIP: None ports: - name: metrics port: 9100 targetPort: metrics selector: app: node-exporter === ./k8s/observability-stack/20-grafana-ingress.yaml === apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: grafana-ingress namespace: observability annotations: cert-manager.io/cluster-issuer: "letsencrypt-prod" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" spec: ingressClassName: nginx tls: - hosts: - grafana.betelgeusebytes.io secretName: grafana-tls rules: - host: grafana.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: service: name: grafana port: number: 3000 === ./k8s/observability-stack/21-optional-ingresses.yaml === --- # Optional: Prometheus Ingress (for direct access to Prometheus UI) apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: prometheus-ingress namespace: observability annotations: cert-manager.io/cluster-issuer: "letsencrypt-prod" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" # Optional: Add basic auth for security # nginx.ingress.kubernetes.io/auth-type: basic # nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' spec: ingressClassName: nginx tls: - hosts: - prometheus.betelgeusebytes.io secretName: prometheus-tls rules: - host: prometheus.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: service: name: prometheus port: number: 9090 --- # Optional: Loki Ingress (for direct API access) apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: loki-ingress namespace: observability annotations: cert-manager.io/cluster-issuer: "letsencrypt-prod" nginx.ingress.kubernetes.io/ssl-redirect: 
"true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" spec: ingressClassName: nginx tls: - hosts: - loki.betelgeusebytes.io secretName: loki-tls rules: - host: loki.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: service: name: loki port: number: 3100 --- # Optional: Tempo Ingress (for direct API access) apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: tempo-ingress namespace: observability annotations: cert-manager.io/cluster-issuer: "letsencrypt-prod" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" spec: ingressClassName: nginx tls: - hosts: - tempo.betelgeusebytes.io secretName: tempo-tls rules: - host: tempo.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: service: name: tempo port: number: 3200 === ./k8s/observability-stack/demo-app.yaml === --- # Example instrumented application to test the observability stack # This is a simple Python Flask app with OpenTelemetry instrumentation apiVersion: v1 kind: ConfigMap metadata: name: demo-app namespace: observability data: app.py: | from flask import Flask, jsonify import logging import json import time import random # OpenTelemetry imports from opentelemetry import trace, metrics from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk.resources import Resource from prometheus_flask_exporter import PrometheusMetrics # Configure structured logging logging.basicConfig( level=logging.INFO, format='%(message)s' ) class JSONFormatter(logging.Formatter): def format(self, record): log_obj = { 'timestamp': self.formatTime(record, self.datefmt), 'level': record.levelname, 'message': record.getMessage(), 'logger': record.name, } if hasattr(record, 'trace_id'): log_obj['trace_id'] = record.trace_id log_obj['span_id'] = record.span_id return json.dumps(log_obj) handler = logging.StreamHandler() handler.setFormatter(JSONFormatter()) logger = logging.getLogger(__name__) logger.addHandler(handler) logger.setLevel(logging.INFO) # Configure OpenTelemetry resource = Resource.create({"service.name": "demo-app"}) # Tracing trace_provider = TracerProvider(resource=resource) trace_provider.add_span_processor( BatchSpanProcessor( OTLPSpanExporter( endpoint="http://tempo.observability.svc.cluster.local:4317", insecure=True ) ) ) trace.set_tracer_provider(trace_provider) tracer = trace.get_tracer(__name__) # Create Flask app app = Flask(__name__) # Prometheus metrics metrics = PrometheusMetrics(app) # Auto-instrument Flask FlaskInstrumentor().instrument_app(app) # Sample data ITEMS = ["apple", "banana", "orange", "grape", "mango"] @app.route('/') def index(): span = trace.get_current_span() trace_id = format(span.get_span_context().trace_id, '032x') logger.info("Index page accessed", extra={ 'trace_id': trace_id, 'endpoint': '/' }) return jsonify({ 'service': 'demo-app', 'status': 'healthy', 'trace_id': trace_id }) @app.route('/items') def get_items(): with tracer.start_as_current_span("fetch_items") as span: # Simulate database query time.sleep(random.uniform(0.01, 0.1)) span.set_attribute("items.count", len(ITEMS)) 
trace_id = format(span.get_span_context().trace_id, '032x') logger.info("Items fetched", extra={ 'trace_id': trace_id, 'count': len(ITEMS) }) return jsonify({ 'items': ITEMS, 'count': len(ITEMS), 'trace_id': trace_id }) @app.route('/item/') def get_item(item_id): with tracer.start_as_current_span("fetch_item") as span: span.set_attribute("item.id", item_id) trace_id = format(span.get_span_context().trace_id, '032x') # Simulate processing time.sleep(random.uniform(0.01, 0.05)) if item_id < 0 or item_id >= len(ITEMS): logger.warning("Item not found", extra={ 'trace_id': trace_id, 'item_id': item_id }) return jsonify({'error': 'Item not found', 'trace_id': trace_id}), 404 item = ITEMS[item_id] logger.info("Item fetched", extra={ 'trace_id': trace_id, 'item_id': item_id, 'item': item }) return jsonify({ 'id': item_id, 'name': item, 'trace_id': trace_id }) @app.route('/slow') def slow_endpoint(): with tracer.start_as_current_span("slow_operation") as span: trace_id = format(span.get_span_context().trace_id, '032x') logger.info("Slow operation started", extra={'trace_id': trace_id}) # Simulate slow operation time.sleep(random.uniform(1, 3)) logger.info("Slow operation completed", extra={'trace_id': trace_id}) return jsonify({ 'message': 'Operation completed', 'trace_id': trace_id }) @app.route('/error') def error_endpoint(): with tracer.start_as_current_span("error_operation") as span: trace_id = format(span.get_span_context().trace_id, '032x') logger.error("Intentional error triggered", extra={'trace_id': trace_id}) span.set_attribute("error", True) return jsonify({ 'error': 'This is an intentional error', 'trace_id': trace_id }), 500 if __name__ == '__main__': app.run(host='0.0.0.0', port=8080) --- apiVersion: apps/v1 kind: Deployment metadata: name: demo-app namespace: observability labels: app: demo-app spec: replicas: 1 selector: matchLabels: app: demo-app template: metadata: labels: app: demo-app annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" spec: containers: - name: demo-app image: python:3.11-slim command: - /bin/bash - -c - | pip install flask opentelemetry-api opentelemetry-sdk \ opentelemetry-instrumentation-flask \ opentelemetry-exporter-otlp-proto-grpc \ prometheus-flask-exporter && \ python /app/app.py ports: - name: http containerPort: 8080 volumeMounts: - name: app-code mountPath: /app resources: requests: cpu: 100m memory: 256Mi limits: cpu: 500m memory: 512Mi volumes: - name: app-code configMap: name: demo-app --- apiVersion: v1 kind: Service metadata: name: demo-app namespace: observability labels: app: demo-app annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" spec: type: ClusterIP ports: - port: 8080 targetPort: http protocol: TCP name: http selector: app: demo-app === ./k8s/otlp/otel-collector.yaml === apiVersion: v1 kind: Service metadata: { name: otel-collector, namespace: observability } spec: selector: { app: otel-collector } ports: - { name: otlp-http, port: 4318, targetPort: 4318 } - { name: otlp-grpc, port: 4317, targetPort: 4317 } --- apiVersion: apps/v1 kind: Deployment metadata: { name: otel-collector, namespace: observability } spec: replicas: 2 selector: { matchLabels: { app: otel-collector } } template: metadata: { labels: { app: otel-collector } } spec: nodeSelector: { node: hetzner-2 } containers: - name: otel-collector image: otel/opentelemetry-collector-contrib:0.102.0 args: ["--config=/etc/otel/config.yaml"] ports: - { containerPort: 4318 } - { 
containerPort: 4317 } volumeMounts: - { name: cfg, mountPath: /etc/otel } volumes: - { name: cfg, configMap: { name: otel-config } } --- apiVersion: v1 kind: ConfigMap metadata: { name: otel-config, namespace: observability } data: config.yaml: | receivers: otlp: protocols: { http: {}, grpc: {} } processors: { batch: {} } exporters: logging: {} elasticsearch: endpoints: ["http://elasticsearch.elastic.svc.cluster.local:9200"] logs_index: "k8s-logs" service: pipelines: logs: { receivers: [otlp], processors: [batch], exporters: [elasticsearch, logging] } traces: { receivers: [otlp], processors: [batch], exporters: [logging] } metrics: { receivers: [otlp], processors: [batch], exporters: [logging] } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: otlp namespace: observability annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["otlp.betelgeusebytes.io"], secretName: otlp-tls }] rules: - host: otlp.betelgeusebytes.io http: paths: - path: /v1/traces pathType: Prefix backend: { service: { name: otel-collector, port: { number: 4318 } } } - path: /v1/metrics pathType: Prefix backend: { service: { name: otel-collector, port: { number: 4318 } } } - path: /v1/logs pathType: Prefix backend: { service: { name: otel-collector, port: { number: 4318 } } } === ./k8s/postgres/pg.yaml === # k8s/postgres/pg-init-sql-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: pg-init-sql namespace: db data: 00_extensions.sql: | \connect gitea CREATE EXTENSION IF NOT EXISTS postgis; CREATE EXTENSION IF NOT EXISTS postgis_topology; CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS hstore; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS citext; CREATE EXTENSION IF NOT EXISTS unaccent; CREATE EXTENSION IF NOT EXISTS pgcrypto; DO $$ BEGIN CREATE EXTENSION IF NOT EXISTS plpython3u; EXCEPTION WHEN undefined_file THEN RAISE NOTICE 'plpython3u not available in this image'; END $$; 01_tune.sql: | ALTER SYSTEM SET shared_buffers = '1GB'; ALTER SYSTEM SET work_mem = '32MB'; ALTER SYSTEM SET maintenance_work_mem = '512MB'; ALTER SYSTEM SET max_connections = 200; SELECT pg_reload_conf(); --- # k8s/postgres/pg-conf.yaml apiVersion: v1 kind: ConfigMap metadata: name: pg-conf namespace: db data: pg_hba.conf: | # Local connections local all all trust host all all 127.0.0.1/32 trust host all all ::1/128 trust # TLS-only access from ANY external IP (harden as needed) hostssl all all 0.0.0.0/0 md5 hostssl all all ::/0 md5 --- # k8s/postgres/pg-secret.yaml apiVersion: v1 kind: Secret metadata: name: pg18-secret namespace: db type: Opaque stringData: POSTGRES_PASSWORD: "pa$$word" --- # k8s/postgres/pg-certificate.yaml apiVersion: cert-manager.io/v1 kind: Certificate metadata: name: pg-tls namespace: db spec: secretName: pg-tls dnsNames: - pg.betelgeusebytes.io issuerRef: kind: ClusterIssuer name: letsencrypt-prod --- # k8s/postgres/postgres-svc.yaml apiVersion: v1 kind: Service metadata: name: postgres namespace: db spec: selector: app: postgres ports: - name: postgres port: 5432 targetPort: 5432 --- apiVersion: v1 kind: Service metadata: name: postgres-hl namespace: db spec: clusterIP: None selector: app: postgres ports: - name: postgres port: 5432 targetPort: 5432 --- # k8s/postgres/postgres.yaml apiVersion: apps/v1 kind: StatefulSet metadata: name: postgres namespace: db spec: serviceName: postgres-hl replicas: 1 selector: matchLabels: app: postgres template: 
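# Once the ingress-nginx tcp-services ConfigMap (see the commented kubectl
# snippet at the end of this file) routes 5432, an external client can connect
# over TLS against the cert-manager certificate above, using the user/db
# defined in this manifest:
#   psql "host=pg.betelgeusebytes.io port=5432 user=app dbname=gitea sslmode=verify-full"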
metadata: labels: app: postgres spec: securityContext: runAsUser: 999 runAsGroup: 999 fsGroup: 999 fsGroupChangePolicy: "Always" initContainers: - name: install-certs image: busybox:1.36 command: - sh - -c - | cp /in/tls.crt /out/server.crt cp /in/tls.key /out/server.key chown 999:999 /out/* || true chmod 600 /out/server.key securityContext: runAsUser: 0 volumeMounts: - { name: pg-tls, mountPath: /in, readOnly: true } - { name: pg-certs, mountPath: /out } containers: - name: postgres image: axxs/postgres:18-postgis-vector imagePullPolicy: IfNotPresent args: - -c - ssl=on - -c - ssl_cert_file=/certs/server.crt - -c - ssl_key_file=/certs/server.key - -c - hba_file=/etc/postgresql-custom/pg_hba.conf env: - name: POSTGRES_USER value: "app" - name: POSTGRES_DB value: "gitea" - name: POSTGRES_PASSWORD valueFrom: secretKeyRef: name: pg18-secret key: POSTGRES_PASSWORD - name: TZ value: "Europe/Paris" ports: - name: postgres containerPort: 5432 volumeMounts: - { name: data, mountPath: /var/lib/postgresql } # PG18 expects parent, creates /var/lib/postgresql/18/main - { name: init, mountPath: /docker-entrypoint-initdb.d, readOnly: true } - { name: pg-certs, mountPath: /certs } - { name: pg-conf, mountPath: /etc/postgresql-custom } readinessProbe: exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] } initialDelaySeconds: 5 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 6 livenessProbe: exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] } initialDelaySeconds: 20 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 6 resources: requests: { cpu: "250m", memory: "512Mi" } limits: { cpu: "1", memory: "2Gi" } volumes: - name: init configMap: name: pg-init-sql defaultMode: 0444 - name: pg-tls secret: secretName: pg-tls - name: pg-certs emptyDir: {} - name: pg-conf configMap: name: pg-conf defaultMode: 0444 volumeClaimTemplates: - metadata: name: data spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: requests: storage: 80Gi # kubectl -n ingress-nginx create configmap tcp-services \ # --from-literal="5432=db/postgres:5432" \ # -o yaml --dry-run=client | kubectl apply -f - # kubectl -n ingress-nginx patch deploy ingress-nginx-controller \ # --type='json' -p='[ # {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"} # ]' # # controller must listen on hostPort:5432 (we already patched earlier) === ./k8s/postgres/postgres-ha.yaml === --- apiVersion: v1 kind: Namespace metadata: name: db --- # Password secret (replace with your own or generate one) apiVersion: v1 kind: Secret metadata: name: pg18-secret namespace: db type: Opaque stringData: POSTGRES_PASSWORD: "pa$$word" --- # Init SQL: keeps your original name and keeps enabling PostGIS + vector apiVersion: v1 kind: ConfigMap metadata: name: pg-init-sql namespace: db data: 00_extensions.sql: | -- enable common extensions in the default DB and template1 so future DBs inherit them \connect gitea CREATE EXTENSION IF NOT EXISTS postgis; CREATE EXTENSION IF NOT EXISTS vector; CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false); CREATE EXTENSION IF NOT EXISTS tablefunc; -- postpone pg_stat_statements CREATE to postStart (needs preload) CREATE EXTENSION IF NOT EXISTS postgis_topology; CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS hstore; CREATE EXTENSION IF 
NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS citext; CREATE EXTENSION IF NOT EXISTS unaccent; CREATE EXTENSION IF NOT EXISTS pgcrypto; -- PL/Python (available in your image) DO $$ BEGIN CREATE EXTENSION IF NOT EXISTS plpython3u; EXCEPTION WHEN undefined_file THEN RAISE NOTICE 'plpython3u not available in this image'; END $$; -- Also on template1 for new DBs (heavier, but intentional) \connect template1 CREATE EXTENSION IF NOT EXISTS postgis; CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS hstore; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS citext; CREATE EXTENSION IF NOT EXISTS unaccent; CREATE EXTENSION IF NOT EXISTS pgcrypto; -- Arabic-friendly ICU collation, non-deterministic for case/diacritics DO $$ BEGIN PERFORM 1 FROM pg_collation WHERE collname='arabic'; IF NOT FOUND THEN CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false); END IF; END$$; 01_tune.sql: | -- Enable pg_stat_statements on next server start DO $$ DECLARE cur text := current_setting('shared_preload_libraries', true); BEGIN IF cur IS NULL OR position('pg_stat_statements' in cur) = 0 THEN PERFORM pg_catalog.pg_reload_conf(); -- harmless even if no changes yet EXECUTE $$ALTER SYSTEM SET shared_preload_libraries = $$ || quote_literal(coalesce(NULLIF(cur,'' ) || ',pg_stat_statements', 'pg_stat_statements')); END IF; END$$; -- Optional tuning (adjust to your limits) ALTER SYSTEM SET shared_buffers = '1GB'; ALTER SYSTEM SET work_mem = '32MB'; ALTER SYSTEM SET maintenance_work_mem = '512MB'; ALTER SYSTEM SET max_connections = 200; -- Reload applies some settings immediately; others need restart (OK after init completes) SELECT pg_reload_conf(); ALTER SYSTEM SET pg_stat_statements.max = 10000; ALTER SYSTEM SET pg_stat_statements.track = 'all'; ALTER SYSTEM SET pg_stat_statements.save = on; pg_hba.conf: | # Allow loopback local all all trust host all all 127.0.0.1/32 trust host all all ::1/128 trust # Allow TLS connections from your IP(s) only hostssl all all YOUR_PUBLIC_IP/32 md5 # (Optional) Add more CIDRs or a private network range here: # hostssl all all 10.0.0.0/8 md5 --- # Headless service required by StatefulSet for stable network IDs apiVersion: v1 kind: Service metadata: name: postgres-hl namespace: db spec: clusterIP: None selector: app: postgres ports: - name: postgres port: 5432 targetPort: 5432 --- # Regular ClusterIP service for clients (keeps your original name) apiVersion: v1 kind: Service metadata: name: postgres namespace: db spec: selector: app: postgres ports: - name: postgres port: 5432 targetPort: 5432 --- apiVersion: apps/v1 kind: StatefulSet metadata: name: postgres namespace: db spec: serviceName: postgres-hl replicas: 1 selector: matchLabels: app: postgres template: metadata: labels: app: postgres spec: securityContext: runAsUser: 999 runAsGroup: 999 fsGroup: 999 fsGroupChangePolicy: "Always" initContainers: # Copy cert-manager certs to a writable path with correct perms for Postgres - name: install-certs image: busybox:1.36 command: - sh - -c - | cp /in/tls.crt /out/server.crt cp /in/tls.key /out/server.key cp /in/ca.crt /out/ca.crt || true chown 999:999 /out/* || true chmod 600 /out/server.key securityContext: runAsUser: 0 volumeMounts: - { name: pg-tls, mountPath: /in, readOnly: true } - { name: pg-certs, mountPath: /out } containers: - name: postgres image: axxs/postgres:18-postgis-vector imagePullPolicy: IfNotPresent args: - -c - ssl=on - -c - ssl_cert_file=/certs/server.crt - -c - 
ssl_key_file=/certs/server.key - -c - ssl_ca_file=/certs/ca.crt - -c - hba_file=/etc/postgresql-custom/pg_hba.conf lifecycle: postStart: exec: command: - /bin/sh - -c - | set -e # Wait until server accepts connections for i in $(seq 1 30); do pg_isready -h 127.0.0.1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" && break sleep 1 done psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" env: - name: POSTGRES_USER value: "app" - name: POSTGRES_DB value: "gitea" # matches your \connect gitea - name: POSTGRES_PASSWORD valueFrom: secretKeyRef: name: pg18-secret key: POSTGRES_PASSWORD - name: TZ value: "Europe/Paris" ports: - name: postgres containerPort: 5432 volumeMounts: # ✅ PG 18 requires this parent path; it will create /var/lib/postgresql/18/main - name: data mountPath: /var/lib/postgresql # your init scripts ConfigMap - name: init mountPath: /docker-entrypoint-initdb.d readOnly: true - name: pg-certs mountPath: /certs # pg_hba.conf - name: pg-conf mountPath: /etc/postgresql-custom readinessProbe: exec: command: - /bin/sh - -c - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1 initialDelaySeconds: 5 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 6 livenessProbe: exec: command: - /bin/sh - -c - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1 initialDelaySeconds: 20 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 6 resources: requests: cpu: "250m" memory: "512Mi" limits: cpu: "1" memory: "2Gi" volumes: - name: init configMap: name: pg-init-sql defaultMode: 0444 - name: pg-tls secret: secretName: pg-tls - name: pg-certs emptyDir: {} - name: pg-conf configMap: name: pg-conf defaultMode: 0444 volumeClaimTemplates: - metadata: name: data spec: accessModes: ["ReadWriteOnce"] resources: requests: storage: 10Gi # storageClassName: # optionally pin this === ./k8s/postgres/postgres.yaml === apiVersion: v1 kind: Service metadata: { name: postgres, namespace: db } spec: ports: [{ port: 5432, targetPort: 5432 }] selector: { app: postgres } --- apiVersion: v1 kind: ConfigMap metadata: { name: pg-init-sql, namespace: db } data: 00_extensions.sql: | -- enable common extensions in the default DB and template1 so future DBs inherit them \connect gitea CREATE EXTENSION IF NOT EXISTS postgis; CREATE EXTENSION IF NOT EXISTS vector; CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false); CREATE EXTENSION IF NOT EXISTS tablefunc; CREATE EXTENSION IF NOT EXISTS pg_stat_statements; CREATE EXTENSION IF NOT EXISTS postgis_topology; CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS hstore; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS citext; CREATE EXTENSION IF NOT EXISTS unaccent; CREATE EXTENSION IF NOT EXISTS pgcrypto; -- PL/Python (optional; requires image with plpython3u, postgis image has it) DO $$ BEGIN CREATE EXTENSION IF NOT EXISTS plpython3u; EXCEPTION WHEN undefined_file THEN RAISE NOTICE 'plpython3u not available in this image'; END $$; -- Also on template1 for new DBs: \connect template1 CREATE EXTENSION IF NOT EXISTS postgis; CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS hstore; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS citext; CREATE EXTENSION IF NOT EXISTS unaccent; CREATE EXTENSION IF NOT EXISTS pgcrypto; -- Arabic-friendly ICU collation (PostgreSQL >= 13) -- Non-deterministic collation helps proper case/diacritics 
comparisons DO $$ BEGIN PERFORM 1 FROM pg_collation WHERE collname='arabic'; IF NOT FOUND THEN CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false); END IF; END$$; -- Example: ensure gitea DB uses UTF8; Arabic text search often needs unaccent + custom dictionaries. -- You can create additional DBs with: CREATE DATABASE mydb TEMPLATE template1 ENCODING 'UTF8'; 01_tune.sql: | -- small safe defaults; adjust later ALTER SYSTEM SET shared_buffers = '1GB'; ALTER SYSTEM SET work_mem = '32MB'; ALTER SYSTEM SET maintenance_work_mem = '512MB'; ALTER SYSTEM SET max_connections = 200; SELECT pg_reload_conf(); --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: postgres, namespace: db } spec: serviceName: postgres replicas: 1 selector: { matchLabels: { app: postgres } } template: metadata: { labels: { app: postgres } } spec: nodeSelector: node: hetzner-2 securityContext: fsGroup: 999 # Debian postgres user/group in postgis image fsGroupChangePolicy: OnRootMismatch initContainers: - name: fix-perms image: busybox:1.36 command: ["sh","-c","chown -R 999:999 /var/lib/postgresql/data || true"] securityContext: { runAsUser: 0 } volumeMounts: [{ name: data, mountPath: /var/lib/postgresql/data }] containers: - name: postgres image: postgres:16-3.4 env: - name: POSTGRES_PASSWORD valueFrom: { secretKeyRef: { name: postgres-auth, key: POSTGRES_PASSWORD } } - { name: POSTGRES_USER, value: gitea } - { name: POSTGRES_DB, value: gitea } - name: POSTGRES_INITDB_ARGS value: "--encoding=UTF8 --locale=C.UTF-8" ports: [{ containerPort: 5432 }] volumeMounts: - { name: data, mountPath: /var/lib/postgresql/data } - { name: init, mountPath: /docker-entrypoint-initdb.d } volumeClaimTemplates: - metadata: { name: data } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 80Gi } } --- # Mount the init scripts apiVersion: apps/v1 kind: StatefulSet metadata: name: postgres namespace: db spec: template: spec: volumes: - name: init configMap: name: pg-init-sql defaultMode: 0444 === ./k8s/postgres/secret.yaml === apiVersion: v1 kind: Secret metadata: { name: postgres-auth, namespace: db } type: Opaque stringData: POSTGRES_PASSWORD: "PG-ADM1N" GITEA_DB_PASSWORD: "G1TEA" === ./k8s/prometheus/prometheus-config.yaml === apiVersion: v1 kind: ConfigMap metadata: { name: prometheus-config, namespace: monitoring } data: prometheus.yml: | global: { scrape_interval: 15s } scrape_configs: - job_name: 'kubernetes-pods' kubernetes_sd_configs: [ { role: pod } ] relabel_configs: - action: keep source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] regex: 'true' === ./k8s/prometheus/prometheus.yaml === apiVersion: v1 kind: Service metadata: { name: prometheus, namespace: monitoring } spec: ports: [{ port: 9090, targetPort: 9090 }] selector: { app: prometheus } --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: prometheus, namespace: monitoring } spec: serviceName: prometheus replicas: 1 selector: { matchLabels: { app: prometheus } } template: metadata: { labels: { app: prometheus } } spec: nodeSelector: { node: hetzner-2 } containers: - name: prometheus image: prom/prometheus:v2.53.0 args: ["--config.file=/etc/prometheus/prometheus.yml","--storage.tsdb.path=/prometheus"] ports: [{ containerPort: 9090 }] volumeMounts: - { name: data, mountPath: /prometheus } - { name: config, mountPath: /etc/prometheus } volumes: - { name: config, configMap: { name: prometheus-config } } volumeClaimTemplates: - metadata: { name: data } spec: accessModes: 
["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 50Gi } } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: prometheus namespace: monitoring annotations: cert-manager.io/cluster-issuer: letsencrypt-prod nginx.ingress.kubernetes.io/auth-type: basic nginx.ingress.kubernetes.io/auth-secret: basic-auth-prometheus nginx.ingress.kubernetes.io/auth-realm: "Authentication Required" spec: ingressClassName: nginx tls: [{ hosts: ["prometheus.betelgeusebytes.io"], secretName: prometheus-tls }] rules: - host: prometheus.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: prometheus, port: { number: 9090 } } } === ./k8s/redis/redis-pv.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: pv-redis spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/redis nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/redis/redis.yaml === apiVersion: v1 kind: Service metadata: { name: redis, namespace: db } spec: ports: [{ port: 6379, targetPort: 6379 }] selector: { app: redis } --- apiVersion: apps/v1 kind: StatefulSet metadata: { name: redis, namespace: db } spec: serviceName: redis replicas: 1 selector: { matchLabels: { app: redis } } template: metadata: { labels: { app: redis } } spec: nodeSelector: { node: hetzner-2 } containers: - name: redis image: redis:7 args: ["--requirepass", "$(REDIS_PASSWORD)"] env: - name: REDIS_PASSWORD valueFrom: { secretKeyRef: { name: redis-auth, key: REDIS_PASSWORD } } ports: [{ containerPort: 6379 }] volumeMounts: - { name: data, mountPath: /data } volumeClaimTemplates: - metadata: { name: data } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 10Gi } } --- apiVersion: v1 kind: Secret metadata: { name: redis-auth, namespace: db } type: Opaque stringData: { REDIS_PASSWORD: "RED1S" } === ./k8s/sso/sso.yaml === # PV apiVersion: v1 kind: PersistentVolume metadata: name: pv-auth spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/auth nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- # k8s/auth/keycloak/secret.yaml apiVersion: v1 kind: Secret metadata: { name: keycloak-admin, namespace: db } type: Opaque stringData: { KEYCLOAK_ADMIN: "admin", KEYCLOAK_ADMIN_PASSWORD: "admin" } --- # k8s/auth/keycloak/pvc.yaml apiVersion: v1 kind: PersistentVolumeClaim metadata: { name: keycloak-data, namespace: db } spec: accessModes: ["ReadWriteOnce"] storageClassName: local-ssd-hetzner resources: { requests: { storage: 10Gi } } --- # k8s/auth/keycloak/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: keycloak, namespace: db } spec: replicas: 1 selector: { matchLabels: { app: keycloak } } template: metadata: { labels: { app: keycloak } } spec: # Ensure the PV is owned by the Keycloak UID/GID securityContext: fsGroup: 1000 initContainers: - name: fix-permissions image: busybox command: ['sh', '-c', 'chown -R 1000:1000 /opt/keycloak/data && chmod -R 755 /opt/keycloak/data'] volumeMounts: - name: data mountPath: /opt/keycloak/data containers: - name: keycloak image: quay.io/keycloak/keycloak:latest args: 
["start","--http-enabled=true","--proxy-headers=xforwarded","--hostname-strict=false"] env: - { name: KEYCLOAK_ADMIN, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN } } } - { name: KEYCLOAK_ADMIN_PASSWORD, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN_PASSWORD } } } ports: [{ containerPort: 8080 }] volumeMounts: [{ name: data, mountPath: /opt/keycloak/data }] securityContext: runAsUser: 1000 runAsGroup: 1000 volumes: - name: data persistentVolumeClaim: { claimName: keycloak-data } --- apiVersion: v1 kind: Service metadata: { name: keycloak, namespace: db } spec: { selector: { app: keycloak }, ports: [ { port: 80, targetPort: 8080 } ] } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: keycloak namespace: db annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["auth.betelgeusebytes.io"], secretName: keycloak-tls }] rules: - host: auth.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: keycloak, port: { number: 80 } } } === ./k8s/storage/persistent-volumes.yaml === apiVersion: v1 kind: PersistentVolume metadata: name: pv-postgres spec: capacity: storage: 80Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/postgres nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-elasticsearch spec: capacity: storage: 300Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/elasticsearch nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-gitea spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/gitea nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-jupyter spec: capacity: storage: 20Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/jupyter nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-kafka spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/kafka nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-zookeeper-data spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/zookeeper-data nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-zookeeper-log spec: capacity: storage: 10Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: 
path: /mnt/local-ssd/zookeeper-log nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolume metadata: name: pv-prometheus spec: capacity: storage: 50Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-ssd-hetzner local: path: /mnt/local-ssd/prometheus nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 === ./k8s/storage/storageclass.yaml === apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: name: local-ssd-hetzner provisioner: kubernetes.io/no-provisioner volumeBindingMode: WaitForFirstConsumer === ./k8s/tei/tei.yaml === # k8s/ai/tei/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: { name: tei, namespace: ml } spec: replicas: 1 selector: { matchLabels: { app: tei } } template: metadata: { labels: { app: tei } } spec: containers: - name: tei image: ghcr.io/huggingface/text-embeddings-inference:cpu-latest env: [{ name: MODEL_ID, value: "mixedbread-ai/mxbai-embed-large-v1" }] ports: [{ containerPort: 80 }] --- apiVersion: v1 kind: Service metadata: { name: tei, namespace: ml } spec: { selector: { app: tei }, ports: [ { port: 80, targetPort: 80 } ] } --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: tei namespace: ml annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod } spec: ingressClassName: nginx tls: [{ hosts: ["embeddings.betelgeusebytes.io"], secretName: tei-tls }] rules: - host: embeddings.betelgeusebytes.io http: paths: - path: / pathType: Prefix backend: { service: { name: tei, port: { number: 80 } } } === ./k8s/trading/ib-gateway.yaml === apiVersion: v1 kind: Namespace metadata: name: trading labels: name: trading environment: production --- # OPTIONAL: Use this if you want to persist IB Gateway settings/logs # across pod restarts. For most use cases, this is NOT needed since # IB Gateway is mostly stateless and credentials are in Secrets. 
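# Reminder that applies to every local PV in this repo (kubernetes.io/no-provisioner):
# the backing directory must already exist on the pinned node before the
# consuming pod can start, e.g. on hetzner-2:
#   mkdir -p /mnt/local-ssd/ib-gateway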
# # Only create this PV/PVC if you need to persist: # - TWS session data # - Custom workspace layouts # - Historical API usage logs apiVersion: v1 kind: PersistentVolume metadata: name: ib-gateway-data labels: type: local app: ib-gateway spec: capacity: storage: 5Gi accessModes: - ReadWriteOnce persistentVolumeReclaimPolicy: Retain storageClassName: local-storage local: path: /mnt/local-ssd/ib-gateway # Adjust to your local SSD path nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: - hetzner-2 --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: ib-gateway-data namespace: trading spec: accessModes: - ReadWriteOnce resources: requests: storage: 5Gi storageClassName: local-storage selector: matchLabels: app: ib-gateway # To use this PVC, add to Deployment volumeMounts: # - name: data # mountPath: /root/Jts # And to volumes: # - name: data # persistentVolumeClaim: # claimName: ib-gateway-data --- apiVersion: v1 kind: Secret metadata: name: ib-credentials namespace: trading type: Opaque stringData: # IMPORTANT: Replace these with your actual IB credentials # For paper trading, use your paper trading account username: "saladin85" password: "3Lcd@05041985" # Trading mode: "paper" or "live" trading-mode: "paper" # IB Gateway config (jts.ini equivalent) # This enables headless mode and configures ports ibgateway.conf: | [IBGateway] TradingMode=paper ApiOnly=true ReadOnlyApi=false TrustedIPs=127.0.0.1 [IBGatewayAPI] ApiPortNumber=4002 [Logon] UseRemoteSettings=no Locale=en ColorPaletteName=dark [Display] ShowSplashScreen=no --- apiVersion: v1 kind: ConfigMap metadata: name: ib-gateway-config namespace: trading data: # Startup script to configure IB Gateway for headless operation startup.sh: | #!/bin/bash set -e echo "Starting IB Gateway in headless mode..." echo "Trading Mode: ${TRADING_MODE}" echo "Port: ${TWS_PORT}" # Configure based on trading mode if [ "${TRADING_MODE}" == "live" ]; then export TWS_PORT=4001 echo "⚠️ LIVE TRADING MODE - USE WITH CAUTION ⚠️" else export TWS_PORT=4002 echo "📝 Paper Trading Mode (Safe)" fi # IMPORTANT: use the env vars provided by the Deployment export IB_USERNAME="${TWS_USERID}" export IB_PASSWORD="${TWS_PASSWORD}" # Start IB Gateway exec /opt/ibgateway/ibgateway-latest-standalone-linux-x64.sh \ --tws-path=/root/Jts \ --tws-settings-path=/root \ --user="${IB_USERNAME}" \ --pw="${IB_PASSWORD}" \ --mode="${TRADING_MODE}" \ --port="${TWS_PORT}" # Health check script healthcheck.sh: | #!/bin/bash # Check if TWS API port is listening # PORT=${TWS_PORT:-4002} # nc -z localhost $PORT # exit $? 
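# The Deployments below rely on plain tcpSocket probes, so this script is only
# needed if you switch to an exec probe, roughly (assumes this ConfigMap were
# mounted at a hypothetical /config path):
#   readinessProbe:
#     exec:
#       command: ["/bin/sh", "/config/healthcheck.sh"]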
#!/bin/sh # Pure-python TCP check (no nc required) PORT="${TWS_PORT:-4002}" python - <<'PY' import os, socket, sys port = int(os.environ.get("TWS_PORT", os.environ.get("PORT", "4002"))) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(2) try: s.connect(("127.0.0.1", port)) sys.exit(0) except Exception: sys.exit(1) finally: s.close() PY --- # apiVersion: apps/v1 # kind: Deployment # metadata: # name: ib-gateway # namespace: trading # labels: # app: ib-gateway # component: trading-infrastructure # spec: # replicas: 1 # IB Gateway should only have 1 instance per account # strategy: # type: Recreate # Avoid multiple simultaneous logins # selector: # matchLabels: # app: ib-gateway # template: # metadata: # labels: # app: ib-gateway # annotations: # prometheus.io/scrape: "false" # No metrics endpoint by default # spec: # # Pin to hetzner-2 (matches your existing pattern) # nodeSelector: # kubernetes.io/hostname: hetzner-2 # # Security context # securityContext: # runAsNonRoot: false # IB Gateway requires root for VNC (even if unused) # fsGroup: 1000 # containers: # - name: ib-gateway # # Using community-maintained IB Gateway image # # Alternative: waytrade/ib-gateway:latest # image: ghcr.io/gnzsnz/ib-gateway:stable # imagePullPolicy: IfNotPresent # env: # - name: TWS_USERID # valueFrom: # secretKeyRef: # name: ib-credentials # key: username # - name: TWS_PASSWORD # valueFrom: # secretKeyRef: # name: ib-credentials # key: password # - name: TRADING_MODE # valueFrom: # secretKeyRef: # name: ib-credentials # key: trading-mode # - name: TWS_PORT # value: "4002" # Default to paper trading # - name: READ_ONLY_API # value: "no" # # Ports # ports: # - name: paper-trading # containerPort: 4002 # protocol: TCP # - name: live-trading # containerPort: 4001 # protocol: TCP # - name: vnc # containerPort: 5900 # protocol: TCP # VNC (not exposed externally) # # Resource limits # resources: # requests: # memory: "1Gi" # cpu: "500m" # limits: # memory: "2Gi" # cpu: "1000m" # # Liveness probe (check if API port is responsive) # startupProbe: # tcpSocket: # port: 4002 # initialDelaySeconds: 60 # Wait 60s before first check # periodSeconds: 10 # Check every 10s # timeoutSeconds: 5 # failureThreshold: 18 # 60s + (10s * 18) = 240s total startup time # livenessProbe: # tcpSocket: # port: 4002 # initialDelaySeconds: 0 # IB Gateway takes time to start # periodSeconds: 60 # timeoutSeconds: 5 # failureThreshold: 3 # # Readiness probe # readinessProbe: # tcpSocket: # port: 4002 # initialDelaySeconds: 0 # periodSeconds: 10 # timeoutSeconds: 5 # failureThreshold: 2 # # Volume mounts for config # volumeMounts: # - name: ib-config # mountPath: /root/Jts/jts.ini # subPath: ibgateway.conf # - name: startup-script # mountPath: /startup.sh # subPath: startup.sh # - name: data # mountPath: /root/Jts # # Logging to stdout (Fluent Bit will collect) # # IB Gateway logs go to /root/Jts/log by default # lifecycle: # postStart: # exec: # command: # - /bin/sh # - -c # - | # mkdir -p /root/Jts/log # ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true # volumes: # - name: ib-config # secret: # secretName: ib-credentials # defaultMode: 0644 # - name: startup-script # configMap: # name: ib-gateway-config # defaultMode: 0755 # - name: data # persistentVolumeClaim: # claimName: ib-gateway-data # # Restart policy # restartPolicy: Always # # DNS policy for internal cluster resolution # dnsPolicy: ClusterFirst apiVersion: apps/v1 kind: Deployment metadata: name: ib-gateway namespace: trading labels: app: ib-gateway component: 
---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: ib-gateway
#   namespace: trading
#   labels:
#     app: ib-gateway
#     component: trading-infrastructure
# spec:
#   replicas: 1  # IB Gateway should only have 1 instance per account
#   strategy:
#     type: Recreate  # Avoid multiple simultaneous logins
#   selector:
#     matchLabels:
#       app: ib-gateway
#   template:
#     metadata:
#       labels:
#         app: ib-gateway
#       annotations:
#         prometheus.io/scrape: "false"  # No metrics endpoint by default
#     spec:
#       # Pin to hetzner-2 (matches your existing pattern)
#       nodeSelector:
#         kubernetes.io/hostname: hetzner-2
#       # Security context
#       securityContext:
#         runAsNonRoot: false  # IB Gateway requires root for VNC (even if unused)
#         fsGroup: 1000
#       containers:
#         - name: ib-gateway
#           # Using community-maintained IB Gateway image
#           # Alternative: waytrade/ib-gateway:latest
#           image: ghcr.io/gnzsnz/ib-gateway:stable
#           imagePullPolicy: IfNotPresent
#           env:
#             - name: TWS_USERID
#               valueFrom:
#                 secretKeyRef:
#                   name: ib-credentials
#                   key: username
#             - name: TWS_PASSWORD
#               valueFrom:
#                 secretKeyRef:
#                   name: ib-credentials
#                   key: password
#             - name: TRADING_MODE
#               valueFrom:
#                 secretKeyRef:
#                   name: ib-credentials
#                   key: trading-mode
#             - name: TWS_PORT
#               value: "4002"  # Default to paper trading
#             - name: READ_ONLY_API
#               value: "no"
#           # Ports
#           ports:
#             - name: paper-trading
#               containerPort: 4002
#               protocol: TCP
#             - name: live-trading
#               containerPort: 4001
#               protocol: TCP
#             - name: vnc
#               containerPort: 5900
#               protocol: TCP  # VNC (not exposed externally)
#           # Resource limits
#           resources:
#             requests:
#               memory: "1Gi"
#               cpu: "500m"
#             limits:
#               memory: "2Gi"
#               cpu: "1000m"
#           # Liveness probe (check if API port is responsive)
#           startupProbe:
#             tcpSocket:
#               port: 4002
#             initialDelaySeconds: 60  # Wait 60s before first check
#             periodSeconds: 10        # Check every 10s
#             timeoutSeconds: 5
#             failureThreshold: 18     # 60s + (10s * 18) = 240s total startup time
#           livenessProbe:
#             tcpSocket:
#               port: 4002
#             initialDelaySeconds: 0   # IB Gateway takes time to start
#             periodSeconds: 60
#             timeoutSeconds: 5
#             failureThreshold: 3
#           # Readiness probe
#           readinessProbe:
#             tcpSocket:
#               port: 4002
#             initialDelaySeconds: 0
#             periodSeconds: 10
#             timeoutSeconds: 5
#             failureThreshold: 2
#           # Volume mounts for config
#           volumeMounts:
#             - name: ib-config
#               mountPath: /root/Jts/jts.ini
#               subPath: ibgateway.conf
#             - name: startup-script
#               mountPath: /startup.sh
#               subPath: startup.sh
#             - name: data
#               mountPath: /root/Jts
#           # Logging to stdout (Fluent Bit will collect)
#           # IB Gateway logs go to /root/Jts/log by default
#           lifecycle:
#             postStart:
#               exec:
#                 command:
#                   - /bin/sh
#                   - -c
#                   - |
#                     mkdir -p /root/Jts/log
#                     ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true
#       volumes:
#         - name: ib-config
#           secret:
#             secretName: ib-credentials
#             defaultMode: 0644
#         - name: startup-script
#           configMap:
#             name: ib-gateway-config
#             defaultMode: 0755
#         - name: data
#           persistentVolumeClaim:
#             claimName: ib-gateway-data
#       # Restart policy
#       restartPolicy: Always
#       # DNS policy for internal cluster resolution
#       dnsPolicy: ClusterFirst
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ib-gateway
  namespace: trading
  labels:
    app: ib-gateway
    component: trading-infrastructure
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: ib-gateway
  template:
    metadata:
      labels:
        app: ib-gateway
      annotations:
        prometheus.io/scrape: "false"
    spec:
      nodeSelector:
        kubernetes.io/hostname: hetzner-2
      securityContext:
        runAsNonRoot: false
        fsGroup: 1000
      # Seed a writable jts.ini into the PVC once
      initContainers:
        - name: seed-jts-config
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              set -e
              mkdir -p /data
              if [ ! -f /data/jts.ini ]; then
                echo "Seeding jts.ini into PVC"
                cp /config/ibgateway.conf /data/jts.ini
                chmod 644 /data/jts.ini
              else
                echo "jts.ini already exists in PVC"
              fi
          volumeMounts:
            - name: ib-config
              mountPath: /config
              readOnly: true
            - name: data
              mountPath: /data
      containers:
        # ------------------------------------------------------------------
        # IB Gateway
        # ------------------------------------------------------------------
        - name: ib-gateway
          image: ghcr.io/gnzsnz/ib-gateway:stable
          imagePullPolicy: IfNotPresent
          env:
            - name: TWS_USERID
              valueFrom:
                secretKeyRef:
                  name: ib-credentials
                  key: username
            - name: TWS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: ib-credentials
                  key: password
            - name: TRADING_MODE
              valueFrom:
                secretKeyRef:
                  name: ib-credentials
                  key: trading-mode
            - name: TWS_PORT
              value: "4002"
            - name: READ_ONLY_API
              value: "no"
          ports:
            - name: ib-api-local
              containerPort: 4002
              protocol: TCP
            - name: live-trading
              containerPort: 4001
              protocol: TCP
            - name: vnc
              containerPort: 5900
              protocol: TCP
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          # IMPORTANT: Probes should check the local IB port (4002)
          startupProbe:
            tcpSocket:
              port: 4002
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 18
          livenessProbe:
            tcpSocket:
              port: 4002
            periodSeconds: 60
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            tcpSocket:
              port: 4002
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 2
          volumeMounts:
            - name: data
              mountPath: /root/Jts
          lifecycle:
            postStart:
              exec:
                command:
                  - sh
                  - -c
                  - |
                    mkdir -p /root/Jts/log
                    ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true
        # ------------------------------------------------------------------
        # Sidecar TCP proxy: accepts cluster traffic, forwards to localhost:4002
        # ------------------------------------------------------------------
        - name: ib-api-proxy
          image: alpine/socat:1.8.0.0
          imagePullPolicy: IfNotPresent
          args:
            - "-d"
            - "-d"
            - "TCP-LISTEN:4003,fork,reuseaddr"
            - "TCP:127.0.0.1:4002"
          ports:
            - name: ib-api
              containerPort: 4003
              protocol: TCP
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "128Mi"
              cpu: "100m"
          # basic probe: is the proxy listening
          readinessProbe:
            tcpSocket:
              port: 4003
            periodSeconds: 5
            timeoutSeconds: 2
            failureThreshold: 3
      volumes:
        - name: ib-config
          secret:
            secretName: ib-credentials
            defaultMode: 0644
        - name: data
          persistentVolumeClaim:
            claimName: ib-gateway-data
      restartPolicy: Always
      dnsPolicy: ClusterFirst
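# A quick way to confirm the init-seeded config and the probes once the Deployment above has
# rolled out (a sketch; adjust the namespace if yours differs).
#   kubectl -n trading rollout status deploy/ib-gateway
#   kubectl -n trading logs deploy/ib-gateway -c seed-jts-config
#   kubectl -n trading exec deploy/ib-gateway -c ib-gateway -- cat /root/Jts/jts.ini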
---
# apiVersion: v1
# kind: Service
# metadata:
#   name: ib-gateway
#   namespace: trading
#   labels:
#     app: ib-gateway
# spec:
#   type: ClusterIP  # Internal-only, not exposed publicly
#   clusterIP: None  # Headless service (optional, remove if you want a stable ClusterIP)
#   selector:
#     app: ib-gateway
#   ports:
#     - name: paper-trading
#       port: 4002
#       targetPort: 4002
#       protocol: TCP
#     - name: live-trading
#       port: 4001
#       targetPort: 4001
#       protocol: TCP
#   sessionAffinity: ClientIP  # Stick to same pod (important for stateful TWS sessions)
#   sessionAffinityConfig:
#     clientIP:
#       timeoutSeconds: 3600  # 1 hour session stickiness
apiVersion: v1
kind: Service
metadata:
  name: ib-gateway
  namespace: trading
  labels:
    app: ib-gateway
spec:
  type: ClusterIP
  selector:
    app: ib-gateway
  ports:
    - name: paper-trading
      port: 4002
      targetPort: 4003  # <-- proxy sidecar, not the gateway directly
      protocol: TCP
    - name: live-trading
      port: 4001
      targetPort: 4001
      protocol: TCP
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 3600
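# Rough smoke test of the paper-trading path (Service port 4002 -> socat sidecar on 4003):
# port-forward the Service and try a TCP connect from the workstation. Assumes a local
# netcat; any TCP client works.
#   kubectl -n trading port-forward svc/ib-gateway 4002:4002
#   nc -vz 127.0.0.1 4002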
=== ./k8s/trading/ib-gateway2.yaml ===
apiVersion: v1
kind: Namespace
metadata:
  name: trading
  labels:
    name: trading
    environment: production
---
apiVersion: v1
kind: Secret
metadata:
  name: ib-credentials
  namespace: trading
type: Opaque
stringData:
  # Rotate these credentials; they were pasted in plaintext earlier.
  username: "saladin85"
  password: "3Lcd@05041985"
  trading-mode: "paper"
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ib-gateway
  namespace: trading
  labels:
    app: ib-gateway
    component: trading-infrastructure
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: ib-gateway
  template:
    metadata:
      labels:
        app: ib-gateway
      annotations:
        prometheus.io/scrape: "false"
    spec:
      nodeSelector:
        kubernetes.io/hostname: hetzner-2
      # Keep the original security context
      securityContext:
        runAsNonRoot: false
        fsGroup: 1000
      containers:
        - name: ib-gateway
          image: ghcr.io/gnzsnz/ib-gateway:stable
          imagePullPolicy: IfNotPresent
          # IMPORTANT: use the env vars this image expects
          env:
            - name: TWS_USERID
              valueFrom:
                secretKeyRef:
                  name: ib-credentials
                  key: username
            - name: TWS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: ib-credentials
                  key: password
            - name: TRADING_MODE
              valueFrom:
                secretKeyRef:
                  name: ib-credentials
                  key: trading-mode
            - name: READ_ONLY_API
              value: "no"
            # These two match the ports the image's startup log shows it uses
            - name: API_PORT
              value: "4002"
            - name: SOCAT_PORT
              value: "4004"
            # optional but nice
            - name: TIME_ZONE
              value: "Etc/UTC"
            - name: TWOFA_TIMEOUT_ACTION
              value: "exit"
          ports:
            # IB API ports (inside container / localhost use)
            - name: api-paper
              containerPort: 4002
              protocol: TCP
            - name: api-live
              containerPort: 4001
              protocol: TCP
            # socat relay port for non-localhost clients (what we expose via the Service)
            - name: api-socat
              containerPort: 4004
              protocol: TCP
            # optional UI/VNC
            - name: vnc
              containerPort: 5900
              protocol: TCP
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          # Probe the socat port (represents remote connectivity)
          startupProbe:
            tcpSocket:
              port: 4004
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 18
          readinessProbe:
            tcpSocket:
              port: 4004
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 2
          livenessProbe:
            tcpSocket:
              port: 4004
            periodSeconds: 60
            timeoutSeconds: 5
            failureThreshold: 3
      restartPolicy: Always
      dnsPolicy: ClusterFirst
---
apiVersion: v1
kind: Service
metadata:
  name: ib-gateway
  namespace: trading
  labels:
    app: ib-gateway
spec:
  type: ClusterIP
  selector:
    app: ib-gateway
  ports:
    # Clients connect to 4002, but we forward to SOCAT_PORT=4004
    - name: paper-trading
      port: 4002
      targetPort: 4004
      protocol: TCP
    # If you truly need live trading, relay it via another socat port too.
    # For now keep it direct (or remove it entirely for safety).
    - name: live-trading
      port: 4001
      targetPort: 4001
      protocol: TCP
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 3600
=== ./k8s/vector/qdrant.yaml ===
# k8s/vec/qdrant/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: qdrant-data, namespace: db }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 20Gi } }
---
# k8s/vec/qdrant/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: qdrant, namespace: db }
spec:
  replicas: 1
  selector: { matchLabels: { app: qdrant } }
  template:
    metadata: { labels: { app: qdrant } }
    spec:
      containers:
        - name: qdrant
          image: qdrant/qdrant:latest
          ports:
            - { containerPort: 6333 }  # HTTP + Web UI
            - { containerPort: 6334 }  # gRPC
          volumeMounts:
            - { name: data, mountPath: /qdrant/storage }
      volumes:
        - name: data
          persistentVolumeClaim: { claimName: qdrant-data }
---
apiVersion: v1
kind: Service
metadata: { name: qdrant, namespace: db }
spec:
  selector: { app: qdrant }
  ports:
    - { name: http, port: 80, targetPort: 6333 }
    - { name: grpc, port: 6334, targetPort: 6334 }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: qdrant
  namespace: db
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["vector.betelgeusebytes.io"], secretName: qdrant-tls }]
  rules:
    - host: vector.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: qdrant, port: { number: 80 } } }
---
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-qdrant
spec:
  capacity:
    storage: 20Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/qdrant
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
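# Quick check of the Qdrant deployment above via a port-forward (the "demo" collection and
# vector size below are placeholders, not something the manifests create).
#   kubectl -n db port-forward svc/qdrant 6333:80
#   curl -s http://127.0.0.1:6333/collections
#   curl -s -X PUT http://127.0.0.1:6333/collections/demo \
#     -H 'Content-Type: application/json' \
#     -d '{"vectors": {"size": 384, "distance": "Cosine"}}'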
=== ./k8s/vllm/vllm.yaml ===
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-vllm
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/vllm
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
# k8s/ai/vllm/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: vllm-auth, namespace: ml }
type: Opaque
stringData: { API_KEY: "replace_me" }
---
# k8s/ai/ollama/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: ollama, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: ollama } }
  template:
    metadata: { labels: { app: ollama } }
    spec:
      securityContext:
        runAsUser: 0  # needed so the init can write into /root/.ollama
      initContainers:
        - name: warm-models
          image: ollama/ollama:latest
          command: ["/bin/sh", "-c"]
          args:
            - |
              ollama serve &   # start a temp daemon
              sleep 2
              # pull one or more small, quantized models for CPU
              ollama pull qwen2.5:3b-instruct-q4_K_M || true
              ollama pull llama3.2:3b-instruct-q4_K_M || true
              pkill ollama || true
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
      containers:
        - name: ollama
          image: ollama/ollama:latest
          env:
            - { name: OLLAMA_ORIGINS, value: "*" }  # CORS if you call from a browser
          ports:
            - { containerPort: 11434 }
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
          resources:
            requests: { cpu: "2", memory: "4Gi" }
            limits: { cpu: "4", memory: "8Gi" }
      volumes:
        - name: data
          persistentVolumeClaim: { claimName: ollama-data }
---
# k8s/ai/ollama/svc-ing.yaml
apiVersion: v1
kind: Service
metadata: { name: ollama, namespace: ml }
spec:
  selector: { app: ollama }
  ports: [ { name: http, port: 80, targetPort: 11434 } ]
# ---
# # old k8s/ai/vllm/deploy.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata: { name: vllm, namespace: ml }
# spec:
#   replicas: 1
#   selector: { matchLabels: { app: vllm } }
#   template:
#     metadata: { labels: { app: vllm } }
#     spec:
#       containers:
#         - name: vllm
#           image: vllm/vllm-openai:latest
#           args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"]
#           env:
#             - name: VLLM_API_KEY
#               valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } }
#           ports: [{ containerPort: 8000 }]
#           resources:
#             limits:
#               nvidia.com/gpu: 1
#             requests:
#               nvidia.com/gpu: 1
#           volumeMounts:
#             - { name: cache, mountPath: /root/.cache/huggingface }
#       volumes:
#         - name: cache
#           persistentVolumeClaim: { claimName: vllm-cache-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: ollama-data, namespace: ml }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 50Gi } }
# ---
# # old k8s/ai/vllm/svc-ing.yaml
# apiVersion: v1
# kind: Service
# metadata: { name: vllm, namespace: ml }
# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }]
  rules:
    - host: llm.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            # The vllm Service above is commented out, so route to the ollama Service for now.
            # Switch back to { name: vllm, port: { number: 80 } } when vLLM is re-enabled.
            backend: { service: { name: ollama, port: { number: 80 } } }
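# A minimal way to exercise the Ollama service above once the init container has pulled a
# model (sketch; the model tag matches the warm-models list, and stream=false returns a
# single JSON body instead of a stream).
#   kubectl -n ml port-forward svc/ollama 11434:80
#   curl -s http://127.0.0.1:11434/api/tags
#   curl -s http://127.0.0.1:11434/api/generate \
#     -d '{"model": "qwen2.5:3b-instruct-q4_K_M", "prompt": "ping", "stream": false}'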