=== ./ansible/inventories/prod/group_vars/all.yml ===
cluster_name: prod
k8s_version: "v1.30.3"
control_plane_endpoint: "95.217.89.53:6443" # switch later to cp.k8s.betelgeusebytes.io:6443
pod_cidr: "10.244.0.0/16"
service_cidr: "10.96.0.0/12"
cilium_version: "1.15.7"
local_path_dir: "/srv/k8s"
local_sc_name: "local-ssd-hetzner"
stateful_node_label_key: "node"
stateful_node_label_val: "hetzner-2"
=== ./ansible/inventories/prod/hosts.ini ===
[k8s_control_plane]
hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11
[k8s_workers]
hetzner-1 ansible_host=95.217.89.53 public_ip=95.217.89.53 wg_address=10.66.0.11
hetzner-2 ansible_host=138.201.254.97 public_ip=138.201.254.97 wg_address=10.66.0.12
[k8s_nodes:children]
k8s_control_plane
k8s_workers
# add tiny VPS control-planes here when ready
[new_control_planes]
# cp-a ansible_host=<VPS1_IP> public_ip=<VPS1_IP> wg_address=10.66.0.10
[all:vars]
ansible_user=root
ansible_password=3Lcd0504
ansible_become=true
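
Typical invocation against this inventory (example; assumes the playbooks are run from the repo root):
  ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/site.yml
  ansible-playbook -i ansible/inventories/prod/hosts.ini ansible/playbooks/add-control-planes.yml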
=== ./ansible/playbooks/add-control-planes.yml ===
- hosts: k8s_control_plane[0]
  become: yes
  roles:
    - kubeadm_cp_discovery
- hosts: new_control_planes
  become: yes
  roles:
    - common
    - wireguard
    - containerd
    - kubernetes
- hosts: new_control_planes
  become: yes
  roles:
    - kubeadm_join_cp
  vars:
    kubeadm_cp_join_cmd: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_cp_join_cmd | default(kubeadm_cp_join_cmd) }}"
=== ./ansible/playbooks/site.yml ===
- hosts: k8s_nodes
  become: yes
  # serial: 1
  roles:
    # - ../roles/common
    # - ../roles/wireguard
    # - ../roles/containerd
    # - ../roles/kubernetes
- hosts: k8s_control_plane
  become: yes
  roles:
    - ../roles/kubeadm_init
# - hosts: k8s_workers
#   become: yes
#   roles:
#     - ../roles/kubeadm_join
- hosts: k8s_control_plane
  become: yes
  roles:
    # - ../roles/cilium
    # - ../roles/ingress
    # - ../roles/cert_manager
- hosts: k8s_nodes
  become: yes
  roles:
    # - ../roles/storage_local_path
    - ../roles/labels
=== ./ansible/roles/cert_manager/tasks/main.yml ===
- name: Install cert-manager
  shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
- name: Wait for cert-manager pods to be ready
  shell: kubectl wait --for=condition=ready --timeout=300s pod -l app.kubernetes.io/instance=cert-manager -n cert-manager
- name: Wait for webhook endpoint to be ready
  shell: |
    for i in {1..30}; do
      if kubectl get endpoints cert-manager-webhook -n cert-manager -o jsonpath='{.subsets[*].addresses[*].ip}' | grep -q .; then
        echo "Webhook endpoint is ready"
        exit 0
      fi
      echo "Waiting for webhook endpoint... attempt $i/30"
      sleep 2
    done
    exit 1
- name: Test webhook connectivity
  shell: kubectl run test-webhook --image=curlimages/curl:latest --rm -i --restart=Never -- curl -k https://cert-manager-webhook.cert-manager.svc:443/healthz
  register: webhook_test
  ignore_errors: yes
- name: Display webhook test result
  debug:
    var: webhook_test
- name: ClusterIssuer
  copy:
    dest: /root/cluster-issuer-prod.yaml
    content: |
      apiVersion: cert-manager.io/v1
      kind: ClusterIssuer
      metadata:
        name: letsencrypt-prod
      spec:
        acme:
          email: admin@betelgeusebytes.io
          server: https://acme-v02.api.letsencrypt.org/directory
          privateKeySecretRef:
            name: letsencrypt-prod-key
          solvers:
            - http01:
                ingress:
                  class: nginx
- name: Temporarily disable cert-manager webhook
  shell: |
    kubectl delete validatingwebhookconfiguration cert-manager-webhook || true
  ignore_errors: yes
- name: Apply ClusterIssuer
  command: kubectl apply -f /root/cluster-issuer-prod.yaml
- name: Reinstall cert-manager to restore webhook
  shell: kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
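
Sanity check after this role runs (example; assumes an admin kubeconfig on the control plane):
  kubectl -n cert-manager get pods
  kubectl get clusterissuer letsencrypt-prod   # READY should be True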
=== ./ansible/roles/cilium/tasks/main.yml ===
- name: Install cilium CLI
  shell: |
    curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz
    tar xzf cilium-linux-amd64.tar.gz -C /usr/local/bin
  args: { creates: /usr/local/bin/cilium }
- name: Deploy cilium
  shell: |
    cilium install --version {{ cilium_version }} --set kubeProxyReplacement=strict --set bpf.masquerade=true
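
Post-install verification (example; uses the cilium CLI installed above):
  cilium status --wait
  cilium connectivity test   # optional; takes several minutes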
=== ./ansible/roles/common/tasks/main.yml ===
- name: Disable swap
  command: swapoff -a
  when: ansible_swaptotal_mb|int > 0
- name: Ensure swap disabled on boot
  replace:
    path: /etc/fstab
    regexp: '^([^#].*\sswap\s)'
    replace: '# \1'
- name: Kernel modules
  copy:
    dest: /etc/modules-load.d/containerd.conf
    content: |
      overlay
      br_netfilter
- name: Load modules
  command: modprobe {{ item }}
  loop: [overlay, br_netfilter]
- name: Sysctl for k8s
  copy:
    dest: /etc/sysctl.d/99-kubernetes.conf
    content: |
      net.bridge.bridge-nf-call-iptables = 1
      net.bridge.bridge-nf-call-ip6tables = 1
      net.ipv4.ip_forward = 1
      vm.max_map_count = 262144
- name: Apply sysctl
  command: sysctl --system
=== ./ansible/roles/containerd/tasks/main.yml ===
- name: Install containerd
  apt:
    name: containerd
    state: present
    update_cache: yes
- name: Ensure containerd config directory
  file:
    path: /etc/containerd
    state: directory
    mode: '0755'
- name: Generate default config
  shell: containerd config default > /etc/containerd/config.toml
  args: { creates: /etc/containerd/config.toml }
- name: Ensure SystemdCgroup=true
  replace:
    path: /etc/containerd/config.toml
    regexp: 'SystemdCgroup = false'
    replace: 'SystemdCgroup = true'
- name: Restart containerd
  service:
    name: containerd
    state: restarted
    enabled: yes
=== ./ansible/roles/ingress/tasks/main.yml ===
- name: Deploy ingress-nginx (baremetal)
  shell: kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/baremetal/deploy.yaml
=== ./ansible/roles/kubeadm_cp_discovery/tasks/main.yml ===
- name: Upload certs and get certificate key
  shell: kubeadm init phase upload-certs --upload-certs | tail -n 1
  register: cert_key
- name: Compute CA cert hash
  shell: |
    openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | awk '{print $2}'
  register: ca_hash
- name: Create short-lived token
  shell: kubeadm token create --ttl 30m
  register: join_token
- name: Determine control-plane endpoint
  set_fact:
    cp_endpoint: "{{ hostvars[inventory_hostname].control_plane_endpoint | default(ansible_host ~ ':6443') }}"
- set_fact:
    kubeadm_cp_join_cmd: >-
      kubeadm join {{ cp_endpoint }}
      --token {{ join_token.stdout }}
      --discovery-token-ca-cert-hash sha256:{{ ca_hash.stdout }}
      --control-plane
      --certificate-key {{ cert_key.stdout }}
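
For reference, the assembled kubeadm_cp_join_cmd fact takes this shape (illustrative placeholders, not real credentials):
  kubeadm join 95.217.89.53:6443 --token <token> \
    --discovery-token-ca-cert-hash sha256:<64-hex-digest> \
    --control-plane --certificate-key <64-hex-key>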
=== ./ansible/roles/kubeadm_init/tasks/main.yml ===
# - name: Write kubeadm config
#   template:
#     src: kubeadm-config.yaml.j2
#     dest: /etc/kubernetes/kubeadm-config.yaml
# - name: Pre-pull images
#   command: kubeadm config images pull
# - name: Init control-plane
#   command: kubeadm init --config=/etc/kubernetes/kubeadm-config.yaml
#   args: { creates: /etc/kubernetes/admin.conf }
# - name: Setup kubeconfig
#   shell: |
#     mkdir -p $HOME/.kube
#     cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
#     chown $(id -u):$(id -g) $HOME/.kube/config
- name: Save join command
  shell: kubeadm token create --print-join-command
  register: join_cmd
- set_fact:
    kubeadm_join_command_all: "{{ join_cmd.stdout }}"
=== ./ansible/roles/kubeadm_join/tasks/main.yml ===
- name: Join node to cluster
  command: "{{ hostvars[groups['k8s_control_plane'][0]].kubeadm_join_command_all }} --ignore-preflight-errors=FileAvailable--etc-kubernetes-kubelet.conf,FileAvailable--etc-kubernetes-pki-ca.crt,Port-10250"
=== ./ansible/roles/kubeadm_join_cp/tasks/main.yml ===
- name: Ensure join command provided
  fail:
    msg: "Set kubeadm_cp_join_cmd variable (string)"
  when: kubeadm_cp_join_cmd is not defined
- name: Join node as control-plane
  command: "{{ kubeadm_cp_join_cmd }}"
  args:
    creates: /etc/kubernetes/kubelet.conf
=== ./ansible/roles/kubernetes/tasks/main.yml ===
- name: Ensure apt keyrings directory
  file:
    path: /etc/apt/keyrings
    state: directory
    mode: '0755'
- name: Install Kubernetes apt key
  shell: curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
  args: { creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg }
- name: Add Kubernetes repo
  apt_repository:
    repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /"
    state: present
- name: Install kubeadm, kubelet, kubectl
  apt:
    name: [kubeadm, kubelet, kubectl]
    state: present
    update_cache: yes
- name: Hold kube packages
  command: apt-mark hold kubeadm kubelet kubectl
=== ./ansible/roles/labels/tasks/main.yml ===
- name: Label hetzner-2 for stateful
  command: kubectl label node hetzner-2 {{ stateful_node_label_key }}={{ stateful_node_label_val }} --overwrite
  delegate_to: "{{ groups['k8s_control_plane'][0] }}"
  run_once: true
=== ./ansible/roles/storage_local_path/tasks/main.yml ===
- name: Ensure local path dir
  file:
    path: "{{ local_path_dir }}"
    state: directory
    mode: '0777'
- name: StorageClass local-ssd-hetzner
  copy:
    dest: /root/local-sc.yaml
    content: |
      apiVersion: storage.k8s.io/v1
      kind: StorageClass
      metadata:
        name: {{ local_sc_name }}
      provisioner: kubernetes.io/no-provisioner
      volumeBindingMode: WaitForFirstConsumer
  when: inventory_hostname in groups['k8s_control_plane']
- name: Apply SC
  command: kubectl apply -f /root/local-sc.yaml
  environment:
    KUBECONFIG: /etc/kubernetes/admin.conf
  when: inventory_hostname in groups['k8s_control_plane']
- name: Create local-path directory
  file:
    path: /mnt/local-ssd
    state: directory
    mode: '0755'
- name: Create subdirectories for each PV
  file:
    path: "/mnt/local-ssd/{{ item }}"
    state: directory
    mode: '0755'
  loop:
    - postgres
    - prometheus
    - elasticsearch
    - grafana
- name: Copy PV manifest
  template:
    src: local-ssd-pv.yaml
    dest: /tmp/local-ssd-pv.yaml
- name: Apply PV
  command: kubectl apply -f /tmp/local-ssd-pv.yaml
  run_once: true
  delegate_to: "{{ groups['k8s_control_plane'][0] }}"
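
Verify the storage objects afterwards (example; run on a control-plane node):
  KUBECONFIG=/etc/kubernetes/admin.conf kubectl get sc local-ssd-hetzner
  KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pv -o wide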
=== ./ansible/roles/storage_local_path/templates/local-ssd-pv.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
  name: local-ssd-postgres
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/postgres
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: local-ssd-prometheus
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/prometheus
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: local-ssd-elasticsearch
spec:
  capacity:
    storage: 300Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/elasticsearch
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
=== ./ansible/roles/wireguard/tasks/main.yml ===
- name: Install wireguard
  apt:
    name: [wireguard, qrencode]
    state: present
    update_cache: yes
- name: Ensure key dir
  file: { path: /etc/wireguard/keys, state: directory, mode: '0700' }
- name: Generate private key if missing
  shell: "[ -f /etc/wireguard/keys/privatekey ] || (umask 077 && wg genkey > /etc/wireguard/keys/privatekey)"
  args: { creates: /etc/wireguard/keys/privatekey }
- name: Generate public key
  shell: "wg pubkey < /etc/wireguard/keys/privatekey > /etc/wireguard/keys/publickey"
  args: { creates: /etc/wireguard/keys/publickey }
- name: Read pubkey
  slurp: { src: /etc/wireguard/keys/publickey }
  register: pubkey_raw
- name: Read private key
  slurp: { src: /etc/wireguard/keys/privatekey }
  register: privkey_raw
- set_fact:
    wg_public_key: "{{ pubkey_raw.content | b64decode | trim }}"
    wg_private_key: "{{ privkey_raw.content | b64decode | trim }}"
- name: Gather facts from all hosts
  setup:
  delegate_to: "{{ item }}"
  delegate_facts: true
  loop: "{{ groups['k8s_nodes'] }}"
  run_once: true
- name: Pretty print hostvars
  debug:
    msg: "{{ hostvars['hetzner-1']['wg_public_key'] }}"
- name: Render config
  template:
    src: wg0.conf.j2
    dest: /etc/wireguard/wg0.conf
    mode: '0600'
- name: Enable IP forward
  sysctl:
    name: net.ipv4.ip_forward
    value: "1"
    sysctl_set: yes
    state: present
    reload: yes
- name: Enable wg-quick
  service:
    name: wg-quick@wg0
    enabled: yes
    state: started
- name: Capture wg status
  command: wg show
  register: wg_show
  changed_when: false
- debug:
    var: wg_show.stdout
=== ./ansible/roles/wireguard/vars/main.yml ===
wg_interface: wg0
wg_port: 51820
wg_cidr: 10.66.0.0/24
wg_nodes:
  hetzner-1: { address: 10.66.0.11, public_ip: "95.217.89.53" }
  hetzner-2: { address: 10.66.0.12, public_ip: "138.201.254.97" }
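
The wg0.conf.j2 template itself is not included in this dump; given wg_nodes above, the rendered /etc/wireguard/wg0.conf on hetzner-1 would plausibly look like this (a sketch under that assumption; keys are placeholders):
  [Interface]
  Address = 10.66.0.11/24
  ListenPort = 51820
  PrivateKey = <hetzner-1 private key>

  [Peer]
  PublicKey = <hetzner-2 public key>
  Endpoint = 138.201.254.97:51820
  AllowedIPs = 10.66.0.12/32
  PersistentKeepalive = 25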
=== ./DNS_RECORDS.txt ===
apps.betelgeusebytes.io. 300 IN A 95.217.89.53
apps.betelgeusebytes.io. 300 IN A 138.201.254.97
gitea.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
kibana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
grafana.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
prometheus.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
notebook.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
broker.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
neo4j.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
otlp.betelgeusebytes.io. 300 IN CNAME apps.betelgeusebytes.io.
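
Spot-check resolution once the records propagate (example):
  dig +short apps.betelgeusebytes.io      # expect both node IPs
  dig +short gitea.betelgeusebytes.io     # CNAME resolving to the same IPs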
=== ./k8s/00-namespaces.yaml ===
apiVersion: v1
kind: Namespace
metadata: { name: db }
---
apiVersion: v1
kind: Namespace
metadata: { name: scm }
---
apiVersion: v1
kind: Namespace
metadata: { name: ml }
---
apiVersion: v1
kind: Namespace
metadata: { name: monitoring }
---
apiVersion: v1
kind: Namespace
metadata: { name: elastic }
---
apiVersion: v1
kind: Namespace
metadata: { name: broker }
---
apiVersion: v1
kind: Namespace
metadata: { name: graph }
---
apiVersion: v1
kind: Namespace
metadata: { name: observability }
=== ./k8s/01-secrets/basic-auth.yaml ===
# Replace each 'auth' line with a real htpasswd pair:
# htpasswd -nbBC 10 admin 'Str0ngP@ss' (copy 'admin:...' to value below)
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-kibana, namespace: elastic }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-grafana, namespace: monitoring }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-prometheus, namespace: monitoring }
type: Opaque
stringData: { auth: "aadmin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-notebook, namespace: ml }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-broker, namespace: broker }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
---
apiVersion: v1
kind: Secret
metadata: { name: basic-auth-neo4j, namespace: graph }
type: Opaque
stringData: { auth: "admin:$2y$10$MBLgALyI7xwFrQh2PHqZruX.EzaTUGagmJODwpBEvF27snFAxCBvq" }
=== ./k8s/argoflow/argo.yaml ===
apiVersion: v1
kind: Secret
metadata:
  name: argo-artifacts
  namespace: ml
type: Opaque
stringData:
  accesskey: "minioadmin" # <-- change
  secretkey: "minioadmin" # <-- change
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: workflow-controller-configmap
  namespace: ml
data:
  config: |
    artifactRepository:
      s3:
        bucket: argo-artifacts
        endpoint: minio.betelgeusebytes.io # no scheme here
        insecure: false # https via Ingress
        accessKeySecret:
          name: argo-artifacts
          key: accesskey
        secretKeySecret:
          name: argo-artifacts
          key: secretkey
        keyFormat: "{{workflow.namespace}}/{{workflow.name}}/{{pod.name}}"
---
# k8s/argo/workflows/ns-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: argo-server
  namespace: ml
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: argo-namespaced
  namespace: ml
rules:
  - apiGroups: [""]
    resources: ["pods","pods/log","secrets","configmaps","events","persistentvolumeclaims","serviceaccounts"]
    verbs: ["get","list","watch","create","delete","patch","update"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["get","list","watch","create","delete","patch","update"]
  - apiGroups: ["argoproj.io"]
    resources: ["workflows","workflowtemplates","cronworkflows","workfloweventbindings","sensors","eventsources","workflowtasksets","workflowartifactgctasks","workflowtaskresults"]
    verbs: ["get","list","watch","create","delete","patch","update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: argo-namespaced-binding
  namespace: ml
subjects:
  - kind: ServiceAccount
    name: argo-server
    namespace: ml
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: argo-namespaced
---
# k8s/argo/workflows/controller.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: workflow-controller, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: workflow-controller } }
  template:
    metadata: { labels: { app: workflow-controller } }
    spec:
      serviceAccountName: argo-server
      containers:
        - name: controller
          image: quay.io/argoproj/workflow-controller:latest
          args: ["--namespaced"]
          env:
            - name: LEADER_ELECTION_IDENTITY
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          ports: [{ containerPort: 9090 }]
          readinessProbe:
            httpGet: { path: /metrics, port: 9090, scheme: HTTPS }
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet: { path: /metrics, port: 9090, scheme: HTTPS }
            initialDelaySeconds: 20
            periodSeconds: 20
---
# k8s/argo/workflows/server.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: argo-server, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: argo-server } }
  template:
    metadata: { labels: { app: argo-server } }
    spec:
      serviceAccountName: argo-server
      containers:
        - name: server
          image: quay.io/argoproj/argocli:latest
          args: ["server","--auth-mode","server","--namespaced","--secure=false"]
          ports: [{ containerPort: 2746 }]
          readinessProbe:
            httpGet: { path: /, port: 2746, scheme: HTTP }
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet: { path: /, port: 2746, scheme: HTTP }
            initialDelaySeconds: 20
            periodSeconds: 20
---
apiVersion: v1
kind: Service
metadata: { name: argo-server, namespace: ml }
spec: { selector: { app: argo-server }, ports: [ { port: 80, targetPort: 2746 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: argo
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["argo.betelgeusebytes.io"], secretName: argo-tls }]
  rules:
    - host: argo.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: argo-server, port: { number: 80 } } }
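
Smoke test for the namespaced install (example; assumes the argo CLI and the upstream hello-world example):
  argo submit -n ml --watch https://raw.githubusercontent.com/argoproj/argo-workflows/main/examples/hello-world.yaml
  argo list -n ml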
=== ./k8s/automation/n8n.yaml ===
apiVersion: v1
kind: Namespace
metadata:
  name: automation
  labels:
    name: automation
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: n8n-pv
  labels:
    app: n8n
spec:
  capacity:
    storage: 20Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd
  local:
    path: /mnt/local-ssd/n8n
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: n8n-data
  namespace: automation
  labels:
    app: n8n
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-ssd
  resources:
    requests:
      storage: 20Gi
  selector:
    matchLabels:
      app: n8n
---
apiVersion: v1
kind: Secret
metadata:
  name: n8n-secrets
  namespace: automation
type: Opaque
stringData:
  # Generate a strong encryption key with: openssl rand -base64 32
  N8N_ENCRYPTION_KEY: "G/US0ePajEpWwRUjlchyOs6+6I/AT+0bisXmE2fugSU="
  # Optional: Database connection if using PostgreSQL
  DB_TYPE: "postgresdb"
  DB_POSTGRESDB_HOST: "pg.betelgeusebytes.io"
  DB_POSTGRESDB_PORT: "5432"
  DB_POSTGRESDB_DATABASE: "n8n"
  DB_POSTGRESDB_USER: "app"
  DB_POSTGRESDB_PASSWORD: "pa$$word"
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: n8n
  namespace: automation
spec:
  serviceName: n8n
  replicas: 1
  selector:
    matchLabels:
      app: n8n
  template:
    metadata:
      labels:
        app: n8n
    spec:
      nodeSelector:
        kubernetes.io/hostname: hetzner-2
      containers:
        - name: n8n
          image: n8nio/n8n:latest
          ports:
            - containerPort: 5678
              name: http
          env:
            - name: N8N_HOST
              value: "n8n.betelgeusebytes.io"
            - name: N8N_PORT
              value: "5678"
            - name: N8N_PROTOCOL
              value: "https"
            - name: WEBHOOK_URL
              value: "https://n8n.betelgeusebytes.io/"
            - name: GENERIC_TIMEZONE
              value: "UTC"
            - name: N8N_ENCRYPTION_KEY
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: N8N_ENCRYPTION_KEY
            # Uncomment if using PostgreSQL
            - name: DB_TYPE
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: DB_TYPE
            - name: DB_POSTGRESDB_HOST
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: DB_POSTGRESDB_HOST
            - name: DB_POSTGRESDB_PORT
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: DB_POSTGRESDB_PORT
            - name: DB_POSTGRESDB_DATABASE
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: DB_POSTGRESDB_DATABASE
            - name: DB_POSTGRESDB_USER
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: DB_POSTGRESDB_USER
            - name: DB_POSTGRESDB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: n8n-secrets
                  key: DB_POSTGRESDB_PASSWORD
          volumeMounts:
            - name: n8n-data
              mountPath: /home/node/.n8n
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /healthz
              port: 5678
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /healthz
              port: 5678
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
      volumes:
        - name: n8n-data
          persistentVolumeClaim:
            claimName: n8n-data
---
apiVersion: v1
kind: Service
metadata:
  name: n8n
  namespace: automation
  labels:
    app: n8n
spec:
  type: ClusterIP
  ports:
    - port: 5678
      targetPort: 5678
      protocol: TCP
      name: http
  selector:
    app: n8n
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: n8n
  namespace: automation
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # nginx.ingress.kubernetes.io/proxy-body-size: "50m"
    # nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
    # nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
    # Uncomment below if you want basic auth protection in addition to n8n's auth
    # nginx.ingress.kubernetes.io/auth-type: basic
    # nginx.ingress.kubernetes.io/auth-secret: n8n-basic-auth
    # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - n8n.betelgeusebytes.io
      secretName: wildcard-betelgeusebytes-tls
  rules:
    - host: n8n.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: n8n
                port:
                  number: 5678
=== ./k8s/cert-manager/cluster-issuer.yaml ===
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata: { name: letsencrypt-prod }
spec:
  acme:
    email: angal.salah@gmail.com
    server: https://acme-v02.api.letsencrypt.org/directory
    privateKeySecretRef: { name: letsencrypt-prod-key }
    solvers:
      - http01: { ingress: { class: nginx } }
=== ./k8s/elastic/elastic-pv.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-elasticsearch
spec:
  capacity:
    storage: 80Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/elasticsearch
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
=== ./k8s/elastic/elasticsearch.yaml ===
apiVersion: v1
kind: Service
metadata: { name: elasticsearch, namespace: elastic }
spec:
  ports:
    - { name: http, port: 9200, targetPort: 9200 }
    - { name: transport, port: 9300, targetPort: 9300 }
  selector: { app: elasticsearch }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: elasticsearch, namespace: elastic }
spec:
  serviceName: elasticsearch
  replicas: 1
  selector: { matchLabels: { app: elasticsearch } }
  template:
    metadata: { labels: { app: elasticsearch } }
    spec:
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: es
          image: docker.elastic.co/elasticsearch/elasticsearch:8.14.0
          env:
            - { name: discovery.type, value: single-node }
            - { name: xpack.security.enabled, value: "false" }
            - { name: ES_JAVA_OPTS, value: "-Xms2g -Xmx2g" }
          ports:
            - { containerPort: 9200 }
            - { containerPort: 9300 }
          volumeMounts:
            - { name: data, mountPath: /usr/share/elasticsearch/data }
  volumeClaimTemplates:
    - metadata: { name: data }
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: local-ssd-hetzner
        resources: { requests: { storage: 80Gi } }
=== ./k8s/elastic/kibana.yaml ===
apiVersion: v1
kind: Service
metadata: { name: kibana, namespace: elastic }
spec:
  ports: [{ port: 5601, targetPort: 5601 }]
  selector: { app: kibana }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: kibana, namespace: elastic }
spec:
  replicas: 1
  selector: { matchLabels: { app: kibana } }
  template:
    metadata: { labels: { app: kibana } }
    spec:
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: kibana
          image: docker.elastic.co/kibana/kibana:8.14.0
          env:
            - { name: ELASTICSEARCH_HOSTS, value: "http://elasticsearch.elastic.svc.cluster.local:9200" }
          ports: [{ containerPort: 5601 }]
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: kibana
  namespace: elastic
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # nginx.ingress.kubernetes.io/auth-type: basic
    # nginx.ingress.kubernetes.io/auth-secret: basic-auth-kibana
    # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["kibana.betelgeusebytes.io"], secretName: kibana-tls }]
  rules:
    - host: kibana.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: kibana, port: { number: 5601 } } }
=== ./k8s/gitea/gitea-pv.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-gitea
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/gitea
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
=== ./k8s/gitea/gitea.yaml ===
apiVersion: v1
kind: Service
metadata: { name: gitea, namespace: scm }
spec:
  ports: [{ port: 80, targetPort: 3000 }]
  selector: { app: gitea }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: gitea, namespace: scm }
spec:
  serviceName: gitea
  replicas: 1
  selector: { matchLabels: { app: gitea } }
  template:
    metadata: { labels: { app: gitea } }
    spec:
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: gitea
          image: gitea/gitea:1.21.11
          env:
            - { name: GITEA__server__ROOT_URL, value: "https://gitea.betelgeusebytes.io" }
            - { name: GITEA__database__DB_TYPE, value: "postgres" }
            - { name: GITEA__database__HOST, value: "postgres.db.svc.cluster.local:5432" }
            - { name: GITEA__database__NAME, value: "gitea" }
            - { name: GITEA__database__USER, value: "app" }
            - { name: GITEA__database__PASSWD, value: "pa$$word" }
          ports: [{ containerPort: 3000 }]
          volumeMounts:
            - { name: data, mountPath: /data }
  volumeClaimTemplates:
    - metadata: { name: data }
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: local-ssd-hetzner
        resources: { requests: { storage: 50Gi } }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gitea
  namespace: scm
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["gitea.betelgeusebytes.io"], secretName: gitea-tls }]
  rules:
    - host: gitea.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: gitea, port: { number: 80 } } }
=== ./k8s/grafana/grafana.yaml ===
apiVersion: v1
kind: Service
metadata: { name: grafana, namespace: monitoring }
spec:
  ports: [{ port: 80, targetPort: 3000 }]
  selector: { app: grafana }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: grafana, namespace: monitoring }
spec:
  replicas: 1
  selector: { matchLabels: { app: grafana } }
  template:
    metadata: { labels: { app: grafana } }
    spec:
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: grafana
          image: grafana/grafana:10.4.3
          env:
            - { name: GF_SECURITY_ADMIN_USER, value: admin }
            - { name: GF_SECURITY_ADMIN_PASSWORD, value: "ADMINclaude-GRAFANA" }
          ports: [{ containerPort: 3000 }]
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: monitoring
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/auth-type: basic
    nginx.ingress.kubernetes.io/auth-secret: basic-auth-grafana
    nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["grafana.betelgeusebytes.io"], secretName: grafana-tls }]
  rules:
    - host: grafana.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: grafana, port: { number: 80 } } }
=== ./k8s/ingress-patch/kustomization.yaml ===
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ingress-nginx
# Create the tcp-services ConfigMap from *quoted* literals
configMapGenerator:
  - name: tcp-services
    literals:
      - "5432=db/postgres:5432"
      - "7687=graph/neo4j:7687"
generatorOptions:
  disableNameSuffixHash: true
# Inline JSON6902 patches
patches:
  # 1) Add controller arg for tcp-services
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: ingress-nginx-controller
      namespace: ingress-nginx
    patch: |-
      - op: add
        path: /spec/template/spec/containers/0/args/-
        value: --tcp-services-configmap=$(POD_NAMESPACE)/tcp-services
  # 2) Expose Service ports 5432 and 7687 (keeps 80/443)
  - target:
      version: v1
      kind: Service
      name: ingress-nginx-controller
      namespace: ingress-nginx
    patch: |-
      - op: add
        path: /spec/ports/-
        value:
          name: tcp-5432
          port: 5432
          protocol: TCP
          targetPort: 5432
      - op: add
        path: /spec/ports/-
        value:
          name: tcp-7687
          port: 7687
          protocol: TCP
          targetPort: 7687
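
This overlay is applied with kustomize (example; note that JSON6902 patches only touch resources present in the build, so the upstream ingress-nginx manifest is assumed to be listed under resources: or patched separately):
  kubectl apply -k k8s/ingress-patch/
  kubectl -n ingress-nginx get svc ingress-nginx-controller   # confirm 5432 and 7687 are exposed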
=== ./k8s/jupyter/jupyter.yaml ===
apiVersion: v1
kind: Service
metadata: { name: notebook, namespace: ml }
spec:
  selector: { app: jupyterlab }
  ports: [{ port: 80, targetPort: 8888 }]
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: jupyterlab, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: jupyterlab } }
  template:
    metadata: { labels: { app: jupyterlab } }
    spec:
      securityContext:
        runAsUser: 1000
        fsGroup: 100
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: jupyter
          image: jupyter/base-notebook:latest
          args: ["start-notebook.sh", "--NotebookApp.token=$(PASSWORD)"]
          env:
            - name: PASSWORD
              valueFrom: { secretKeyRef: { name: jupyter-auth, key: PASSWORD } }
          ports: [{ containerPort: 8888 }]
          volumeMounts:
            - { name: work, mountPath: /home/jovyan/work }
      volumes:
        - name: work
          persistentVolumeClaim: { claimName: jupyter-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: jupyter-pvc, namespace: ml }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 20Gi } }
---
apiVersion: v1
kind: Secret
metadata: { name: jupyter-auth, namespace: ml }
type: Opaque
stringData: { PASSWORD: "notebook" }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: notebook
  namespace: ml
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # nginx.ingress.kubernetes.io/auth-type: basic
    # nginx.ingress.kubernetes.io/auth-secret: basic-auth-notebook
    # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["notebook.betelgeusebytes.io"], secretName: notebook-tls }]
  rules:
    - host: notebook.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: notebook, port: { number: 80 } } }
=== ./k8s/kafka/kafka-pv.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-kafka
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/kafka
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-zookeeper-data
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/zookeeper-data
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-zookeeper-log
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/zookeeper-log
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
=== ./k8s/kafka/kafka-ui.yaml ===
apiVersion: v1
kind: Service
metadata: { name: kafka-ui, namespace: broker }
spec:
  ports: [{ port: 80, targetPort: 8080 }]
  selector: { app: kafka-ui }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: kafka-ui, namespace: broker }
spec:
  replicas: 1
  selector: { matchLabels: { app: kafka-ui } }
  template:
    metadata: { labels: { app: kafka-ui } }
    spec:
      containers:
        - name: ui
          image: provectuslabs/kafka-ui:latest
          env:
            - { name: KAFKA_CLUSTERS_0_NAME, value: "local" }
            - { name: KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS, value: "kafka.broker.svc.cluster.local:9092" }
          ports: [{ containerPort: 8080 }]
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: kafka-ui
  namespace: broker
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # nginx.ingress.kubernetes.io/auth-type: basic
    # nginx.ingress.kubernetes.io/auth-secret: basic-auth-broker
    # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["broker.betelgeusebytes.io"], secretName: broker-tls }]
  rules:
    - host: broker.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: kafka-ui, port: { number: 80 } } }
=== ./k8s/kafka/kafka.yaml ===
apiVersion: v1
kind: Service
metadata: { name: kafka, namespace: broker }
spec:
  ports: [{ name: kafka, port: 9092, targetPort: 9092 }]
  selector: { app: kafka }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: kafka, namespace: broker }
spec:
  serviceName: kafka
  replicas: 1
  selector: { matchLabels: { app: kafka } }
  template:
    metadata: { labels: { app: kafka } }
    spec:
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: kafka
          image: apache/kafka:latest
          env:
            - { name: KAFKA_NODE_ID, value: "1" }
            - { name: KAFKA_PROCESS_ROLES, value: "broker,controller" }
            - { name: KAFKA_LISTENERS, value: "PLAINTEXT://:9092,CONTROLLER://:9093" }
            - { name: KAFKA_ADVERTISED_LISTENERS, value: "PLAINTEXT://kafka.broker.svc.cluster.local:9092" }
            - { name: KAFKA_CONTROLLER_LISTENER_NAMES, value: "CONTROLLER" }
            - { name: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP, value: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" }
            - { name: KAFKA_CONTROLLER_QUORUM_VOTERS, value: "1@localhost:9093" }
            - { name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR, value: "1" }
            - { name: KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR, value: "1" }
            - { name: KAFKA_TRANSACTION_STATE_LOG_MIN_ISR, value: "1" }
            - { name: KAFKA_LOG_DIRS, value: "/var/lib/kafka/data" }
            - { name: CLUSTER_ID, value: "MkU3OEVBNTcwNTJENDM2Qk" }
          ports:
            - { containerPort: 9092 }
            - { containerPort: 9093 }
          volumeMounts:
            - { name: data, mountPath: /var/lib/kafka/data }
  volumeClaimTemplates:
    - metadata: { name: data }
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: local-ssd-hetzner
        resources: { requests: { storage: 50Gi } }
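
Broker smoke test (example; the apache/kafka image ships the CLI tools, the /opt/kafka/bin path is assumed):
  kubectl -n broker exec -it kafka-0 -- /opt/kafka/bin/kafka-topics.sh \
    --bootstrap-server localhost:9092 --create --topic smoke-test --partitions 1 --replication-factor 1
  kubectl -n broker exec -it kafka-0 -- /opt/kafka/bin/kafka-topics.sh \
    --bootstrap-server localhost:9092 --list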
=== ./k8s/label_studio/label.yaml ===
# k8s/ai/label-studio/secret-pg.yaml
apiVersion: v1
kind: Secret
metadata: { name: labelstudio-pg, namespace: ml }
type: Opaque
stringData: { POSTGRES_PASSWORD: "admin" }
---
# k8s/ai/label-studio/secret-minio.yaml
apiVersion: v1
kind: Secret
metadata: { name: minio-label, namespace: ml }
type: Opaque
stringData:
  accesskey: "minioadmin"
  secretkey: "minioadmin"
---
# k8s/ai/label-studio/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: label-studio, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: label-studio } }
  template:
    metadata: { labels: { app: label-studio } }
    spec:
      containers:
        - name: app
          image: heartexlabs/label-studio:latest
          env:
            - { name: POSTGRE_NAME, value: "labelstudio" }
            - { name: POSTGRE_USER, value: "admin" }
            - name: POSTGRE_PASSWORD
              valueFrom: { secretKeyRef: { name: labelstudio-pg, key: POSTGRES_PASSWORD } }
            - { name: POSTGRE_HOST, value: "postgres.db.svc.cluster.local" }
            - { name: POSTGRE_PORT, value: "5432" }
            - { name: S3_ENDPOINT, value: "https://minio.betelgeusebytes.io" }
            - name: AWS_ACCESS_KEY_ID
              valueFrom: { secretKeyRef: { name: minio-label, key: accesskey } }
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom: { secretKeyRef: { name: minio-label, key: secretkey } }
            - name: ALLOWED_HOSTS
              value: "label.betelgeusebytes.io"
            - name: CSRF_TRUSTED_ORIGINS
              value: "https://label.betelgeusebytes.io"
            - name: CSRF_COOKIE_SECURE
              value: "1"
            - name: SESSION_COOKIE_SECURE
              value: "1"
          ports: [{ containerPort: 8080 }]
---
apiVersion: v1
kind: Service
metadata: { name: label-studio, namespace: ml }
spec: { selector: { app: label-studio }, ports: [ { port: 80, targetPort: 8080 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: label-studio
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["label.betelgeusebytes.io"], secretName: label-tls }]
  rules:
    - host: label.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: label-studio, port: { number: 80 } } }
=== ./k8s/minio/minio.yaml ===
apiVersion: v1
kind: Namespace
metadata: { name: storage }
---
# k8s/storage/minio/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: minio-root, namespace: storage }
type: Opaque
stringData:
  MINIO_ROOT_USER: "minioadmin"
  MINIO_ROOT_PASSWORD: "minioadmin"
---
# k8s/storage/minio/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: minio-data, namespace: storage }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 20Gi } }
---
# k8s/storage/minio/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: minio, namespace: storage }
spec:
  replicas: 1
  selector: { matchLabels: { app: minio } }
  template:
    metadata: { labels: { app: minio } }
    spec:
      containers:
        - name: minio
          image: minio/minio:latest
          args: ["server","/data","--console-address",":9001"]
          envFrom: [{ secretRef: { name: minio-root } }]
          ports:
            - { containerPort: 9000 } # S3
            - { containerPort: 9001 } # Console
          volumeMounts:
            - { name: data, mountPath: /data }
      volumes:
        - name: data
          persistentVolumeClaim: { claimName: minio-data }
---
apiVersion: v1
kind: Service
metadata: { name: minio, namespace: storage }
spec:
  selector: { app: minio }
  ports:
    - { name: s3, port: 9000, targetPort: 9000 }
    - { name: console, port: 9001, targetPort: 9001 }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: minio
  namespace: storage
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["minio.betelgeusebytes.io"], secretName: minio-tls }]
  rules:
    - host: minio.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: minio, port: { number: 9001 } } }
---
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-minio
spec:
  capacity:
    storage: 20Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/minio
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
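
Client-side smoke test (example; assumes the mc CLI, and note that the Ingress above routes to the console on 9001, so the S3 API on 9000 is reached here via a port-forward):
  kubectl -n storage port-forward svc/minio 9000:9000 &
  mc alias set betelgeuse http://localhost:9000 minioadmin minioadmin
  mc mb betelgeuse/argo-artifacts
  mc ls betelgeuse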
=== ./k8s/mlflow/mlflow.yaml ===
# k8s/mlops/mlflow/secret-pg.yaml
apiVersion: v1
kind: Secret
metadata: { name: mlflow-pg, namespace: ml }
type: Opaque
stringData: { POSTGRES_PASSWORD: "pa$$word" }
---
# k8s/mlops/mlflow/secret-minio.yaml
apiVersion: v1
kind: Secret
metadata: { name: mlflow-minio, namespace: ml }
type: Opaque
stringData:
  accesskey: "minioadmin"
  secretkey: "minioadmin"
---
# k8s/mlops/mlflow/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: mlflow, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: mlflow } }
  template:
    metadata: { labels: { app: mlflow } }
    spec:
      containers:
        - name: mlflow
          # image: ghcr.io/mlflow/mlflow:v3.6.0
          image: axxs/mlflow-pg
          env:
            - { name: MLFLOW_BACKEND_STORE_URI,
                value: "postgresql://admin:admin@postgres.db.svc.cluster.local:5432/mlflow" }
            - { name: POSTGRES_PASSWORD, valueFrom: { secretKeyRef: { name: mlflow-pg, key: POSTGRES_PASSWORD } } }
            - { name: MLFLOW_S3_ENDPOINT_URL, value: "https://minio.betelgeusebytes.io" }
            - { name: AWS_ACCESS_KEY_ID, valueFrom: { secretKeyRef: { name: mlflow-minio, key: accesskey } } }
            - { name: AWS_SECRET_ACCESS_KEY, valueFrom: { secretKeyRef: { name: mlflow-minio, key: secretkey } } }
          args: ["mlflow","server","--host","0.0.0.0","--port","5000","--artifacts-destination","s3://mlflow", "--allowed-hosts", "*.betelgeusebytes.io"]
          ports: [{ containerPort: 5000 }]
---
apiVersion: v1
kind: Service
metadata: { name: mlflow, namespace: ml }
spec: { selector: { app: mlflow }, ports: [ { port: 80, targetPort: 5000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mlflow
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["mlflow.betelgeusebytes.io"], secretName: mlflow-tls }]
  rules:
    - host: mlflow.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: mlflow, port: { number: 80 } } }
=== ./k8s/neo4j/neo4j-pv.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-neo4j
spec:
  capacity:
    storage: 20Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/neo4j
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
=== ./k8s/neo4j/neo4j.yaml ===
apiVersion: v1
kind: Service
metadata: { name: neo4j, namespace: graph }
spec:
  selector: { app: neo4j }
  ports:
    - { name: http, port: 7474, targetPort: 7474 }
    - { name: bolt, port: 7687, targetPort: 7687 }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: neo4j, namespace: graph }
spec:
  serviceName: neo4j
  replicas: 1
  selector: { matchLabels: { app: neo4j } }
  template:
    metadata: { labels: { app: neo4j } }
    spec:
      enableServiceLinks: false
      nodeSelector: { node: hetzner-2 }
      containers:
        - name: neo4j
          image: neo4j:5.20
          env:
            - name: NEO4J_AUTH
              valueFrom: { secretKeyRef: { name: neo4j-auth, key: NEO4J_AUTH } }
            - name: NEO4J_dbms_ssl_policy_bolt_enabled
              value: "true"
            - name: NEO4J_dbms_ssl_policy_bolt_base__directory
              value: "/certs/bolt"
            - name: NEO4J_dbms_ssl_policy_bolt_private__key
              value: "tls.key"
            - name: NEO4J_dbms_ssl_policy_bolt_public__certificate
              value: "tls.crt"
            - name: NEO4J_dbms_connector_bolt_tls__level
              value: "REQUIRED"
            # Advertise public hostname so the Browser uses the external FQDN for Bolt
            - name: NEO4J_dbms_connector_bolt_advertised__address
              value: "neo4j.betelgeusebytes.io:7687"
            # also set a default advertised address (recommended)
            - name: NEO4J_dbms_default__advertised__address
              value: "neo4j.betelgeusebytes.io"
          ports:
            - { containerPort: 7474 }
            - { containerPort: 7687 }
          volumeMounts:
            - { name: data, mountPath: /data }
            - { name: bolt-certs, mountPath: /certs/bolt }
      volumes:
        - name: bolt-certs
          secret:
            secretName: neo4j-tls
            items:
              - key: tls.crt
                path: tls.crt
              - key: tls.key
                path: tls.key
  volumeClaimTemplates:
    - metadata: { name: data }
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: local-ssd-hetzner
        resources: { requests: { storage: 20Gi } }
---
apiVersion: v1
kind: Secret
metadata: { name: neo4j-auth, namespace: graph }
type: Opaque
stringData: { NEO4J_AUTH: "neo4j/NEO4J-PASS" }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: neo4j-http
  namespace: graph
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # nginx.ingress.kubernetes.io/auth-type: basic
    # nginx.ingress.kubernetes.io/auth-secret: basic-auth-neo4j
    # nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["neo4j.betelgeusebytes.io"], secretName: neo4j-tls }]
  rules:
    - host: neo4j.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: neo4j, port: { number: 7474 } } }
# create or update the tcp-services configmap
# kubectl -n ingress-nginx create configmap tcp-services \
# --from-literal="7687=graph/neo4j:7687" \
# -o yaml --dry-run=client | kubectl apply -f -
# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \
# --type='json' -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}]'
# kubectl -n ingress-nginx patch deployment ingress-nginx-controller \
# --type='json' -p='[
# {"op":"add","path":"/spec/template/spec/containers/0/ports/-","value":{"name":"tcp-7687","containerPort":7687,"hostPort":7687,"protocol":"TCP"}}
# ]'
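
Bolt connectivity check (example; assumes cypher-shell locally and the TCP passthrough from the commands above):
  cypher-shell -a neo4j+s://neo4j.betelgeusebytes.io:7687 -u neo4j -p 'NEO4J-PASS' 'RETURN 1;'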
=== ./k8s/observability/fluent-bit.yaml ===
apiVersion: v1
kind: ServiceAccount
metadata: { name: fluent-bit, namespace: observability }
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata: { name: fluent-bit-read }
rules:
  - apiGroups: [""]
    resources: ["pods", "namespaces"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata: { name: fluent-bit-read }
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: fluent-bit-read
subjects:
  - kind: ServiceAccount
    name: fluent-bit
    namespace: observability
---
apiVersion: apps/v1
kind: DaemonSet
metadata: { name: fluent-bit, namespace: observability }
spec:
  selector: { matchLabels: { app: fluent-bit } }
  template:
    metadata: { labels: { app: fluent-bit } }
    spec:
      serviceAccountName: fluent-bit
      containers:
        - name: fluent-bit
          image: cr.fluentbit.io/fluent/fluent-bit:2.2.2
          volumeMounts:
            - { name: varlog, mountPath: /var/log }
            - { name: containers, mountPath: /var/lib/docker/containers, readOnly: true }
          env:
            - { name: FLUENT_ELASTICSEARCH_HOST, value: elasticsearch.elastic.svc.cluster.local }
            - { name: FLUENT_ELASTICSEARCH_PORT, value: "9200" }
          args: ["-i","tail","-p","path=/var/log/containers/*.log","-F","kubernetes","-o","es","-p","host=${FLUENT_ELASTICSEARCH_HOST}","-p","port=${FLUENT_ELASTICSEARCH_PORT}","-p","logstash_format=On","-p","logstash_prefix=k8s-logs"]
      volumes:
        - { name: varlog, hostPath: { path: /var/log } }
        - { name: containers, hostPath: { path: /var/lib/docker/containers, type: DirectoryOrCreate } }
=== ./k8s/observability-stack/00-namespace.yaml ===
apiVersion: v1
kind: Namespace
metadata:
  name: observability
  labels:
    name: observability
    monitoring: "true"
=== ./k8s/observability-stack/01-persistent-volumes.yaml ===
---
# Prometheus PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-data-pv
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  local:
    path: /mnt/local-ssd/prometheus
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
# Loki PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: loki-data-pv
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  local:
    path: /mnt/local-ssd/loki
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
# Tempo PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: tempo-data-pv
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  local:
    path: /mnt/local-ssd/tempo
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
# Grafana PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: grafana-data-pv
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  local:
    path: /mnt/local-ssd/grafana
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
=== ./k8s/observability-stack/02-persistent-volume-claims.yaml ===
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: observability
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: loki-data
  namespace: observability
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  resources:
    requests:
      storage: 100Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: tempo-data
  namespace: observability
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-data
  namespace: observability
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  resources:
    requests:
      storage: 10Gi
=== ./k8s/observability-stack/03-prometheus-config.yaml ===
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: observability
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      external_labels:
        cluster: 'betelgeuse-k8s'
        environment: 'production'

    # Alerting configuration (optional - can add alertmanager later)
    alerting:
      alertmanagers:
        - static_configs:
            - targets: []

    # Rule files
    rule_files:
      - /etc/prometheus/rules/*.yml

    scrape_configs:
      # Scrape Prometheus itself
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

      # Kubernetes nodes
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics

      # Kubernetes nodes cadvisor
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

      # Kubernetes service endpoints
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # Kubernetes pods
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # kube-state-metrics
      - job_name: 'kube-state-metrics'
        static_configs:
          - targets: ['kube-state-metrics.observability.svc.cluster.local:8080']

      # node-exporter
      - job_name: 'node-exporter'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: node-exporter
          - source_labels: [__meta_kubernetes_pod_node_name]
            action: replace
            target_label: instance

      # Grafana Loki
      - job_name: 'loki'
        static_configs:
          - targets: ['loki.observability.svc.cluster.local:3100']

      # Grafana Tempo
      - job_name: 'tempo'
        static_configs:
          - targets: ['tempo.observability.svc.cluster.local:3200']

      # Grafana
      - job_name: 'grafana'
        static_configs:
          - targets: ['grafana.observability.svc.cluster.local:3000']
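
The rendered config can be validated before a rollout (example; promtool ships with Prometheus):
  kubectl -n observability get configmap prometheus-config -o jsonpath='{.data.prometheus\.yml}' > /tmp/prometheus.yml
  promtool check config /tmp/prometheus.yml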
=== ./k8s/observability-stack/04-loki-config.yaml ===
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-config
  namespace: observability
data:
  loki.yaml: |
    auth_enabled: false

    server:
      http_listen_port: 3100
      grpc_listen_port: 9096
      log_level: info

    common:
      path_prefix: /loki
      storage:
        filesystem:
          chunks_directory: /loki/chunks
          rules_directory: /loki/rules
      replication_factor: 1
      ring:
        kvstore:
          store: inmemory

    schema_config:
      configs:
        - from: 2024-01-01
          store: tsdb
          object_store: filesystem
          schema: v13
          index:
            prefix: index_
            period: 24h

    storage_config:
      tsdb_shipper:
        active_index_directory: /loki/tsdb-index
        cache_location: /loki/tsdb-cache
      filesystem:
        directory: /loki/chunks

    compactor:
      working_directory: /loki/compactor
      compaction_interval: 10m
      retention_enabled: false

    limits_config:
      reject_old_samples: true
      reject_old_samples_max_age: 168h # 7 days
      retention_period: 168h # 7 days
      max_query_length: 721h # 30 days for queries
      max_query_parallelism: 32
      max_streams_per_user: 0
      max_global_streams_per_user: 0
      ingestion_rate_mb: 50
      ingestion_burst_size_mb: 100
      per_stream_rate_limit: 10MB
      per_stream_rate_limit_burst: 20MB
      split_queries_by_interval: 15m

    query_range:
      align_queries_with_step: true
      cache_results: true
      results_cache:
        cache:
          embedded_cache:
            enabled: true
            max_size_mb: 500

    frontend:
      log_queries_longer_than: 5s
      compress_responses: true

    query_scheduler:
      max_outstanding_requests_per_tenant: 2048

    ingester:
      chunk_idle_period: 30m
      chunk_block_size: 262144
      chunk_encoding: snappy
      chunk_retain_period: 1m
      max_chunk_age: 2h
      wal:
        enabled: true
        dir: /loki/wal
        flush_on_shutdown: true
        replay_memory_ceiling: 1GB

    analytics:
      reporting_enabled: false
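
Readiness check (example; run via port-forward, assuming a Service named loki on 3100):
  kubectl -n observability port-forward svc/loki 3100:3100 &
  curl -s http://localhost:3100/ready
  curl -s http://localhost:3100/loki/api/v1/labels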
=== ./k8s/observability-stack/05-tempo-config.yaml ===
apiVersion: v1
kind: ConfigMap
metadata:
name: tempo-config
namespace: observability
data:
tempo.yaml: |
server:
http_listen_port: 3200
log_level: info
distributor:
receivers:
jaeger:
protocols:
thrift_http:
endpoint: 0.0.0.0:14268
grpc:
endpoint: 0.0.0.0:14250
zipkin:
endpoint: 0.0.0.0:9411
otlp:
protocols:
http:
endpoint: 0.0.0.0:4318
grpc:
endpoint: 0.0.0.0:4317
ingester:
max_block_duration: 5m
compactor:
compaction:
block_retention: 168h # 7 days
metrics_generator:
registry:
external_labels:
source: tempo
cluster: betelgeuse-k8s
storage:
path: /var/tempo/generator/wal
remote_write:
- url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write
send_exemplars: true
storage:
trace:
backend: local
wal:
path: /var/tempo/wal
local:
path: /var/tempo/blocks
pool:
max_workers: 100
queue_depth: 10000
# Single instance mode - no need for frontend/querier split
query_frontend:
search:
duration_slo: 5s
throughput_bytes_slo: 1.073741824e+09
trace_by_id:
duration_slo: 5s
overrides:
defaults:
metrics_generator:
processors: [service-graphs, span-metrics]
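# Smoke test (a sketch): Tempo answers /ready once its ring is up, and /api/echo
# replies "echo" from the query path; both are cheap checks of this config:
#   kubectl -n observability run curl --rm -it --restart=Never --image=curlimages/curl:latest -- \
#     sh -c 'curl -s http://tempo.observability.svc.cluster.local:3200/ready; \
#            curl -s http://tempo.observability.svc.cluster.local:3200/api/echo'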
=== ./k8s/observability-stack/06-alloy-config.yaml ===
apiVersion: v1
kind: ConfigMap
metadata:
name: alloy-config
namespace: observability
data:
config.alloy: |
// Logging configuration
logging {
level = "info"
format = "logfmt"
}
// Discover Kubernetes pods for log collection
discovery.kubernetes "pods" {
role = "pod"
}
// Discover Kubernetes nodes
discovery.kubernetes "nodes" {
role = "node"
}
// Relabel pods for log collection
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pods.targets
// Only scrape pods with logs
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
action = "keep"
regex = ".+"
}
// Set the log path
rule {
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
target_label = "__path__"
separator = "/"
replacement = "/var/log/pods/*$1/*.log"
}
// Set namespace label
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
// Set pod name label
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
// Set container name label
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
target_label = "container"
}
// Set node name label
rule {
source_labels = ["__meta_kubernetes_pod_node_name"]
target_label = "node"
}
// Copy all pod labels
rule {
action = "labelmap"
regex = "__meta_kubernetes_pod_label_(.+)"
}
}
// Read logs from discovered pods
loki.source.kubernetes "pod_logs" {
targets = discovery.relabel.pod_logs.output
forward_to = [loki.process.pod_logs.receiver]
}
// Process and enrich logs
loki.process "pod_logs" {
forward_to = [loki.write.local.receiver]
// Parse JSON logs
stage.json {
expressions = {
level = "level",
message = "message",
timestamp = "timestamp",
}
}
// Extract log level
stage.labels {
values = {
level = "",
}
}
// Add cluster label
stage.static_labels {
values = {
cluster = "betelgeuse-k8s",
}
}
}
// Write logs to Loki
loki.write "local" {
endpoint {
url = "http://loki.observability.svc.cluster.local:3100/loki/api/v1/push"
}
}
// OpenTelemetry receiver for traces
otelcol.receiver.otlp "default" {
grpc {
endpoint = "0.0.0.0:4317"
}
http {
endpoint = "0.0.0.0:4318"
}
output {
traces = [otelcol.exporter.otlp.tempo.input]
}
}
// Export traces to Tempo
otelcol.exporter.otlp "tempo" {
client {
endpoint = "tempo.observability.svc.cluster.local:4317"
tls {
insecure = true
}
}
}
// Scrape local metrics (Alloy's own metrics)
// Prometheus will scrape these via service discovery
prometheus.exporter.self "alloy" {
}
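# Note (a sketch, assuming the DaemonSet keeps --server.http.listen-addr=0.0.0.0:12345):
# Alloy serves /-/ready plus a component-graph UI on that port, which is the
# quickest way to confirm this file parsed and every component is healthy:
#   kubectl -n observability port-forward ds/alloy 12345:12345 &
#   curl -s http://127.0.0.1:12345/-/ready   # then browse http://127.0.0.1:12345/ for the UI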
=== ./k8s/observability-stack/07-grafana-datasources.yaml ===
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: observability
data:
datasources.yaml: |
apiVersion: 1
datasources:
# Prometheus
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus.observability.svc.cluster.local:9090
isDefault: true
editable: true
jsonData:
timeInterval: 15s
queryTimeout: 60s
httpMethod: POST
# Loki
- name: Loki
type: loki
access: proxy
url: http://loki.observability.svc.cluster.local:3100
editable: true
jsonData:
maxLines: 1000
derivedFields:
- datasourceUid: tempo
matcherRegex: "traceID=(\\w+)"
name: TraceID
url: "$${__value.raw}"
# Tempo
- name: Tempo
type: tempo
access: proxy
url: http://tempo.observability.svc.cluster.local:3200
editable: true
uid: tempo
jsonData:
tracesToLogsV2:
datasourceUid: loki
spanStartTimeShift: -1h
spanEndTimeShift: 1h
filterByTraceID: true
filterBySpanID: false
customQuery: false
tracesToMetrics:
datasourceUid: prometheus
spanStartTimeShift: -1h
spanEndTimeShift: 1h
serviceMap:
datasourceUid: prometheus
nodeGraph:
enabled: true
search:
hide: false
lokiSearch:
datasourceUid: loki
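# Provisioning check (a sketch): after Grafana starts, all three datasources
# should be visible through its HTTP API, using the admin credentials from the
# StatefulSet environment:
#   kubectl -n observability port-forward svc/grafana 3000:3000 &
#   curl -s -u admin:admin http://127.0.0.1:3000/api/datasources | grep -o '"name":"[^"]*"'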
=== ./k8s/observability-stack/08-rbac.yaml ===
---
# Prometheus ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: observability
---
# Prometheus ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
  - networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
# Prometheus ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: observability
---
# Alloy ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: alloy
namespace: observability
---
# Alloy ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: alloy
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
- pods/log
verbs: ["get", "list", "watch"]
- apiGroups:
  - networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
---
# Alloy ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: alloy
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: alloy
subjects:
- kind: ServiceAccount
name: alloy
namespace: observability
---
# kube-state-metrics ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: observability
---
# kube-state-metrics ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources:
- configmaps
- secrets
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- cronjobs
- jobs
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
- apiGroups: ["policy"]
resources:
- poddisruptionbudgets
verbs: ["list", "watch"]
- apiGroups: ["certificates.k8s.io"]
resources:
- certificatesigningrequests
verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
resources:
- storageclasses
- volumeattachments
verbs: ["list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs: ["list", "watch"]
- apiGroups: ["networking.k8s.io"]
resources:
- networkpolicies
- ingresses
verbs: ["list", "watch"]
- apiGroups: ["coordination.k8s.io"]
resources:
- leases
verbs: ["list", "watch"]
---
# kube-state-metrics ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: observability
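# Verification (a sketch): impersonating the ServiceAccounts confirms the
# bindings before any pod starts:
#   kubectl auth can-i list pods --as=system:serviceaccount:observability:prometheus              # yes
#   kubectl auth can-i get pods --subresource=log --as=system:serviceaccount:observability:alloy  # yes
#   kubectl auth can-i delete pods --as=system:serviceaccount:observability:prometheus            # no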
=== ./k8s/observability-stack/10-prometheus.yaml ===
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
namespace: observability
labels:
app: prometheus
spec:
serviceName: prometheus
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
spec:
serviceAccountName: prometheus
nodeSelector:
kubernetes.io/hostname: hetzner-2
containers:
- name: prometheus
image: prom/prometheus:v2.54.1
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=7d'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
ports:
- name: http
containerPort: 9090
protocol: TCP
livenessProbe:
httpGet:
path: /-/healthy
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /-/ready
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 500m
memory: 2Gi
limits:
cpu: 2000m
memory: 4Gi
volumeMounts:
- name: prometheus-config
mountPath: /etc/prometheus
- name: prometheus-data
mountPath: /prometheus
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-data
persistentVolumeClaim:
claimName: prometheus-data
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: observability
labels:
app: prometheus
spec:
type: ClusterIP
ports:
- port: 9090
targetPort: http
protocol: TCP
name: http
selector:
app: prometheus
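# Target check (a sketch): the discovered scrape targets and their health are
# visible through the Prometheus API once the pod is Ready:
#   kubectl -n observability port-forward svc/prometheus 9090:9090 &
#   curl -s http://127.0.0.1:9090/api/v1/targets | grep -o '"health":"[a-z]*"' | sort | uniq -c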
=== ./k8s/observability-stack/11-loki.yaml ===
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: loki
namespace: observability
labels:
app: loki
spec:
serviceName: loki
replicas: 1
selector:
matchLabels:
app: loki
template:
metadata:
labels:
app: loki
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3100"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
fsGroup: 10001
runAsGroup: 10001
runAsNonRoot: true
runAsUser: 10001
containers:
- name: loki
image: grafana/loki:3.2.1
args:
- '-config.file=/etc/loki/loki.yaml'
- '-target=all'
ports:
- name: http
containerPort: 3100
protocol: TCP
- name: grpc
containerPort: 9096
protocol: TCP
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 45
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 45
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2000m
memory: 2Gi
volumeMounts:
- name: loki-config
mountPath: /etc/loki
- name: loki-data
mountPath: /loki
volumes:
- name: loki-config
configMap:
name: loki-config
- name: loki-data
persistentVolumeClaim:
claimName: loki-data
---
apiVersion: v1
kind: Service
metadata:
name: loki
namespace: observability
labels:
app: loki
spec:
type: ClusterIP
ports:
- port: 3100
targetPort: http
protocol: TCP
name: http
- port: 9096
targetPort: grpc
protocol: TCP
name: grpc
selector:
app: loki
=== ./k8s/observability-stack/12-tempo.yaml ===
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: tempo
namespace: observability
labels:
app: tempo
spec:
serviceName: tempo
replicas: 1
selector:
matchLabels:
app: tempo
template:
metadata:
labels:
app: tempo
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3200"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
fsGroup: 10001
runAsGroup: 10001
runAsNonRoot: true
runAsUser: 10001
containers:
- name: tempo
image: grafana/tempo:2.6.1
args:
- '-config.file=/etc/tempo/tempo.yaml'
ports:
- name: http
containerPort: 3200
protocol: TCP
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
- name: jaeger-grpc
containerPort: 14250
protocol: TCP
- name: jaeger-http
containerPort: 14268
protocol: TCP
- name: zipkin
containerPort: 9411
protocol: TCP
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2000m
memory: 2Gi
volumeMounts:
- name: tempo-config
mountPath: /etc/tempo
- name: tempo-data
mountPath: /var/tempo
volumes:
- name: tempo-config
configMap:
name: tempo-config
- name: tempo-data
persistentVolumeClaim:
claimName: tempo-data
---
apiVersion: v1
kind: Service
metadata:
name: tempo
namespace: observability
labels:
app: tempo
spec:
type: ClusterIP
ports:
- port: 3200
targetPort: http
protocol: TCP
name: http
- port: 4317
targetPort: otlp-grpc
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: otlp-http
protocol: TCP
name: otlp-http
- port: 14250
targetPort: jaeger-grpc
protocol: TCP
name: jaeger-grpc
- port: 14268
targetPort: jaeger-http
protocol: TCP
name: jaeger-http
- port: 9411
targetPort: zipkin
protocol: TCP
name: zipkin
selector:
app: tempo
=== ./k8s/observability-stack/13-grafana.yaml ===
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: grafana
namespace: observability
labels:
app: grafana
spec:
serviceName: grafana
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
fsGroup: 472
runAsGroup: 472
runAsUser: 472
containers:
- name: grafana
image: grafana/grafana:11.4.0
ports:
- name: http
containerPort: 3000
protocol: TCP
env:
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: admin # Change this in production!
- name: GF_INSTALL_PLUGINS
value: ""
- name: GF_FEATURE_TOGGLES_ENABLE
value: "traceqlEditor,correlations"
- name: GF_AUTH_ANONYMOUS_ENABLED
value: "false"
- name: GF_ANALYTICS_REPORTING_ENABLED
value: "false"
- name: GF_ANALYTICS_CHECK_FOR_UPDATES
value: "false"
livenessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
volumeMounts:
- name: grafana-data
mountPath: /var/lib/grafana
- name: grafana-datasources
mountPath: /etc/grafana/provisioning/datasources
volumes:
- name: grafana-data
persistentVolumeClaim:
claimName: grafana-data
- name: grafana-datasources
configMap:
name: grafana-datasources
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: observability
labels:
app: grafana
spec:
type: ClusterIP
ports:
- port: 3000
targetPort: http
protocol: TCP
name: http
selector:
app: grafana
=== ./k8s/observability-stack/14-alloy.yaml ===
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: alloy
namespace: observability
labels:
app: alloy
spec:
selector:
matchLabels:
app: alloy
template:
metadata:
labels:
app: alloy
spec:
serviceAccountName: alloy
hostNetwork: true
hostPID: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: alloy
image: grafana/alloy:v1.5.1
args:
- run
- /etc/alloy/config.alloy
- --storage.path=/var/lib/alloy
- --server.http.listen-addr=0.0.0.0:12345
ports:
- name: http-metrics
containerPort: 12345
protocol: TCP
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
env:
- name: HOSTNAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
privileged: true
runAsUser: 0
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumeMounts:
- name: config
mountPath: /etc/alloy
- name: varlog
mountPath: /var/log
readOnly: true
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
- name: etcmachineid
mountPath: /etc/machine-id
readOnly: true
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: config
configMap:
name: alloy-config
- name: varlog
hostPath:
path: /var/log
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
- name: etcmachineid
hostPath:
path: /etc/machine-id
---
apiVersion: v1
kind: Service
metadata:
name: alloy
namespace: observability
labels:
app: alloy
spec:
type: ClusterIP
ports:
- port: 12345
targetPort: http-metrics
protocol: TCP
name: http-metrics
- port: 4317
targetPort: otlp-grpc
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: otlp-http
protocol: TCP
name: otlp-http
selector:
app: alloy
=== ./k8s/observability-stack/15-kube-state-metrics.yaml ===
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: observability
labels:
app: kube-state-metrics
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
ports:
- name: http-metrics
containerPort: 8080
- name: telemetry
containerPort: 8081
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
---
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
namespace: observability
labels:
app: kube-state-metrics
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
spec:
type: ClusterIP
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry
selector:
app: kube-state-metrics
=== ./k8s/observability-stack/16-node-exporter.yaml ===
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: observability
labels:
app: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
spec:
hostNetwork: true
hostPID: true
containers:
- name: node-exporter
image: prom/node-exporter:v1.8.2
args:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
ports:
- name: metrics
containerPort: 9100
protocol: TCP
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumeMounts:
- name: proc
mountPath: /host/proc
readOnly: true
- name: sys
mountPath: /host/sys
readOnly: true
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
---
apiVersion: v1
kind: Service
metadata:
name: node-exporter
namespace: observability
labels:
app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
spec:
type: ClusterIP
clusterIP: None
ports:
- name: metrics
port: 9100
targetPort: metrics
selector:
app: node-exporter
=== ./k8s/observability-stack/20-grafana-ingress.yaml ===
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- grafana.betelgeusebytes.io
secretName: grafana-tls
rules:
- host: grafana.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: grafana
port:
number: 3000
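# Certificate check (a sketch, assuming DNS for grafana.betelgeusebytes.io
# already points at the ingress controller):
#   kubectl -n observability get certificate grafana-tls   # READY should be True
#   curl -sI https://grafana.betelgeusebytes.io | head -n1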
=== ./k8s/observability-stack/21-optional-ingresses.yaml ===
---
# Optional: Prometheus Ingress (for direct access to Prometheus UI)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
# Optional: Add basic auth for security
# nginx.ingress.kubernetes.io/auth-type: basic
# nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth
# nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
spec:
ingressClassName: nginx
tls:
- hosts:
- prometheus.betelgeusebytes.io
secretName: prometheus-tls
rules:
- host: prometheus.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: prometheus
port:
number: 9090
---
# Optional: Loki Ingress (for direct API access)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: loki-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- loki.betelgeusebytes.io
secretName: loki-tls
rules:
- host: loki.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: loki
port:
number: 3100
---
# Optional: Tempo Ingress (for direct API access)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: tempo-ingress
namespace: observability
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- tempo.betelgeusebytes.io
secretName: tempo-tls
rules:
- host: tempo.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: tempo
port:
number: 3200
=== ./k8s/observability-stack/demo-app.yaml ===
---
# Example instrumented application to test the observability stack
# This is a simple Python Flask app with OpenTelemetry instrumentation
apiVersion: v1
kind: ConfigMap
metadata:
name: demo-app
namespace: observability
data:
app.py: |
from flask import Flask, jsonify
import logging
import json
import time
import random
    # OpenTelemetry imports (tracing only: metrics are served by
    # prometheus_flask_exporter below, whose PrometheusMetrics instance would
    # otherwise shadow an imported "metrics" name)
    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource
from prometheus_flask_exporter import PrometheusMetrics
# Configure structured logging
logging.basicConfig(
level=logging.INFO,
format='%(message)s'
)
class JSONFormatter(logging.Formatter):
def format(self, record):
log_obj = {
'timestamp': self.formatTime(record, self.datefmt),
'level': record.levelname,
'message': record.getMessage(),
'logger': record.name,
}
if hasattr(record, 'trace_id'):
log_obj['trace_id'] = record.trace_id
log_obj['span_id'] = record.span_id
return json.dumps(log_obj)
    handler = logging.StreamHandler()
    handler.setFormatter(JSONFormatter())
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False  # basicConfig installed a root handler; don't emit every record twice
# Configure OpenTelemetry
resource = Resource.create({"service.name": "demo-app"})
# Tracing
trace_provider = TracerProvider(resource=resource)
trace_provider.add_span_processor(
BatchSpanProcessor(
OTLPSpanExporter(
endpoint="http://tempo.observability.svc.cluster.local:4317",
insecure=True
)
)
)
trace.set_tracer_provider(trace_provider)
tracer = trace.get_tracer(__name__)
# Create Flask app
app = Flask(__name__)
# Prometheus metrics
metrics = PrometheusMetrics(app)
# Auto-instrument Flask
FlaskInstrumentor().instrument_app(app)
# Sample data
ITEMS = ["apple", "banana", "orange", "grape", "mango"]
@app.route('/')
def index():
span = trace.get_current_span()
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info("Index page accessed", extra={
'trace_id': trace_id,
'endpoint': '/'
})
return jsonify({
'service': 'demo-app',
'status': 'healthy',
'trace_id': trace_id
})
@app.route('/items')
def get_items():
with tracer.start_as_current_span("fetch_items") as span:
# Simulate database query
time.sleep(random.uniform(0.01, 0.1))
span.set_attribute("items.count", len(ITEMS))
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info("Items fetched", extra={
'trace_id': trace_id,
'count': len(ITEMS)
})
return jsonify({
'items': ITEMS,
'count': len(ITEMS),
'trace_id': trace_id
})
@app.route('/item/<int:item_id>')
def get_item(item_id):
with tracer.start_as_current_span("fetch_item") as span:
span.set_attribute("item.id", item_id)
trace_id = format(span.get_span_context().trace_id, '032x')
# Simulate processing
time.sleep(random.uniform(0.01, 0.05))
if item_id < 0 or item_id >= len(ITEMS):
logger.warning("Item not found", extra={
'trace_id': trace_id,
'item_id': item_id
})
return jsonify({'error': 'Item not found', 'trace_id': trace_id}), 404
item = ITEMS[item_id]
logger.info("Item fetched", extra={
'trace_id': trace_id,
'item_id': item_id,
'item': item
})
return jsonify({
'id': item_id,
'name': item,
'trace_id': trace_id
})
@app.route('/slow')
def slow_endpoint():
with tracer.start_as_current_span("slow_operation") as span:
trace_id = format(span.get_span_context().trace_id, '032x')
logger.info("Slow operation started", extra={'trace_id': trace_id})
# Simulate slow operation
time.sleep(random.uniform(1, 3))
logger.info("Slow operation completed", extra={'trace_id': trace_id})
return jsonify({
'message': 'Operation completed',
'trace_id': trace_id
})
@app.route('/error')
def error_endpoint():
with tracer.start_as_current_span("error_operation") as span:
trace_id = format(span.get_span_context().trace_id, '032x')
logger.error("Intentional error triggered", extra={'trace_id': trace_id})
span.set_attribute("error", True)
return jsonify({
'error': 'This is an intentional error',
'trace_id': trace_id
}), 500
if __name__ == '__main__':
app.run(host='0.0.0.0', port=8080)
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: demo-app
namespace: observability
labels:
app: demo-app
spec:
replicas: 1
selector:
matchLabels:
app: demo-app
template:
metadata:
labels:
app: demo-app
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
containers:
- name: demo-app
image: python:3.11-slim
command:
- /bin/bash
- -c
- |
pip install flask opentelemetry-api opentelemetry-sdk \
opentelemetry-instrumentation-flask \
opentelemetry-exporter-otlp-proto-grpc \
prometheus-flask-exporter && \
python /app/app.py
ports:
- name: http
containerPort: 8080
volumeMounts:
- name: app-code
mountPath: /app
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: app-code
configMap:
name: demo-app
---
apiVersion: v1
kind: Service
metadata:
name: demo-app
namespace: observability
labels:
app: demo-app
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: http
protocol: TCP
name: http
selector:
app: demo-app
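# Generating test telemetry (a sketch): a little traffic against each route
# yields correlated logs (Loki), traces (Tempo) and metrics (Prometheus):
#   kubectl -n observability port-forward svc/demo-app 8080:8080 &
#   for i in $(seq 1 20); do curl -s http://127.0.0.1:8080/items > /dev/null; done
#   curl -s http://127.0.0.1:8080/slow    # long span
#   curl -s http://127.0.0.1:8080/error   # errored span plus error-level log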
=== ./k8s/otlp/otel-collector.yaml ===
apiVersion: v1
kind: Service
metadata: { name: otel-collector, namespace: observability }
spec:
selector: { app: otel-collector }
ports:
- { name: otlp-http, port: 4318, targetPort: 4318 }
- { name: otlp-grpc, port: 4317, targetPort: 4317 }
---
apiVersion: apps/v1
kind: Deployment
metadata: { name: otel-collector, namespace: observability }
spec:
replicas: 2
selector: { matchLabels: { app: otel-collector } }
template:
metadata: { labels: { app: otel-collector } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:0.102.0
args: ["--config=/etc/otel/config.yaml"]
ports:
- { containerPort: 4318 }
- { containerPort: 4317 }
volumeMounts:
- { name: cfg, mountPath: /etc/otel }
volumes:
- { name: cfg, configMap: { name: otel-config } }
---
apiVersion: v1
kind: ConfigMap
metadata: { name: otel-config, namespace: observability }
data:
config.yaml: |
receivers:
otlp:
protocols: { http: {}, grpc: {} }
processors: { batch: {} }
exporters:
logging: {}
elasticsearch:
endpoints: ["http://elasticsearch.elastic.svc.cluster.local:9200"]
logs_index: "k8s-logs"
service:
pipelines:
logs: { receivers: [otlp], processors: [batch], exporters: [elasticsearch, logging] }
traces: { receivers: [otlp], processors: [batch], exporters: [logging] }
metrics: { receivers: [otlp], processors: [batch], exporters: [logging] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: otlp
namespace: observability
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["otlp.betelgeusebytes.io"], secretName: otlp-tls }]
rules:
- host: otlp.betelgeusebytes.io
http:
paths:
- path: /v1/traces
pathType: Prefix
backend: { service: { name: otel-collector, port: { number: 4318 } } }
- path: /v1/metrics
pathType: Prefix
backend: { service: { name: otel-collector, port: { number: 4318 } } }
- path: /v1/logs
pathType: Prefix
backend: { service: { name: otel-collector, port: { number: 4318 } } }
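# Smoke test (a sketch): the ingress only routes the three OTLP paths, so a
# minimal OTLP/HTTP JSON log record exercises the whole chain down to the
# elasticsearch exporter:
#   curl -s https://otlp.betelgeusebytes.io/v1/logs -H 'Content-Type: application/json' \
#     -d '{"resourceLogs":[{"scopeLogs":[{"logRecords":[{"body":{"stringValue":"hello from curl"}}]}]}]}'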
=== ./k8s/postgres/pg.yaml ===
# k8s/postgres/pg-init-sql-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: pg-init-sql
namespace: db
data:
00_extensions.sql: |
\connect gitea
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS postgis_topology;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
DO $$ BEGIN
CREATE EXTENSION IF NOT EXISTS plpython3u;
EXCEPTION WHEN undefined_file THEN
RAISE NOTICE 'plpython3u not available in this image';
END $$;
01_tune.sql: |
ALTER SYSTEM SET shared_buffers = '1GB';
ALTER SYSTEM SET work_mem = '32MB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET max_connections = 200;
SELECT pg_reload_conf();
---
# k8s/postgres/pg-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: pg-conf
namespace: db
data:
pg_hba.conf: |
# Local connections
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# TLS-only access from ANY external IP (harden as needed)
hostssl all all 0.0.0.0/0 md5
hostssl all all ::/0 md5
---
# k8s/postgres/pg-secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: pg18-secret
namespace: db
type: Opaque
stringData:
POSTGRES_PASSWORD: "pa$$word"
---
# k8s/postgres/pg-certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: pg-tls
namespace: db
spec:
secretName: pg-tls
dnsNames:
- pg.betelgeusebytes.io
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
---
# k8s/postgres/postgres-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: postgres
namespace: db
spec:
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
apiVersion: v1
kind: Service
metadata:
name: postgres-hl
namespace: db
spec:
clusterIP: None
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
# k8s/postgres/postgres.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
namespace: db
spec:
serviceName: postgres-hl
replicas: 1
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
securityContext:
runAsUser: 999
runAsGroup: 999
fsGroup: 999
fsGroupChangePolicy: "Always"
initContainers:
- name: install-certs
image: busybox:1.36
command:
- sh
- -c
- |
cp /in/tls.crt /out/server.crt
cp /in/tls.key /out/server.key
chown 999:999 /out/* || true
chmod 600 /out/server.key
securityContext:
runAsUser: 0
volumeMounts:
- { name: pg-tls, mountPath: /in, readOnly: true }
- { name: pg-certs, mountPath: /out }
containers:
- name: postgres
image: axxs/postgres:18-postgis-vector
imagePullPolicy: IfNotPresent
args:
- -c
- ssl=on
- -c
- ssl_cert_file=/certs/server.crt
- -c
- ssl_key_file=/certs/server.key
- -c
- hba_file=/etc/postgresql-custom/pg_hba.conf
env:
- name: POSTGRES_USER
value: "app"
- name: POSTGRES_DB
value: "gitea"
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: pg18-secret
key: POSTGRES_PASSWORD
- name: TZ
value: "Europe/Paris"
ports:
- name: postgres
containerPort: 5432
volumeMounts:
- { name: data, mountPath: /var/lib/postgresql } # PG18 expects parent, creates /var/lib/postgresql/18/main
- { name: init, mountPath: /docker-entrypoint-initdb.d, readOnly: true }
- { name: pg-certs, mountPath: /certs }
- { name: pg-conf, mountPath: /etc/postgresql-custom }
readinessProbe:
exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] }
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
livenessProbe:
exec: { command: ["sh","-c","pg_isready -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -h 127.0.0.1"] }
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
resources:
requests: { cpu: "250m", memory: "512Mi" }
limits: { cpu: "1", memory: "2Gi" }
volumes:
- name: init
configMap:
name: pg-init-sql
defaultMode: 0444
- name: pg-tls
secret:
secretName: pg-tls
- name: pg-certs
emptyDir: {}
- name: pg-conf
configMap:
name: pg-conf
defaultMode: 0444
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources:
requests:
storage: 80Gi
# kubectl -n ingress-nginx create configmap tcp-services \
# --from-literal="5432=db/postgres:5432" \
# -o yaml --dry-run=client | kubectl apply -f -
# kubectl -n ingress-nginx patch deploy ingress-nginx-controller \
# --type='json' -p='[
# {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--tcp-services-configmap=$(POD_NAMESPACE)/tcp-services"}
# ]'
# # controller must listen on hostPort:5432 (we already patched earlier)
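# Connection test (a sketch, once the tcp-services passthrough above is in place
# and DNS resolves): pg_hba.conf only allows TLS from outside, so force it:
#   psql "host=pg.betelgeusebytes.io port=5432 user=app dbname=gitea sslmode=require"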
=== ./k8s/postgres/postgres-ha.yaml ===
---
apiVersion: v1
kind: Namespace
metadata:
name: db
---
# Password secret (replace with your own or generate one)
apiVersion: v1
kind: Secret
metadata:
name: pg18-secret
namespace: db
type: Opaque
stringData:
POSTGRES_PASSWORD: "pa$$word"
---
# Init SQL: keeps your original name and keeps enabling PostGIS + vector
apiVersion: v1
kind: ConfigMap
metadata:
name: pg-init-sql
namespace: db
data:
00_extensions.sql: |
-- enable common extensions in the default DB and template1 so future DBs inherit them
\connect gitea
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS vector;
CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false);
CREATE EXTENSION IF NOT EXISTS tablefunc;
-- postpone pg_stat_statements CREATE to postStart (needs preload)
CREATE EXTENSION IF NOT EXISTS postgis_topology;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- PL/Python (available in your image)
DO $$ BEGIN
CREATE EXTENSION IF NOT EXISTS plpython3u;
EXCEPTION WHEN undefined_file THEN
RAISE NOTICE 'plpython3u not available in this image';
END $$;
-- Also on template1 for new DBs (heavier, but intentional)
\connect template1
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- Arabic-friendly ICU collation, non-deterministic for case/diacritics
DO $$
BEGIN
PERFORM 1 FROM pg_collation WHERE collname='arabic';
IF NOT FOUND THEN
CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false);
END IF;
END$$;
01_tune.sql: |
    -- Enable pg_stat_statements on next server start.
    -- (ALTER SYSTEM is rejected inside DO blocks/functions, and nesting $$
    -- inside a $$-quoted body breaks the quoting, so set the value directly;
    -- the CREATE EXTENSION itself runs in the postStart hook.)
    ALTER SYSTEM SET shared_preload_libraries = 'pg_stat_statements';
-- Optional tuning (adjust to your limits)
ALTER SYSTEM SET shared_buffers = '1GB';
ALTER SYSTEM SET work_mem = '32MB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET max_connections = 200;
-- Reload applies some settings immediately; others need restart (OK after init completes)
SELECT pg_reload_conf();
ALTER SYSTEM SET pg_stat_statements.max = 10000;
ALTER SYSTEM SET pg_stat_statements.track = 'all';
ALTER SYSTEM SET pg_stat_statements.save = on;
---
# pg_hba.conf moved to its own ConfigMap: the StatefulSet mounts pg-conf at
# /etc/postgresql-custom, and /docker-entrypoint-initdb.d would silently
# ignore a .conf key anyway (only .sql/.sh files are executed there).
apiVersion: v1
kind: ConfigMap
metadata:
  name: pg-conf
  namespace: db
data:
  pg_hba.conf: |
# Allow loopback
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# Allow TLS connections from your IP(s) only
hostssl all all YOUR_PUBLIC_IP/32 md5
# (Optional) Add more CIDRs or a private network range here:
# hostssl all all 10.0.0.0/8 md5
---
# Headless service required by StatefulSet for stable network IDs
apiVersion: v1
kind: Service
metadata:
name: postgres-hl
namespace: db
spec:
clusterIP: None
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
# Regular ClusterIP service for clients (keeps your original name)
apiVersion: v1
kind: Service
metadata:
name: postgres
namespace: db
spec:
selector:
app: postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
namespace: db
spec:
serviceName: postgres-hl
replicas: 1
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
securityContext:
runAsUser: 999
runAsGroup: 999
fsGroup: 999
fsGroupChangePolicy: "Always"
initContainers:
# Copy cert-manager certs to a writable path with correct perms for Postgres
- name: install-certs
image: busybox:1.36
command:
- sh
- -c
- |
cp /in/tls.crt /out/server.crt
cp /in/tls.key /out/server.key
cp /in/ca.crt /out/ca.crt || true
chown 999:999 /out/* || true
chmod 600 /out/server.key
securityContext:
runAsUser: 0
volumeMounts:
- { name: pg-tls, mountPath: /in, readOnly: true }
- { name: pg-certs, mountPath: /out }
containers:
- name: postgres
image: axxs/postgres:18-postgis-vector
imagePullPolicy: IfNotPresent
args:
- -c
- ssl=on
- -c
- ssl_cert_file=/certs/server.crt
- -c
- ssl_key_file=/certs/server.key
- -c
- ssl_ca_file=/certs/ca.crt
- -c
- hba_file=/etc/postgresql-custom/pg_hba.conf
lifecycle:
postStart:
exec:
command:
- /bin/sh
- -c
- |
set -e
# Wait until server accepts connections
for i in $(seq 1 30); do
pg_isready -h 127.0.0.1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" && break
sleep 1
done
psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "CREATE EXTENSION IF NOT EXISTS pg_stat_statements;"
env:
- name: POSTGRES_USER
value: "app"
- name: POSTGRES_DB
value: "gitea" # matches your \connect gitea
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: pg18-secret
key: POSTGRES_PASSWORD
- name: TZ
value: "Europe/Paris"
ports:
- name: postgres
containerPort: 5432
volumeMounts:
# ✅ PG 18 requires this parent path; it will create /var/lib/postgresql/18/main
- name: data
mountPath: /var/lib/postgresql
# your init scripts ConfigMap
- name: init
mountPath: /docker-entrypoint-initdb.d
readOnly: true
- name: pg-certs
mountPath: /certs
# pg_hba.conf
- name: pg-conf
mountPath: /etc/postgresql-custom
readinessProbe:
exec:
command:
- /bin/sh
- -c
- pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
livenessProbe:
exec:
command:
- /bin/sh
- -c
- pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" -h 127.0.0.1
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
resources:
requests:
cpu: "250m"
memory: "512Mi"
limits:
cpu: "1"
memory: "2Gi"
volumes:
- name: init
configMap:
name: pg-init-sql
defaultMode: 0444
- name: pg-tls
secret:
secretName: pg-tls
- name: pg-certs
emptyDir: {}
- name: pg-conf
configMap:
name: pg-conf
defaultMode: 0444
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
# storageClassName: <your-storageclass> # optionally pin this
=== ./k8s/postgres/postgres.yaml ===
apiVersion: v1
kind: Service
metadata: { name: postgres, namespace: db }
spec:
ports: [{ port: 5432, targetPort: 5432 }]
selector: { app: postgres }
---
apiVersion: v1
kind: ConfigMap
metadata: { name: pg-init-sql, namespace: db }
data:
00_extensions.sql: |
-- enable common extensions in the default DB and template1 so future DBs inherit them
\connect gitea
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS vector;
CREATE COLLATION IF NOT EXISTS arabic (provider = icu, locale = 'ar', deterministic = false);
CREATE EXTENSION IF NOT EXISTS tablefunc;
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
CREATE EXTENSION IF NOT EXISTS postgis_topology;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- PL/Python (optional; requires image with plpython3u, postgis image has it)
DO $$ BEGIN
CREATE EXTENSION IF NOT EXISTS plpython3u;
EXCEPTION WHEN undefined_file THEN
RAISE NOTICE 'plpython3u not available in this image';
END $$;
-- Also on template1 for new DBs:
\connect template1
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS citext;
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- Arabic-friendly ICU collation (PostgreSQL >= 13)
-- Non-deterministic collation helps proper case/diacritics comparisons
DO $$
BEGIN
PERFORM 1 FROM pg_collation WHERE collname='arabic';
IF NOT FOUND THEN
CREATE COLLATION arabic (provider = icu, locale = 'ar', deterministic = false);
END IF;
END$$;
-- Example: ensure gitea DB uses UTF8; Arabic text search often needs unaccent + custom dictionaries.
-- You can create additional DBs with: CREATE DATABASE mydb TEMPLATE template1 ENCODING 'UTF8';
01_tune.sql: |
-- small safe defaults; adjust later
ALTER SYSTEM SET shared_buffers = '1GB';
ALTER SYSTEM SET work_mem = '32MB';
ALTER SYSTEM SET maintenance_work_mem = '512MB';
ALTER SYSTEM SET max_connections = 200;
SELECT pg_reload_conf();
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: postgres, namespace: db }
spec:
serviceName: postgres
replicas: 1
selector: { matchLabels: { app: postgres } }
template:
metadata: { labels: { app: postgres } }
spec:
nodeSelector:
node: hetzner-2
securityContext:
fsGroup: 999 # Debian postgres user/group in postgis image
fsGroupChangePolicy: OnRootMismatch
initContainers:
- name: fix-perms
image: busybox:1.36
command: ["sh","-c","chown -R 999:999 /var/lib/postgresql/data || true"]
securityContext: { runAsUser: 0 }
volumeMounts: [{ name: data, mountPath: /var/lib/postgresql/data }]
containers:
- name: postgres
        image: postgis/postgis:16-3.4 # PostGIS image (plain postgres has no 16-3.4 tag and lacks PostGIS)
env:
- name: POSTGRES_PASSWORD
valueFrom: { secretKeyRef: { name: postgres-auth, key: POSTGRES_PASSWORD } }
- { name: POSTGRES_USER, value: gitea }
- { name: POSTGRES_DB, value: gitea }
- name: POSTGRES_INITDB_ARGS
value: "--encoding=UTF8 --locale=C.UTF-8"
ports: [{ containerPort: 5432 }]
volumeMounts:
- { name: data, mountPath: /var/lib/postgresql/data }
- { name: init, mountPath: /docker-entrypoint-initdb.d }
      # init scripts volume, merged into this StatefulSet (a second
      # StatefulSet of the same name would replace this one, not patch it)
      volumes:
      - name: init
        configMap:
          name: pg-init-sql
          defaultMode: 0444
  volumeClaimTemplates:
  - metadata: { name: data }
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: local-ssd-hetzner
      resources: { requests: { storage: 80Gi } }
=== ./k8s/postgres/secret.yaml ===
apiVersion: v1
kind: Secret
metadata: { name: postgres-auth, namespace: db }
type: Opaque
stringData:
POSTGRES_PASSWORD: "PG-ADM1N"
GITEA_DB_PASSWORD: "G1TEA"
=== ./k8s/prometheus/prometheus-config.yaml ===
apiVersion: v1
kind: ConfigMap
metadata: { name: prometheus-config, namespace: monitoring }
data:
prometheus.yml: |
global: { scrape_interval: 15s }
scrape_configs:
- job_name: 'kubernetes-pods'
kubernetes_sd_configs: [ { role: pod } ]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
regex: 'true'
=== ./k8s/prometheus/prometheus.yaml ===
apiVersion: v1
kind: Service
metadata: { name: prometheus, namespace: monitoring }
spec:
ports: [{ port: 9090, targetPort: 9090 }]
selector: { app: prometheus }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: prometheus, namespace: monitoring }
spec:
serviceName: prometheus
replicas: 1
selector: { matchLabels: { app: prometheus } }
template:
metadata: { labels: { app: prometheus } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: prometheus
image: prom/prometheus:v2.53.0
args: ["--config.file=/etc/prometheus/prometheus.yml","--storage.tsdb.path=/prometheus"]
ports: [{ containerPort: 9090 }]
volumeMounts:
- { name: data, mountPath: /prometheus }
- { name: config, mountPath: /etc/prometheus }
volumes:
- { name: config, configMap: { name: prometheus-config } }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 50Gi } }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
namespace: monitoring
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
nginx.ingress.kubernetes.io/auth-type: basic
nginx.ingress.kubernetes.io/auth-secret: basic-auth-prometheus
nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
ingressClassName: nginx
tls: [{ hosts: ["prometheus.betelgeusebytes.io"], secretName: prometheus-tls }]
rules:
- host: prometheus.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: prometheus, port: { number: 9090 } } }
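# The auth-secret annotation above references basic-auth-prometheus, which this
# file does not create; a sketch of producing it (ingress-nginx expects the
# htpasswd data under the key "auth"):
#   htpasswd -c auth admin
#   kubectl -n monitoring create secret generic basic-auth-prometheus --from-file=auth && rm auth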
=== ./k8s/redis/redis-pv.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-redis
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/redis
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
=== ./k8s/redis/redis.yaml ===
apiVersion: v1
kind: Service
metadata: { name: redis, namespace: db }
spec:
ports: [{ port: 6379, targetPort: 6379 }]
selector: { app: redis }
---
apiVersion: apps/v1
kind: StatefulSet
metadata: { name: redis, namespace: db }
spec:
serviceName: redis
replicas: 1
selector: { matchLabels: { app: redis } }
template:
metadata: { labels: { app: redis } }
spec:
nodeSelector: { node: hetzner-2 }
containers:
- name: redis
image: redis:7
args: ["--requirepass", "$(REDIS_PASSWORD)"]
env:
- name: REDIS_PASSWORD
valueFrom: { secretKeyRef: { name: redis-auth, key: REDIS_PASSWORD } }
ports: [{ containerPort: 6379 }]
volumeMounts:
- { name: data, mountPath: /data }
volumeClaimTemplates:
- metadata: { name: data }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 10Gi } }
---
apiVersion: v1
kind: Secret
metadata: { name: redis-auth, namespace: db }
type: Opaque
stringData: { REDIS_PASSWORD: "RED1S" }
=== ./k8s/sso/sso.yaml ===
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-auth
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/auth
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# k8s/auth/keycloak/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: keycloak-admin, namespace: db }
type: Opaque
stringData: { KEYCLOAK_ADMIN: "admin", KEYCLOAK_ADMIN_PASSWORD: "admin" }
---
# k8s/auth/keycloak/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: keycloak-data, namespace: db }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 10Gi } }
---
# k8s/auth/keycloak/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: keycloak, namespace: db }
spec:
replicas: 1
selector: { matchLabels: { app: keycloak } }
template:
metadata: { labels: { app: keycloak } }
spec:
# Ensure the PV is owned by the Keycloak UID/GID
securityContext:
fsGroup: 1000
initContainers:
- name: fix-permissions
image: busybox
command: ['sh', '-c', 'chown -R 1000:1000 /opt/keycloak/data && chmod -R 755 /opt/keycloak/data']
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:latest
args: ["start","--http-enabled=true","--proxy-headers=xforwarded","--hostname-strict=false"]
env:
- { name: KEYCLOAK_ADMIN, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN } } }
- { name: KEYCLOAK_ADMIN_PASSWORD, valueFrom: { secretKeyRef: { name: keycloak-admin, key: KEYCLOAK_ADMIN_PASSWORD } } }
ports: [{ containerPort: 8080 }]
volumeMounts: [{ name: data, mountPath: /opt/keycloak/data }]
securityContext:
runAsUser: 1000
runAsGroup: 1000
volumes:
- name: data
persistentVolumeClaim: { claimName: keycloak-data }
---
apiVersion: v1
kind: Service
metadata: { name: keycloak, namespace: db }
spec: { selector: { app: keycloak }, ports: [ { port: 80, targetPort: 8080 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: keycloak
namespace: db
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["auth.betelgeusebytes.io"], secretName: keycloak-tls }]
rules:
- host: auth.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: keycloak, port: { number: 80 } } }
=== ./k8s/storage/persistent-volumes.yaml ===
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-postgres
spec:
capacity:
storage: 80Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/postgres
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-elasticsearch
spec:
capacity:
storage: 300Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/elasticsearch
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-gitea
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/gitea
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-jupyter
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/jupyter
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-kafka
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/kafka
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-zookeeper-data
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/zookeeper-data
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-zookeeper-log
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/zookeeper-log
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-prometheus
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/prometheus
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
=== ./k8s/storage/storageclass.yaml ===
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: local-ssd-hetzner
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer
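# Note: kubernetes.io/no-provisioner means nothing provisions these volumes;
# each local PV's path must already exist on hetzner-2 before its claim can
# bind, e.g. (a sketch):
#   ssh hetzner-2 'mkdir -p /mnt/local-ssd/{postgres,elasticsearch,gitea,jupyter,kafka,zookeeper-data,zookeeper-log,prometheus,redis,auth}'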
=== ./k8s/tei/tei.yaml ===
# k8s/ai/tei/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: tei, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: tei } }
template:
metadata: { labels: { app: tei } }
spec:
containers:
- name: tei
image: ghcr.io/huggingface/text-embeddings-inference:cpu-latest
env: [{ name: MODEL_ID, value: "mixedbread-ai/mxbai-embed-large-v1" }]
ports: [{ containerPort: 80 }]
---
apiVersion: v1
kind: Service
metadata: { name: tei, namespace: ml }
spec: { selector: { app: tei }, ports: [ { port: 80, targetPort: 80 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: tei
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["embeddings.betelgeusebytes.io"], secretName: tei-tls }]
rules:
- host: embeddings.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: tei, port: { number: 80 } } }
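# Usage (a sketch): TEI serves embeddings on POST /embed with an "inputs" field:
#   curl -s https://embeddings.betelgeusebytes.io/embed \
#     -H 'Content-Type: application/json' -d '{"inputs": "the quick brown fox"}'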
=== ./k8s/trading/ib-gateway.yaml ===
apiVersion: v1
kind: Namespace
metadata:
name: trading
labels:
name: trading
environment: production
---
# OPTIONAL: Use this if you want to persist IB Gateway settings/logs
# across pod restarts. For most use cases, this is NOT needed since
# IB Gateway is mostly stateless and credentials are in Secrets.
#
# Only create this PV/PVC if you need to persist:
# - TWS session data
# - Custom workspace layouts
# - Historical API usage logs
apiVersion: v1
kind: PersistentVolume
metadata:
name: ib-gateway-data
labels:
type: local
app: ib-gateway
spec:
capacity:
storage: 5Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner # align with the repo's only StorageClass; "local-storage" is defined nowhere
local:
path: /mnt/local-ssd/ib-gateway # Adjust to your local SSD path
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ib-gateway-data
namespace: trading
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
  storageClassName: local-ssd-hetzner # must match the PV's class above
selector:
matchLabels:
app: ib-gateway
# To use this PVC, add to Deployment volumeMounts:
# - name: data
# mountPath: /root/Jts
# And to volumes:
# - name: data
# persistentVolumeClaim:
# claimName: ib-gateway-data
---
apiVersion: v1
kind: Secret
metadata:
name: ib-credentials
namespace: trading
type: Opaque
stringData:
# IMPORTANT: Replace these with your actual IB credentials
# For paper trading, use your paper trading account
username: "saladin85"
password: "3Lcd@05041985"
# Trading mode: "paper" or "live"
trading-mode: "paper"
# IB Gateway config (jts.ini equivalent)
# This enables headless mode and configures ports
ibgateway.conf: |
[IBGateway]
TradingMode=paper
ApiOnly=true
ReadOnlyApi=false
TrustedIPs=127.0.0.1
[IBGatewayAPI]
ApiPortNumber=4002
[Logon]
UseRemoteSettings=no
Locale=en
ColorPaletteName=dark
[Display]
ShowSplashScreen=no
---
apiVersion: v1
kind: ConfigMap
metadata:
name: ib-gateway-config
namespace: trading
data:
# Startup script to configure IB Gateway for headless operation
startup.sh: |
#!/bin/bash
set -e
echo "Starting IB Gateway in headless mode..."
echo "Trading Mode: ${TRADING_MODE}"
echo "Port: ${TWS_PORT}"
# Configure based on trading mode
if [ "${TRADING_MODE}" == "live" ]; then
export TWS_PORT=4001
echo "⚠️ LIVE TRADING MODE - USE WITH CAUTION ⚠️"
else
export TWS_PORT=4002
echo "📝 Paper Trading Mode (Safe)"
fi
# IMPORTANT: use the env vars provided by the Deployment
export IB_USERNAME="${TWS_USERID}"
export IB_PASSWORD="${TWS_PASSWORD}"
# Start IB Gateway
exec /opt/ibgateway/ibgateway-latest-standalone-linux-x64.sh \
--tws-path=/root/Jts \
--tws-settings-path=/root \
--user="${IB_USERNAME}" \
--pw="${IB_PASSWORD}" \
--mode="${TRADING_MODE}" \
--port="${TWS_PORT}"
# Health check script
  healthcheck.sh: |
    #!/bin/sh
    # Pure-Python TCP check against the TWS API port (no nc required)
    python - <<'PY'
    import os, socket, sys
    port = int(os.environ.get("TWS_PORT", "4002"))
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(2)
try:
s.connect(("127.0.0.1", port))
sys.exit(0)
except Exception:
sys.exit(1)
finally:
s.close()
PY
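# Hedged manual run of the check above. The active Deployment below only
# mounts the data PVC, so the script is piped in by hand (assumes kubectl
# access; the container name matches the Deployment):
#   kubectl -n trading get cm ib-gateway-config -o jsonpath='{.data.healthcheck\.sh}' \
#     | kubectl -n trading exec -i deploy/ib-gateway -c ib-gateway -- sh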
---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: ib-gateway
# namespace: trading
# labels:
# app: ib-gateway
# component: trading-infrastructure
# spec:
# replicas: 1 # IB Gateway should only have 1 instance per account
# strategy:
# type: Recreate # Avoid multiple simultaneous logins
# selector:
# matchLabels:
# app: ib-gateway
# template:
# metadata:
# labels:
# app: ib-gateway
# annotations:
# prometheus.io/scrape: "false" # No metrics endpoint by default
# spec:
# # Pin to hetzner-2 (matches your existing pattern)
# nodeSelector:
# kubernetes.io/hostname: hetzner-2
# # Security context
# securityContext:
# runAsNonRoot: false # IB Gateway requires root for VNC (even if unused)
# fsGroup: 1000
# containers:
# - name: ib-gateway
# # Using community-maintained IB Gateway image
# # Alternative: waytrade/ib-gateway:latest
# image: ghcr.io/gnzsnz/ib-gateway:stable
# imagePullPolicy: IfNotPresent
# env:
# - name: TWS_USERID
# valueFrom:
# secretKeyRef:
# name: ib-credentials
# key: username
# - name: TWS_PASSWORD
# valueFrom:
# secretKeyRef:
# name: ib-credentials
# key: password
# - name: TRADING_MODE
# valueFrom:
# secretKeyRef:
# name: ib-credentials
# key: trading-mode
# - name: TWS_PORT
# value: "4002" # Default to paper trading
# - name: READ_ONLY_API
# value: "no"
# # Ports
# ports:
# - name: paper-trading
# containerPort: 4002
# protocol: TCP
# - name: live-trading
# containerPort: 4001
# protocol: TCP
# - name: vnc
# containerPort: 5900
# protocol: TCP # VNC (not exposed externally)
# # Resource limits
# resources:
# requests:
# memory: "1Gi"
# cpu: "500m"
# limits:
# memory: "2Gi"
# cpu: "1000m"
# # Liveness probe (check if API port is responsive)
# startupProbe:
# tcpSocket:
# port: 4002
# initialDelaySeconds: 60 # Wait 60s before first check
# periodSeconds: 10 # Check every 10s
# timeoutSeconds: 5
# failureThreshold: 18 # 60s + (10s * 18) = 240s total startup time
# livenessProbe:
# tcpSocket:
# port: 4002
# initialDelaySeconds: 0 # IB Gateway takes time to start
# periodSeconds: 60
# timeoutSeconds: 5
# failureThreshold: 3
# # Readiness probe
# readinessProbe:
# tcpSocket:
# port: 4002
# initialDelaySeconds: 0
# periodSeconds: 10
# timeoutSeconds: 5
# failureThreshold: 2
# # Volume mounts for config
# volumeMounts:
# - name: ib-config
# mountPath: /root/Jts/jts.ini
# subPath: ibgateway.conf
# - name: startup-script
# mountPath: /startup.sh
# subPath: startup.sh
# - name: data
# mountPath: /root/Jts
# # Logging to stdout (Fluent Bit will collect)
# # IB Gateway logs go to /root/Jts/log by default
# lifecycle:
# postStart:
# exec:
# command:
# - /bin/sh
# - -c
# - |
# mkdir -p /root/Jts/log
# ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true
# volumes:
# - name: ib-config
# secret:
# secretName: ib-credentials
# defaultMode: 0644
# - name: startup-script
# configMap:
# name: ib-gateway-config
# defaultMode: 0755
# - name: data
# persistentVolumeClaim:
# claimName: ib-gateway-data
# # Restart policy
# restartPolicy: Always
# # DNS policy for internal cluster resolution
# dnsPolicy: ClusterFirst
apiVersion: apps/v1
kind: Deployment
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
component: trading-infrastructure
spec:
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app: ib-gateway
template:
metadata:
labels:
app: ib-gateway
annotations:
prometheus.io/scrape: "false"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
securityContext:
runAsNonRoot: false
fsGroup: 1000
# Seed writable jts.ini into the PVC once
initContainers:
- name: seed-jts-config
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /data
if [ ! -f /data/jts.ini ]; then
echo "Seeding jts.ini into PVC"
cp /config/ibgateway.conf /data/jts.ini
chmod 644 /data/jts.ini
else
echo "jts.ini already exists in PVC"
fi
volumeMounts:
- name: ib-config
mountPath: /config
readOnly: true
- name: data
mountPath: /data
containers:
# ------------------------------------------------------------------
# IB Gateway
# ------------------------------------------------------------------
- name: ib-gateway
image: ghcr.io/gnzsnz/ib-gateway:stable
imagePullPolicy: IfNotPresent
env:
- name: TWS_USERID
valueFrom:
secretKeyRef:
name: ib-credentials
key: username
- name: TWS_PASSWORD
valueFrom:
secretKeyRef:
name: ib-credentials
key: password
- name: TRADING_MODE
valueFrom:
secretKeyRef:
name: ib-credentials
key: trading-mode
- name: TWS_PORT
value: "4002"
- name: READ_ONLY_API
value: "no"
ports:
- name: ib-api-local
containerPort: 4002
protocol: TCP
- name: live-trading
containerPort: 4001
protocol: TCP
- name: vnc
containerPort: 5900
protocol: TCP
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# IMPORTANT: Probes should check the local IB port (4002)
startupProbe:
tcpSocket:
port: 4002
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 18
livenessProbe:
tcpSocket:
port: 4002
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
tcpSocket:
port: 4002
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 2
volumeMounts:
- name: data
mountPath: /root/Jts
lifecycle:
postStart:
exec:
command:
- sh
- -c
- |
mkdir -p /root/Jts/log
ln -sf /dev/stdout /root/Jts/log/ibgateway.log || true
# ------------------------------------------------------------------
# Sidecar TCP proxy: accepts cluster traffic, forwards to localhost:4002
# ------------------------------------------------------------------
- name: ib-api-proxy
image: alpine/socat:1.8.0.0
imagePullPolicy: IfNotPresent
args:
- "-d"
- "-d"
- "TCP-LISTEN:4003,fork,reuseaddr"
- "TCP:127.0.0.1:4002"
ports:
- name: ib-api
containerPort: 4003
protocol: TCP
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "128Mi"
cpu: "100m"
# basic probe: is proxy listening
readinessProbe:
tcpSocket:
port: 4003
periodSeconds: 5
timeoutSeconds: 2
failureThreshold: 3
volumes:
- name: ib-config
secret:
secretName: ib-credentials
defaultMode: 0644
- name: data
persistentVolumeClaim:
claimName: ib-gateway-data
restartPolicy: Always
dnsPolicy: ClusterFirst
---
# apiVersion: v1
# kind: Service
# metadata:
# name: ib-gateway
# namespace: trading
# labels:
# app: ib-gateway
# spec:
# type: ClusterIP # Internal-only, not exposed publicly
# clusterIP: None # Headless service (optional, remove if you want a stable ClusterIP)
# selector:
# app: ib-gateway
# ports:
# - name: paper-trading
# port: 4002
# targetPort: 4002
# protocol: TCP
# - name: live-trading
# port: 4001
# targetPort: 4001
# protocol: TCP
# sessionAffinity: ClientIP # Stick to same pod (important for stateful TWS sessions)
# sessionAffinityConfig:
# clientIP:
# timeoutSeconds: 3600 # 1 hour session stickiness
apiVersion: v1
kind: Service
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
spec:
type: ClusterIP
selector:
app: ib-gateway
ports:
- name: paper-trading
port: 4002
targetPort: 4003 # <-- proxy sidecar, not the gateway directly
protocol: TCP
- name: live-trading
port: 4001
targetPort: 4001
protocol: TCP
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 3600
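# Hedged in-cluster connectivity test for the proxied API (stdlib Python only,
# mirroring the healthcheck idiom above; the DNS name follows the standard
# <service>.<namespace>.svc form):
#   python3 - <<'PY'
#   import socket
#   s = socket.create_connection(("ib-gateway.trading.svc", 4002), timeout=3)
#   print("connected to", s.getpeername())
#   s.close()
#   PY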
=== ./k8s/trading/ib-gateway2.yaml ===
apiVersion: v1
kind: Namespace
metadata:
name: trading
labels:
name: trading
environment: production
---
apiVersion: v1
kind: Secret
metadata:
name: ib-credentials
namespace: trading
type: Opaque
stringData:
  # Rotate these credentials - the originals were pasted into this repo in clear text.
  username: "<IB_USERNAME>"  # placeholder
  password: "<IB_PASSWORD>"  # placeholder
trading-mode: "paper"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
component: trading-infrastructure
spec:
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app: ib-gateway
template:
metadata:
labels:
app: ib-gateway
annotations:
prometheus.io/scrape: "false"
spec:
nodeSelector:
kubernetes.io/hostname: hetzner-2
# Keep your original security context
securityContext:
runAsNonRoot: false
fsGroup: 1000
containers:
- name: ib-gateway
image: ghcr.io/gnzsnz/ib-gateway:stable
imagePullPolicy: IfNotPresent
# IMPORTANT: use env vars this image expects
env:
- name: TWS_USERID
valueFrom:
secretKeyRef:
name: ib-credentials
key: username
- name: TWS_PASSWORD
valueFrom:
secretKeyRef:
name: ib-credentials
key: password
- name: TRADING_MODE
valueFrom:
secretKeyRef:
name: ib-credentials
key: trading-mode
- name: READ_ONLY_API
value: "no"
# These two match what your log shows the image uses
- name: API_PORT
value: "4002"
- name: SOCAT_PORT
value: "4004"
# optional but nice
- name: TIME_ZONE
value: "Etc/UTC"
- name: TWOFA_TIMEOUT_ACTION
value: "exit"
ports:
# IB API ports (inside container / localhost use)
- name: api-paper
containerPort: 4002
protocol: TCP
- name: api-live
containerPort: 4001
protocol: TCP
# socat relay port for non-localhost clients (what we expose via Service)
- name: api-socat
containerPort: 4004
protocol: TCP
# optional UI/VNC
- name: vnc
containerPort: 5900
protocol: TCP
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# Probe the socat port (represents remote connectivity)
startupProbe:
tcpSocket:
port: 4004
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 18
readinessProbe:
tcpSocket:
port: 4004
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 2
livenessProbe:
tcpSocket:
port: 4004
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 3
restartPolicy: Always
dnsPolicy: ClusterFirst
---
apiVersion: v1
kind: Service
metadata:
name: ib-gateway
namespace: trading
labels:
app: ib-gateway
spec:
type: ClusterIP
selector:
app: ib-gateway
ports:
# Clients connect to 4002, but we forward to SOCAT_PORT=4004
- name: paper-trading
port: 4002
targetPort: 4004
protocol: TCP
  # If you truly need live trading, relay it through another socat port as well;
  # for now it is wired directly (or remove it entirely for safety).
- name: live-trading
port: 4001
targetPort: 4001
protocol: TCP
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 3600
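# Hedged local test via port-forward (assumes kubectl access; traffic lands on
# the socat relay at 4004 per the Service mapping above):
#   kubectl -n trading port-forward svc/ib-gateway 4002:4002
#   # ...then point an IB API client at 127.0.0.1:4002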
=== ./k8s/vector/qdrant.yaml ===
# k8s/vec/qdrant/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: qdrant-data, namespace: db }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 20Gi } }
---
# k8s/vec/qdrant/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: qdrant, namespace: db }
spec:
replicas: 1
selector: { matchLabels: { app: qdrant } }
template:
metadata: { labels: { app: qdrant } }
spec:
containers:
- name: qdrant
image: qdrant/qdrant:latest
ports:
- { containerPort: 6333 } # HTTP + Web UI
- { containerPort: 6334 } # gRPC
volumeMounts:
- { name: data, mountPath: /qdrant/storage }
volumes:
- name: data
persistentVolumeClaim: { claimName: qdrant-data }
---
apiVersion: v1
kind: Service
metadata: { name: qdrant, namespace: db }
spec:
selector: { app: qdrant }
ports:
- { name: http, port: 80, targetPort: 6333 }
- { name: grpc, port: 6334, targetPort: 6334 }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: qdrant
namespace: db
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["vector.betelgeusebytes.io"], secretName: qdrant-tls }]
rules:
- host: vector.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend: { service: { name: qdrant, port: { number: 80 } } }
---
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-qdrant
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/qdrant
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
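# Hedged smoke test against Qdrant's standard REST API once DNS and TLS
# resolve (the collection name "demo" and vector size 384 are illustrative):
#   curl https://vector.betelgeusebytes.io/collections
#   curl -X PUT https://vector.betelgeusebytes.io/collections/demo \
#     -H 'Content-Type: application/json' \
#     -d '{"vectors": {"size": 384, "distance": "Cosine"}}'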
=== ./k8s/vllm/vllm.yaml ===
# PV
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-vllm
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: local-ssd-hetzner
local:
path: /mnt/local-ssd/vllm
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- hetzner-2
---
# k8s/ai/vllm/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: vllm-auth, namespace: ml }
type: Opaque
stringData: { API_KEY: "replace_me" }
---
# k8s/ai/ollama/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: ollama, namespace: ml }
spec:
replicas: 1
selector: { matchLabels: { app: ollama } }
template:
metadata: { labels: { app: ollama } }
spec:
securityContext:
runAsUser: 0 # needed so the init can write into /root/.ollama
initContainers:
- name: warm-models
image: ollama/ollama:latest
command: ["/bin/sh","-c"]
args:
- |
ollama serve & # start a temp daemon
sleep 2
# pull one or more small, quantized models for CPU
ollama pull qwen2.5:3b-instruct-q4_K_M || true
ollama pull llama3.2:3b-instruct-q4_K_M || true
pkill ollama || true
volumeMounts:
- { name: data, mountPath: /root/.ollama }
containers:
- name: ollama
image: ollama/ollama:latest
env:
- { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser
ports:
- { containerPort: 11434 }
volumeMounts:
- { name: data, mountPath: /root/.ollama }
resources:
requests: { cpu: "2", memory: "4Gi" }
limits: { cpu: "4", memory: "8Gi" }
volumes:
- name: data
persistentVolumeClaim: { claimName: ollama-data }
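# Hedged check that the warm-models init actually populated the shared cache
# (assumes kubectl access; deploy/ollama resolves once the rollout completes):
#   kubectl -n ml exec deploy/ollama -- ollama list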
---
# k8s/ai/ollama/svc-ing.yaml
apiVersion: v1
kind: Service
metadata: { name: ollama, namespace: ml }
spec:
selector: { app: ollama }
ports: [ { name: http, port: 80, targetPort: 11434 } ]
# ---
# # old k8s/ai/vllm/deploy.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata: { name: vllm, namespace: ml }
# spec:
# replicas: 1
# selector: { matchLabels: { app: vllm } }
# template:
# metadata: { labels: { app: vllm } }
# spec:
# containers:
# - name: vllm
# image: vllm/vllm-openai:latest
# args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"]
# env:
# - name: VLLM_API_KEY
# valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } }
# ports: [{ containerPort: 8000 }]
# resources:
# limits:
# nvidia.com/gpu: 1
# requests:
# nvidia.com/gpu: 1
# volumeMounts:
# - { name: cache, mountPath: /root/.cache/huggingface }
# volumes:
# - name: cache
# persistentVolumeClaim: { claimName: vllm-cache-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: ollama-data, namespace: ml }
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: local-ssd-hetzner
resources: { requests: { storage: 50Gi } }
# ---
# old k8s/ai/vllm/svc-ing.yaml
# apiVersion: v1
# kind: Service
# metadata: { name: vllm, namespace: ml }
# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: vllm
namespace: ml
annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
ingressClassName: nginx
tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }]
rules:
- host: llm.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
        backend: { service: { name: ollama, port: { number: 80 } } }  # the vllm Service is commented out above; ollama is the live backend
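# Hedged end-to-end smoke test via the ingress. /api/generate and the payload
# shape are Ollama's standard REST API; the model tag matches one pulled by
# the warm-models init container above:
#   curl https://llm.betelgeusebytes.io/api/generate \
#     -d '{"model": "qwen2.5:3b-instruct-q4_K_M", "prompt": "Hello", "stream": false}'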