# PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-vllm
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/vllm
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
# k8s/ai/vllm/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: vllm-auth, namespace: ml }
type: Opaque
stringData: { API_KEY: "replace_me" }
---
# k8s/ai/ollama/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: ollama, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: ollama } }
  template:
    metadata: { labels: { app: ollama } }
    spec:
      securityContext:
        runAsUser: 0   # needed so the init container can write into /root/.ollama
      initContainers:
        - name: warm-models
          image: ollama/ollama:latest
          command: ["/bin/sh", "-c"]
          args:
            - |
              ollama serve &   # start a temporary daemon just for pulling
              sleep 2
              # pull one or more small, quantized models suited to CPU inference
              ollama pull qwen2.5:3b-instruct-q4_K_M || true
              ollama pull llama3.2:3b-instruct-q4_K_M || true
              pkill ollama || true
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
      containers:
        - name: ollama
          image: ollama/ollama:latest
          env:
            - { name: OLLAMA_ORIGINS, value: "*" }   # CORS, if you call it from a browser
          ports:
            - { containerPort: 11434 }
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
          resources:
            requests: { cpu: "2", memory: "4Gi" }
            limits: { cpu: "4", memory: "8Gi" }
      volumes:
        - name: data
          persistentVolumeClaim: { claimName: ollama-data }
---
# k8s/ai/ollama/svc-ing.yaml
apiVersion: v1
kind: Service
metadata: { name: ollama, namespace: ml }
spec:
  selector: { app: ollama }
  ports: [ { name: http, port: 80, targetPort: 11434 } ]
# ---
# # old k8s/ai/vllm/deploy.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata: { name: vllm, namespace: ml }
# spec:
#   replicas: 1
#   selector: { matchLabels: { app: vllm } }
#   template:
#     metadata: { labels: { app: vllm } }
#     spec:
#       containers:
#         - name: vllm
#           image: vllm/vllm-openai:latest
#           args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"]
#           env:
#             - name: VLLM_API_KEY
#               valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } }
#           ports: [{ containerPort: 8000 }]
#           resources:
#             limits:
#               nvidia.com/gpu: 1
#             requests:
#               nvidia.com/gpu: 1
#           volumeMounts:
#             - { name: cache, mountPath: /root/.cache/huggingface }
#       volumes:
#         - name: cache
#           persistentVolumeClaim: { claimName: vllm-cache-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: ollama-data, namespace: ml }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 50Gi } }
# ---
# # old k8s/ai/vllm/svc-ing.yaml
# apiVersion: v1
# kind: Service
# metadata: { name: vllm, namespace: ml }
# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }]
  rules:
    - host: llm.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            # the vllm Service is commented out above, so route to the live ollama Service
            backend: { service: { name: ollama, port: { number: 80 } } }
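# ---------------------------------------------------------------
# Assumed companion manifest (not in the original file): the PV and PVC
# above reference storageClassName local-ssd-hetzner, which must exist
# somewhere in the cluster. A minimal sketch of what that StorageClass
# would look like for statically provisioned local disks; the name and
# placement are assumptions. WaitForFirstConsumer delays binding until a
# pod is scheduled, so the PV's node affinity (hetzner-2) is honored.
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: local-ssd-hetzner
provisioner: kubernetes.io/no-provisioner   # static local PVs only; no dynamic provisioning
volumeBindingMode: WaitForFirstConsumer     # bind at pod scheduling time, not at PVC creation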
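# ---------------------------------------------------------------
# Quick smoke test (a sketch, assuming kubectl access to the ml namespace
# and that DNS plus the Let's Encrypt certificate for
# llm.betelgeusebytes.io are already live):
#
#   # in-cluster check, bypassing the Ingress:
#   kubectl -n ml port-forward deploy/ollama 11434:11434
#   curl http://localhost:11434/api/tags     # should list the warmed models
#
#   # through the Ingress; Ollama also exposes an OpenAI-compatible API:
#   curl https://llm.betelgeusebytes.io/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model":"qwen2.5:3b-instruct-q4_K_M","messages":[{"role":"user","content":"hello"}]}'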