betelgeusebytes/k8s/vllm/vllm.yaml

# PV: local SSD volume on node hetzner-2 (pinned via nodeAffinity), class local-ssd-hetzner
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-vllm
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/vllm
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
---
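# API key for the OpenAI-compatible vLLM server; the commented-out vllm Deployment
# below reads it as VLLM_API_KEY. "replace_me" is a placeholder, not a real key.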
# k8s/ai/vllm/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: vllm-auth, namespace: ml }
type: Opaque
stringData: { API_KEY: "replace_me" }
---
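# CPU-only Ollama Deployment. The init container starts a temporary daemon and
# pre-pulls small quantized models into the ollama-data PVC, so the main
# container comes up with a warm model cache.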
# k8s/ai/ollama/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: ollama, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: ollama } }
  template:
    metadata: { labels: { app: ollama } }
    spec:
      securityContext:
        runAsUser: 0 # needed so the init can write into /root/.ollama
      initContainers:
        - name: warm-models
          image: ollama/ollama:latest
          command: ["/bin/sh", "-c"]
          args:
            - |
              ollama serve &   # start a temp daemon
              sleep 2
              # pull one or more small, quantized models for CPU
              ollama pull qwen2.5:3b-instruct-q4_K_M || true
              ollama pull llama3.2:3b-instruct-q4_K_M || true
              pkill ollama || true
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
      containers:
        - name: ollama
          image: ollama/ollama:latest
          env:
            - { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser
          ports:
            - { containerPort: 11434 }
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
          resources:
            requests: { cpu: "2", memory: "4Gi" }
            limits: { cpu: "4", memory: "8Gi" }
      volumes:
        - name: data
          persistentVolumeClaim: { claimName: ollama-data }
---
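# ClusterIP Service exposing Ollama's API (container port 11434) as port 80 in-cluster.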
# k8s/ai/ollama/svc-ing.yaml
apiVersion: v1
kind: Service
metadata: { name: ollama, namespace: ml }
spec:
  selector: { app: ollama }
  ports: [ { name: http, port: 80, targetPort: 11434 } ]
# ---
# # old k8s/ai/vllm/deploy.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata: { name: vllm, namespace: ml }
# spec:
#   replicas: 1
#   selector: { matchLabels: { app: vllm } }
#   template:
#     metadata: { labels: { app: vllm } }
#     spec:
#       containers:
#         - name: vllm
#           image: vllm/vllm-openai:latest
#           args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"]
#           env:
#             - name: VLLM_API_KEY
#               valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } }
#           ports: [{ containerPort: 8000 }]
#           resources:
#             limits:
#               nvidia.com/gpu: 1
#             requests:
#               nvidia.com/gpu: 1
#           volumeMounts:
#             - { name: cache, mountPath: /root/.cache/huggingface }
#       volumes:
#         - name: cache
#           persistentVolumeClaim: { claimName: vllm-cache-pvc }
---
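# PVC backing the Ollama model cache. With storageClassName local-ssd-hetzner and a
# matching 50Gi request, it is expected to bind to the pv-vllm local volume above,
# which effectively pins the pod to hetzner-2.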
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: ollama-data, namespace: ml }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 50Gi } }
# ---
# # old k8s/ai/vllm/svc-ing.yaml
# apiVersion: v1
# kind: Service
# metadata: { name: vllm, namespace: ml }
# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] }
---
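# Ingress for llm.betelgeusebytes.io with TLS issued by cert-manager (letsencrypt-prod).
# Note: the backend still targets a Service named "vllm", which is commented out above;
# either restore that Service or point the backend at the ollama Service (port 80).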
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }]
  rules:
    - host: llm.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend: { service: { name: vllm, port: { number: 80 } } }