# PV
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-vllm
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-ssd-hetzner
  local:
    path: /mnt/local-ssd/vllm
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - hetzner-2
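
# Binding check (a sketch; assumes kubectl is pointed at this cluster and that
# /mnt/local-ssd/vllm already exists on hetzner-2, since `local` PVs are not
# created automatically):
#   kubectl get pv pv-vllm
#   kubectl get pvc -n ml ollama-data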
---
# k8s/ai/vllm/secret.yaml
apiVersion: v1
kind: Secret
metadata: { name: vllm-auth, namespace: ml }
type: Opaque
stringData: { API_KEY: "replace_me" }
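
# "replace_me" is a deliberate placeholder. One way to set a real key without
# committing it (illustrative only):
#   kubectl -n ml create secret generic vllm-auth \
#     --from-literal=API_KEY="$(openssl rand -hex 32)" \
#     --dry-run=client -o yaml | kubectl apply -f -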
---
# k8s/ai/ollama/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata: { name: ollama, namespace: ml }
spec:
  replicas: 1
  selector: { matchLabels: { app: ollama } }
  template:
    metadata: { labels: { app: ollama } }
    spec:
      securityContext:
        runAsUser: 0 # needed so the init can write into /root/.ollama
      initContainers:
        - name: warm-models
          image: ollama/ollama:latest
          command: ["/bin/sh","-c"]
          args:
            - |
              ollama serve & # start a temp daemon
              sleep 2
              # pull one or more small, quantized models for CPU
              ollama pull qwen2.5:3b-instruct-q4_K_M || true
              ollama pull llama3.2:3b-instruct-q4_K_M || true
              pkill ollama || true
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
      containers:
        - name: ollama
          image: ollama/ollama:latest
          env:
            - { name: OLLAMA_ORIGINS, value: "*" } # CORS if you call from browser
          ports:
            - { containerPort: 11434 }
          volumeMounts:
            - { name: data, mountPath: /root/.ollama }
          resources:
            requests: { cpu: "2", memory: "4Gi" }
            limits: { cpu: "4", memory: "8Gi" }
      volumes:
        - name: data
          persistentVolumeClaim: { claimName: ollama-data }
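
# Smoke test once the pod is Ready (model names must match the pulls above):
#   kubectl -n ml port-forward deploy/ollama 11434:11434
#   curl http://localhost:11434/api/generate \
#     -d '{"model":"qwen2.5:3b-instruct-q4_K_M","prompt":"hello","stream":false}'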
---
# k8s/ai/ollama/svc-ing.yaml
apiVersion: v1
kind: Service
metadata: { name: ollama, namespace: ml }
spec:
  selector: { app: ollama }
  ports: [ { name: http, port: 80, targetPort: 11434 } ]
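
# In-cluster callers can reach Ollama at http://ollama.ml.svc.cluster.local
# (Service port 80 forwards to the container's 11434).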
# ---
# # old k8s/ai/vllm/deploy.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata: { name: vllm, namespace: ml }
# spec:
#   replicas: 1
#   selector: { matchLabels: { app: vllm } }
#   template:
#     metadata: { labels: { app: vllm } }
#     spec:
#       containers:
#         - name: vllm
#           image: vllm/vllm-openai:latest
#           args: ["--model","Qwen/Qwen2.5-7B-Instruct","--max-model-len","8192","--port","8000","--host","0.0.0.0"]
#           env:
#             - name: VLLM_API_KEY
#               valueFrom: { secretKeyRef: { name: vllm-auth, key: API_KEY } }
#           ports: [{ containerPort: 8000 }]
#           resources:
#             limits:
#               nvidia.com/gpu: 1
#             requests:
#               nvidia.com/gpu: 1
#           volumeMounts:
#             - { name: cache, mountPath: /root/.cache/huggingface }
#       volumes:
#         - name: cache
#           persistentVolumeClaim: { claimName: vllm-cache-pvc }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata: { name: ollama-data, namespace: ml }
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-ssd-hetzner
  resources: { requests: { storage: 50Gi } }
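
# This claim is what binds the pv-vllm volume above (same storage class and
# size). Assuming local-ssd-hetzner is a no-provisioner class with
# volumeBindingMode: WaitForFirstConsumer, the PVC stays Pending until the
# ollama pod schedules; the PV's nodeAffinity then pins it to hetzner-2.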
# ---
# # old k8s/ai/vllm/svc-ing.yaml
# apiVersion: v1
# kind: Service
# metadata: { name: vllm, namespace: ml }
# spec: { selector: { app: vllm }, ports: [ { port: 80, targetPort: 8000 } ] }
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  namespace: ml
  annotations: { cert-manager.io/cluster-issuer: letsencrypt-prod }
spec:
  ingressClassName: nginx
  tls: [{ hosts: ["llm.betelgeusebytes.io"], secretName: vllm-tls }]
  rules:
    - host: llm.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            # the vllm Service is retired above; ollama now serves this host
            backend: { service: { name: ollama, port: { number: 80 } } }
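
# External smoke test (assumes DNS for llm.betelgeusebytes.io points at the
# nginx ingress controller and cert-manager has issued the vllm-tls cert):
#   curl https://llm.betelgeusebytes.io/api/tags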