betelgeusebytes/k8s/scripts/cleanup.sh

319 lines
14 KiB
Bash
Executable File

#!/bin/bash
set -e
echo "=========================================================="
echo "Removing Existing Monitoring Stack"
echo "=========================================================="
echo ""
# ANSI color codes; they are interpreted later via 'echo -e'.
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly GREEN='\033[0;32m'
readonly NC='\033[0m' # No Color
echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}"
cat <<'EOF'
 - Prometheus (standalone or operator)
 - Grafana
 - Fluent Bit
 - Vector
 - Loki
 - Tempo
 - Node exporters
 - kube-state-metrics
 - Any monitoring/prometheus/grafana namespaces

EOF
echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}"
echo ""
# Abort unless the operator types exactly "yes".
read -p "Are you sure you want to continue? (yes/no): " confirm
case "$confirm" in
yes) ;;
*)
echo "Cleanup cancelled."
exit 0
;;
esac
echo ""
echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}"
# Common namespace names for monitoring
NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging")
for ns in "${NAMESPACES[@]}"; do
if kubectl get namespace "$ns" &> /dev/null; then
echo -e "${GREEN}Found namespace: $ns${NC}"
# Show what's in the namespace
echo " Resources in $ns:"
kubectl get all -n "$ns" 2>/dev/null | head -20 || true
echo ""
read -p " Delete namespace '$ns'? (yes/no): " delete_ns
if [ "$delete_ns" = "yes" ]; then
echo " Deleting namespace $ns..."
kubectl delete namespace "$ns" --timeout=120s || {
echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}"
# Fix: run the forced delete in the foreground. The original backgrounded
# it with '&' and never ran 'wait', so the script could exit while the
# delete was still in flight and its failure went completely unnoticed.
# NOTE(review): a namespace stuck in Terminating is usually held by
# finalizers; --force alone may not unstick it.
kubectl delete namespace "$ns" --grace-period=0 --force || \
echo -e "${YELLOW} Warning: Forced deletion of $ns failed; check for finalizers${NC}"
}
fi
fi
done
echo ""
echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}"
# Check if helm is available
if command -v helm &> /dev/null; then
echo "Checking for Helm releases..."
# Common Helm release names
RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack")
for release in "${RELEASES[@]}"; do
# Fix: match the release name exactly against column 1 of 'helm list -A'.
# The original used a substring grep, so "prometheus" also matched
# "prometheus-operator" and "kube-prometheus-stack", and the captured
# namespace could span several lines, breaking 'helm uninstall -n "$ns"'.
mapfile -t release_ns < <(helm list -A 2>/dev/null | awk -v r="$release" '$1 == r {print $2}')
for ns in "${release_ns[@]}"; do
echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}"
read -p " Uninstall Helm release '$release'? (yes/no): " uninstall
if [ "$uninstall" = "yes" ]; then
echo " Uninstalling $release..."
helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}"
fi
done
done
else
echo "Helm not found, skipping Helm releases check"
fi
echo ""
echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}"
# Log/metric collectors commonly installed as DaemonSets.
DS_NAMES=(node-exporter fluent-bit fluentd vector)
echo "Checking for monitoring DaemonSets..."
for ns in kube-system default; do
if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then
echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}"
kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector"
read -p " Delete these DaemonSets? (yes/no): " delete_ds
if [ "$delete_ds" = "yes" ]; then
# Delete by the conventional app label first, then by bare resource
# name, so both labelled and unlabelled installs are caught.
for target in "${DS_NAMES[@]}"; do
kubectl delete daemonset -n "$ns" -l "app=$target" --ignore-not-found
done
for target in "${DS_NAMES[@]}"; do
kubectl delete daemonset -n "$ns" "$target" --ignore-not-found
done
fi
fi
done
# Remove common Deployments
echo ""
echo "Checking for monitoring Deployments..."
for ns in kube-system default; do
if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring Deployments in $ns${NC}"
kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo"
read -p " Delete these Deployments? (yes/no): " delete_deploy
if [ "$delete_deploy" = "yes" ]; then
kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found
kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found
kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found
kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found
kubectl delete deployment -n "$ns" prometheus --ignore-not-found
kubectl delete deployment -n "$ns" grafana --ignore-not-found
kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found
# Fix: the detection grep above also matches loki/tempo, but the
# original name-based deletes skipped them, leaving unlabelled
# loki/tempo Deployments behind.
kubectl delete deployment -n "$ns" loki --ignore-not-found
kubectl delete deployment -n "$ns" tempo --ignore-not-found
fi
fi
done
# Remove common StatefulSets
echo ""
echo "Checking for monitoring StatefulSets..."
STS_NAMES=(prometheus grafana loki tempo)
for ns in kube-system default; do
if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}"
kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
read -p " Delete these StatefulSets? (yes/no): " delete_sts
if [ "$delete_sts" = "yes" ]; then
# Delete by the conventional app label first, then by bare resource
# name, so both labelled and unlabelled installs are caught.
for target in "${STS_NAMES[@]}"; do
kubectl delete statefulset -n "$ns" -l "app=$target" --ignore-not-found
done
for target in "${STS_NAMES[@]}"; do
kubectl delete statefulset -n "$ns" "$target" --ignore-not-found
done
fi
fi
done
echo ""
echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}"
# Ask before removing ConfigMaps (they might contain important configs)
echo "Checking for monitoring ConfigMaps..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then
echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}"
kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent"
read -p " Delete these ConfigMaps? (yes/no): " delete_cm
if [ "$delete_cm" = "yes" ]; then
kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found
kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found
kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found
# Fix: the detection grep above also matches tempo and fluentd, but the
# original delete list omitted their labels, so those ConfigMaps were
# shown to the user yet never removed.
kubectl delete configmap -n "$ns" -l app=tempo --ignore-not-found
kubectl delete configmap -n "$ns" -l app=fluentd --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}"
# Cluster-scoped RBAC objects created by common monitoring installs.
RBAC_NAMES=(prometheus grafana kube-state-metrics fluent-bit node-exporter)
echo "Checking for monitoring ClusterRoles..."
if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoles${NC}"
kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoles? (yes/no): " delete_cr
if [ "$delete_cr" = "yes" ]; then
for name in "${RBAC_NAMES[@]}"; do
kubectl delete clusterrole "$name" --ignore-not-found
done
fi
fi
echo "Checking for monitoring ClusterRoleBindings..."
if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}"
kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb
if [ "$delete_crb" = "yes" ]; then
for name in "${RBAC_NAMES[@]}"; do
kubectl delete clusterrolebinding "$name" --ignore-not-found
done
fi
fi
echo ""
echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}"
# Check for monitoring PVCs
echo "Checking for monitoring PersistentVolumeClaims..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring PVCs in $ns${NC}"
kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}"
read -p " Delete these PVCs? (yes/no): " delete_pvc
if [ "$delete_pvc" = "yes" ]; then
# Delete by the conventional app label first...
for app in prometheus grafana loki tempo; do
kubectl delete pvc -n "$ns" -l "app=$app" --ignore-not-found
done
# ...then by name pattern, to catch PVCs that carry no app label.
kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true
fi
fi
fi
done
# Check for monitoring PVs
echo ""
echo "Checking for monitoring PersistentVolumes..."
pv_pattern='prometheus|grafana|loki|tempo|monitoring'
if kubectl get pv 2>/dev/null | grep -qE "$pv_pattern"; then
echo -e "${GREEN}Found monitoring PVs${NC}"
kubectl get pv | grep -E "$pv_pattern"
echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}"
read -p " Delete these PVs? (yes/no): " delete_pv
if [ "$delete_pv" = "yes" ]; then
kubectl get pv -o name | grep -E "$pv_pattern" | xargs -r kubectl delete || true
fi
fi
echo ""
echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}"
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then
echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}"
kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki"
read -p " Delete these Ingresses? (yes/no): " delete_ing
if [ "$delete_ing" = "yes" ]; then
kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found
# Fix: the detection grep above also matches loki, but the original
# never deleted loki Ingresses it had just shown to the user.
kubectl delete ingress -n "$ns" -l app=loki --ignore-not-found
kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found
kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found
kubectl delete ingress -n "$ns" loki-ingress --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}"
# Check for Prometheus Operator CRDs
if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then
echo -e "${GREEN}Found Prometheus Operator CRDs${NC}"
kubectl get crd | grep "monitoring.coreos.com"
echo ""
echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}"
read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd
if [ "$delete_crd" = "yes" ]; then
# Fix: delete every CRD in the monitoring.coreos.com group instead of a
# hard-coded list. The original list missed CRDs added by newer operator
# releases (e.g. scrapeconfigs.monitoring.coreos.com), so the cleanup the
# user just confirmed against the listing above was incomplete.
kubectl get crd -o name | grep "monitoring.coreos.com" | xargs -r kubectl delete || true
fi
fi
echo ""
echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}"
echo ""
# Manual step: host-path data cannot be removed from inside the cluster here.
cat <<'EOF'
You may have monitoring data stored on your nodes at:
 - /mnt/local-ssd/prometheus
 - /mnt/local-ssd/grafana
 - /mnt/local-ssd/loki
 - /mnt/local-ssd/tempo
 - /var/lib/prometheus
 - /var/lib/grafana

To remove these, SSH to each node and run:
 sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}
 sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}

EOF
# The answer is informational only; nothing below branches on it.
read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs
echo ""
echo -e "${GREEN}=========================================================="
echo "Existing Monitoring Stack Cleanup Complete!"
echo "==========================================================${NC}"
echo ""
cat <<'EOF'
Summary of actions taken:
 - Removed monitoring namespaces (if confirmed)
 - Uninstalled Helm releases (if found and confirmed)
 - Removed standalone monitoring components
 - Removed monitoring ConfigMaps
 - Removed RBAC resources
 - Removed PVCs and PVs (if confirmed)
 - Removed Ingresses
 - Removed Prometheus Operator CRDs (if confirmed)

EOF
echo -e "${YELLOW}Next Steps:${NC}"
cat <<'EOF'
1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'
2. Clean up node data directories (see above)
3. Deploy new observability stack: ./deploy.sh
EOF
echo ""