betelgeusebytes/k8s/scripts/cleanup.sh

319 lines
14 KiB
Bash
Executable File

#!/bin/bash
set -e
echo "=========================================================="
echo "Removing Existing Monitoring Stack"
echo "=========================================================="
echo ""
# ANSI color codes; they are interpreted later via 'echo -e'.
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly GREEN='\033[0;32m'
readonly NC='\033[0m' # No Color
echo -e "${YELLOW}This script will remove common monitoring deployments including:${NC}"
cat <<'EOF'
 - Prometheus (standalone or operator)
 - Grafana
 - Fluent Bit
 - Vector
 - Loki
 - Tempo
 - Node exporters
 - kube-state-metrics
 - Any monitoring/prometheus/grafana namespaces

EOF
echo -e "${RED}WARNING: This will delete all existing monitoring data!${NC}"
echo ""
# Abort unless the operator types exactly "yes".
read -p "Are you sure you want to continue? (yes/no): " confirm
case "$confirm" in
yes) ;;
*)
echo "Cleanup cancelled."
exit 0
;;
esac
echo ""
echo -e "${YELLOW}Step 1: Checking for existing monitoring namespaces...${NC}"
# Common namespace names for monitoring
NAMESPACES=("monitoring" "prometheus" "grafana" "loki" "tempo" "logging")
for ns in "${NAMESPACES[@]}"; do
if kubectl get namespace "$ns" &> /dev/null; then
echo -e "${GREEN}Found namespace: $ns${NC}"
# Show what's in the namespace
echo " Resources in $ns:"
kubectl get all -n "$ns" 2>/dev/null | head -20 || true
echo ""
read -p " Delete namespace '$ns'? (yes/no): " delete_ns
if [ "$delete_ns" = "yes" ]; then
echo " Deleting namespace $ns..."
kubectl delete namespace "$ns" --timeout=120s || {
echo -e "${YELLOW} Warning: Namespace deletion timed out, forcing...${NC}"
# Fix: run the forced delete in the foreground. The original backgrounded
# it with '&' and never ran 'wait', so the script could exit while the
# delete was still in flight and its failure went completely unnoticed.
# NOTE(review): a namespace stuck in Terminating is usually held by
# finalizers; --force alone may not unstick it.
kubectl delete namespace "$ns" --grace-period=0 --force || \
echo -e "${YELLOW} Warning: Forced deletion of $ns failed; check for finalizers${NC}"
}
fi
fi
done
echo ""
echo -e "${YELLOW}Step 2: Removing common monitoring Helm releases...${NC}"
# Check if helm is available
if command -v helm &> /dev/null; then
echo "Checking for Helm releases..."
# Common Helm release names
RELEASES=("prometheus" "grafana" "loki" "tempo" "fluent-bit" "prometheus-operator" "kube-prometheus-stack")
for release in "${RELEASES[@]}"; do
# Fix: match the release name exactly against column 1 of 'helm list -A'.
# The original used a substring grep, so "prometheus" also matched
# "prometheus-operator" and "kube-prometheus-stack", and the captured
# namespace could span several lines, breaking 'helm uninstall -n "$ns"'.
mapfile -t release_ns < <(helm list -A 2>/dev/null | awk -v r="$release" '$1 == r {print $2}')
for ns in "${release_ns[@]}"; do
echo -e "${GREEN}Found Helm release: $release in namespace $ns${NC}"
read -p " Uninstall Helm release '$release'? (yes/no): " uninstall
if [ "$uninstall" = "yes" ]; then
echo " Uninstalling $release..."
helm uninstall "$release" -n "$ns" || echo -e "${YELLOW} Warning: Failed to uninstall $release${NC}"
fi
done
done
else
echo "Helm not found, skipping Helm releases check"
fi
echo ""
echo -e "${YELLOW}Step 3: Removing standalone monitoring components...${NC}"
# Log/metric collectors commonly installed as DaemonSets.
DS_NAMES=(node-exporter fluent-bit fluentd vector)
echo "Checking for monitoring DaemonSets..."
for ns in kube-system default; do
if kubectl get daemonset -n "$ns" 2>/dev/null | grep -q "node-exporter\|fluent-bit\|fluentd\|vector"; then
echo -e "${GREEN}Found monitoring DaemonSets in $ns${NC}"
kubectl get daemonset -n "$ns" | grep -E "node-exporter|fluent-bit|fluentd|vector"
read -p " Delete these DaemonSets? (yes/no): " delete_ds
if [ "$delete_ds" = "yes" ]; then
# Delete by the conventional app label first, then by bare resource
# name, so both labelled and unlabelled installs are caught.
for target in "${DS_NAMES[@]}"; do
kubectl delete daemonset -n "$ns" -l "app=$target" --ignore-not-found
done
for target in "${DS_NAMES[@]}"; do
kubectl delete daemonset -n "$ns" "$target" --ignore-not-found
done
fi
fi
done
# Remove common Deployments
echo ""
echo "Checking for monitoring Deployments..."
for ns in kube-system default; do
if kubectl get deployment -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring Deployments in $ns${NC}"
kubectl get deployment -n "$ns" | grep -E "prometheus|grafana|kube-state-metrics|loki|tempo"
read -p " Delete these Deployments? (yes/no): " delete_deploy
if [ "$delete_deploy" = "yes" ]; then
kubectl delete deployment -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete deployment -n "$ns" -l app=grafana --ignore-not-found
kubectl delete deployment -n "$ns" -l app=kube-state-metrics --ignore-not-found
kubectl delete deployment -n "$ns" -l app=loki --ignore-not-found
kubectl delete deployment -n "$ns" -l app=tempo --ignore-not-found
kubectl delete deployment -n "$ns" prometheus --ignore-not-found
kubectl delete deployment -n "$ns" grafana --ignore-not-found
kubectl delete deployment -n "$ns" kube-state-metrics --ignore-not-found
# Fix: the detection grep above also matches loki/tempo, but the
# original name-based deletes skipped them, leaving unlabelled
# loki/tempo Deployments behind.
kubectl delete deployment -n "$ns" loki --ignore-not-found
kubectl delete deployment -n "$ns" tempo --ignore-not-found
fi
fi
done
# Remove common StatefulSets
echo ""
echo "Checking for monitoring StatefulSets..."
STS_NAMES=(prometheus grafana loki tempo)
for ns in kube-system default; do
if kubectl get statefulset -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring StatefulSets in $ns${NC}"
kubectl get statefulset -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
read -p " Delete these StatefulSets? (yes/no): " delete_sts
if [ "$delete_sts" = "yes" ]; then
# Delete by the conventional app label first, then by bare resource
# name, so both labelled and unlabelled installs are caught.
for target in "${STS_NAMES[@]}"; do
kubectl delete statefulset -n "$ns" -l "app=$target" --ignore-not-found
done
for target in "${STS_NAMES[@]}"; do
kubectl delete statefulset -n "$ns" "$target" --ignore-not-found
done
fi
fi
done
echo ""
echo -e "${YELLOW}Step 4: Removing monitoring ConfigMaps...${NC}"
# Ask before removing ConfigMaps (they might contain important configs)
echo "Checking for monitoring ConfigMaps..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get configmap -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo\|fluent"; then
echo -e "${GREEN}Found monitoring ConfigMaps in $ns${NC}"
kubectl get configmap -n "$ns" | grep -E "prometheus|grafana|loki|tempo|fluent"
read -p " Delete these ConfigMaps? (yes/no): " delete_cm
if [ "$delete_cm" = "yes" ]; then
kubectl delete configmap -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete configmap -n "$ns" -l app=grafana --ignore-not-found
kubectl delete configmap -n "$ns" -l app=loki --ignore-not-found
kubectl delete configmap -n "$ns" -l app=fluent-bit --ignore-not-found
# Fix: the detection grep above also matches tempo and fluentd, but the
# original delete list omitted their labels, so those ConfigMaps were
# shown to the user yet never removed.
kubectl delete configmap -n "$ns" -l app=tempo --ignore-not-found
kubectl delete configmap -n "$ns" -l app=fluentd --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 5: Removing ClusterRoles and ClusterRoleBindings...${NC}"
# Cluster-scoped RBAC objects created by common monitoring installs.
RBAC_NAMES=(prometheus grafana kube-state-metrics fluent-bit node-exporter)
echo "Checking for monitoring ClusterRoles..."
if kubectl get clusterrole 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoles${NC}"
kubectl get clusterrole | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoles? (yes/no): " delete_cr
if [ "$delete_cr" = "yes" ]; then
for name in "${RBAC_NAMES[@]}"; do
kubectl delete clusterrole "$name" --ignore-not-found
done
fi
fi
echo "Checking for monitoring ClusterRoleBindings..."
if kubectl get clusterrolebinding 2>/dev/null | grep -q "prometheus\|grafana\|kube-state-metrics\|fluent-bit\|node-exporter"; then
echo -e "${GREEN}Found monitoring ClusterRoleBindings${NC}"
kubectl get clusterrolebinding | grep -E "prometheus|grafana|kube-state-metrics|fluent-bit|node-exporter"
read -p " Delete these ClusterRoleBindings? (yes/no): " delete_crb
if [ "$delete_crb" = "yes" ]; then
for name in "${RBAC_NAMES[@]}"; do
kubectl delete clusterrolebinding "$name" --ignore-not-found
done
fi
fi
echo ""
echo -e "${YELLOW}Step 6: Removing PVCs and PVs...${NC}"
# Check for monitoring PVCs
echo "Checking for monitoring PersistentVolumeClaims..."
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get pvc -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki\|tempo"; then
echo -e "${GREEN}Found monitoring PVCs in $ns${NC}"
kubectl get pvc -n "$ns" | grep -E "prometheus|grafana|loki|tempo"
echo -e "${RED} WARNING: Deleting PVCs will delete all stored data!${NC}"
read -p " Delete these PVCs? (yes/no): " delete_pvc
if [ "$delete_pvc" = "yes" ]; then
# Delete by the conventional app label first...
for app in prometheus grafana loki tempo; do
kubectl delete pvc -n "$ns" -l "app=$app" --ignore-not-found
done
# ...then by name pattern, to catch PVCs that carry no app label.
kubectl get pvc -n "$ns" -o name | grep -E "prometheus|grafana|loki|tempo" | xargs -r kubectl delete -n "$ns" || true
fi
fi
fi
done
# Check for monitoring PVs
echo ""
echo "Checking for monitoring PersistentVolumes..."
pv_pattern='prometheus|grafana|loki|tempo|monitoring'
if kubectl get pv 2>/dev/null | grep -qE "$pv_pattern"; then
echo -e "${GREEN}Found monitoring PVs${NC}"
kubectl get pv | grep -E "$pv_pattern"
echo -e "${RED} WARNING: Deleting PVs may delete data on disk!${NC}"
read -p " Delete these PVs? (yes/no): " delete_pv
if [ "$delete_pv" = "yes" ]; then
kubectl get pv -o name | grep -E "$pv_pattern" | xargs -r kubectl delete || true
fi
fi
echo ""
echo -e "${YELLOW}Step 7: Checking for monitoring Ingresses...${NC}"
for ns in kube-system default monitoring prometheus grafana; do
if kubectl get namespace "$ns" &> /dev/null; then
if kubectl get ingress -n "$ns" 2>/dev/null | grep -q "prometheus\|grafana\|loki"; then
echo -e "${GREEN}Found monitoring Ingresses in $ns${NC}"
kubectl get ingress -n "$ns" | grep -E "prometheus|grafana|loki"
read -p " Delete these Ingresses? (yes/no): " delete_ing
if [ "$delete_ing" = "yes" ]; then
kubectl delete ingress -n "$ns" -l app=prometheus --ignore-not-found
kubectl delete ingress -n "$ns" -l app=grafana --ignore-not-found
# Fix: the detection grep above also matches loki, but the original
# never deleted loki Ingresses it had just shown to the user.
kubectl delete ingress -n "$ns" -l app=loki --ignore-not-found
kubectl delete ingress -n "$ns" prometheus-ingress --ignore-not-found
kubectl delete ingress -n "$ns" grafana-ingress --ignore-not-found
kubectl delete ingress -n "$ns" loki-ingress --ignore-not-found
fi
fi
fi
done
echo ""
echo -e "${YELLOW}Step 8: Checking for Prometheus Operator CRDs...${NC}"
# Check for Prometheus Operator CRDs
if kubectl get crd 2>/dev/null | grep -q "monitoring.coreos.com"; then
echo -e "${GREEN}Found Prometheus Operator CRDs${NC}"
kubectl get crd | grep "monitoring.coreos.com"
echo ""
echo -e "${RED}WARNING: Deleting these CRDs will remove ALL Prometheus Operator resources cluster-wide!${NC}"
read -p " Delete Prometheus Operator CRDs? (yes/no): " delete_crd
if [ "$delete_crd" = "yes" ]; then
# Fix: delete every CRD in the monitoring.coreos.com group instead of a
# hard-coded list. The original list missed CRDs added by newer operator
# releases (e.g. scrapeconfigs.monitoring.coreos.com), so the cleanup the
# user just confirmed against the listing above was incomplete.
kubectl get crd -o name | grep "monitoring.coreos.com" | xargs -r kubectl delete || true
fi
fi
echo ""
echo -e "${YELLOW}Step 9: Optional - Clean up data directories on nodes...${NC}"
echo ""
# Manual step: host-path data cannot be removed from inside the cluster here.
cat <<'EOF'
You may have monitoring data stored on your nodes at:
 - /mnt/local-ssd/prometheus
 - /mnt/local-ssd/grafana
 - /mnt/local-ssd/loki
 - /mnt/local-ssd/tempo
 - /var/lib/prometheus
 - /var/lib/grafana

To remove these, SSH to each node and run:
 sudo rm -rf /mnt/local-ssd/{prometheus,grafana,loki,tempo}
 sudo rm -rf /var/lib/{prometheus,grafana,loki,tempo}

EOF
# The answer is informational only; nothing below branches on it.
read -p "Have you cleaned up the data directories? (yes to continue, no to skip): " cleanup_dirs
echo ""
echo -e "${GREEN}=========================================================="
echo "Existing Monitoring Stack Cleanup Complete!"
echo "==========================================================${NC}"
echo ""
cat <<'EOF'
Summary of actions taken:
 - Removed monitoring namespaces (if confirmed)
 - Uninstalled Helm releases (if found and confirmed)
 - Removed standalone monitoring components
 - Removed monitoring ConfigMaps
 - Removed RBAC resources
 - Removed PVCs and PVs (if confirmed)
 - Removed Ingresses
 - Removed Prometheus Operator CRDs (if confirmed)

EOF
echo -e "${YELLOW}Next Steps:${NC}"
cat <<'EOF'
1. Verify cleanup: kubectl get all -A | grep -E 'prometheus|grafana|loki|tempo|monitoring'
2. Clean up node data directories (see above)
3. Deploy new observability stack: ./deploy.sh
EOF
echo ""