Compare commits

..

2 Commits

Author SHA1 Message Date
MickLesk ca7bf1212a Source core.func for shared messaging in disk-health
Replace locally duplicated color variables and msg_* helpers with
core.func + load_functions, matching the pattern used by update-apps
and pve-privilege-converter. Telemetry remains via api.func only.
2026-06-26 21:54:07 +02:00
MickLesk 9bacda8c21 Add disk-health tool (SMART + NVMe)
New PVE host tool that reports drive health for all physical disks:

- Installs smartmontools and nvme-cli on demand.
- Enumerates physical disks (skipping loop/zram/dm devices) and prints an
  overall SMART verdict plus key indicators (temperature, power-on hours,
  wear, reallocated/pending/offline sectors, CRC errors; NVMe spare,
  percentage used and media errors), highlighting non-zero error counters.
- Optionally starts a non-destructive short SMART self-test on a selected
  disk.
2026-06-26 21:41:15 +02:00
2 changed files with 156 additions and 158 deletions
+156
View File
@@ -0,0 +1,156 @@
#!/usr/bin/env bash
# Copyright (c) 2021-2026 community-scripts ORG
# Author: MickLesk (CanbiZ)
# License: MIT
# https://github.com/community-scripts/ProxmoxVE/raw/main/LICENSE
# Fetch the shared helper library over HTTPS and evaluate it in this shell.
# NOTE(review): the msg_* helpers and color variables used further down are
# presumably defined by this file together with load_functions — confirm.
source <(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/refs/heads/main/misc/core.func)
# Telemetry helpers are optional: stderr of the source is discarded and a
# failing source is ignored via "|| true".
source <(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/main/misc/api.func) 2>/dev/null || true
load_functions
# Start telemetry only when the init hook exists as a shell function.
declare -f init_tool_telemetry &>/dev/null && init_tool_telemetry "disk-health" "pve"
# Clear the terminal and print the tool's ASCII-art banner on stdout.
header_info() {
  clear
  # Quoted delimiter: the banner is emitted literally, so the backtick and
  # backslashes in the art are never expanded.
  cat <<'EOF'
____ _ __ __ __ ____ __
/ __ \(_)____/ /__ / / / /__ ____ _/ / /_/ /_
/ / / / / ___/ //_/ / /_/ / _ \/ __ `/ / __/ __ \
/ /_/ / (__ ) ,< / __ / __/ /_/ / / /_/ / / /
/_____/_/____/_/|_| /_/ /_/\___/\__,_/_/\__/_/ /_/
EOF
}
header_info

# Must run as root (SMART access requires it)
[[ "$(id -u)" -eq 0 ]] || {
  msg_error "This script must be run as root."
  exit 1
}

# A missing pveversion command is treated as "not a Proxmox VE host".
command -v pveversion &>/dev/null || {
  msg_error "No Proxmox VE detected!"
  exit 1
}
# Install required tooling on demand.
#
# The package index is refreshed at most once, right before the first install
# that is actually needed. Previously the refresh only happened in the
# smartmontools branch, so a host that already had smartctl but lacked nvme
# tried to install nvme-cli against an index that was never refreshed.
APT_INDEX_REFRESHED=0

if ! command -v smartctl >/dev/null 2>&1; then
  msg_info "Installing smartmontools"
  apt-get update &>/dev/null
  APT_INDEX_REFRESHED=1
  if apt-get install -y smartmontools &>/dev/null; then
    msg_ok "Installed smartmontools"
  else
    # smartctl is mandatory for every report below, so this is fatal.
    msg_error "Failed to install smartmontools"
    exit 1
  fi
fi

if ! command -v nvme >/dev/null 2>&1; then
  msg_info "Installing nvme-cli"
  if [ "$APT_INDEX_REFRESHED" -eq 0 ]; then
    apt-get update &>/dev/null
    APT_INDEX_REFRESHED=1
  fi
  if apt-get install -y nvme-cli &>/dev/null; then
    msg_ok "Installed nvme-cli"
  else
    # Deliberately non-fatal: the script continues without nvme-cli.
    msg_error "nvme-cli not available (NVMe details limited)"
  fi
fi
# Collect physical disks (exclude loop, zram and device-mapper devices).
# Only rows whose TYPE column is "disk" are kept; the name filter then drops
# anything starting with loop, zram or dm-. The result is sorted by name.
mapfile -t DISKS < <(
  lsblk -dn -o NAME,TYPE |
    awk '$2 == "disk" && $1 !~ /^(loop|zram|dm-)/ { print $1 }' |
    sort
)

if [[ "${#DISKS[@]}" -eq 0 ]]; then
  msg_error "No physical disks found."
  exit 0
fi
#######################################
# Look up one attribute in "smartctl -A" table output.
# Arguments:
#   $1 - the captured table text
#   $2 - attribute name, compared against the second column
# Outputs:
#   The tenth whitespace-separated field of the first matching row;
#   nothing when no row matches.
#######################################
sata_attr() {
  local smart_table="$1" attr_name="$2"
  printf '%s\n' "$smart_table" |
    awk -v wanted="$attr_name" '$2 == wanted { print $10; exit }'
}
#######################################
# Print one colorized SMART error-counter line.
# Arguments:
#   $1 - label (already padded by the caller)
#   $2 - counter value; may be empty
# Outputs:
#   Nothing for an empty value. Otherwise one line where the value is
#   green for zero, red for a positive integer and yellow for anything that
#   is not a plain non-negative integer.
#######################################
print_attr() {
  local label="$1" val="$2"
  if [[ -z "$val" ]]; then
    return 0
  fi
  if [[ "$val" =~ ^0+$ ]]; then
    echo -e " ${label} ${GN}${val}${CL}"
  elif [[ "$val" =~ ^[0-9]+$ ]]; then
    # String match instead of "-gt": also correct for values too large for
    # shell integer arithmetic.
    echo -e " ${label} ${RD}${val}${CL}"
  else
    # Not a plain integer: the counter cannot be judged, so do not present
    # it as healthy (the old numeric test failed here and printed green).
    echo -e " ${label} ${YW}${val}${CL}"
  fi
}

#######################################
# Print a health report for one block device.
# Globals:
#   BL, GN, YW, RD, CL (read) - color variables used in the output
# Arguments:
#   $1 - kernel device name without the /dev/ prefix (e.g. sda, nvme0n1)
# Outputs:
#   A header with path, size and model, the overall SMART verdict and, per
#   device type, the key indicators. Names starting with "nvme" get the
#   filtered "smartctl -A" lines; all others get selected ATA attributes.
#######################################
report_disk() {
  local dev="$1"
  local path="/dev/${dev}"
  local model size health attrs
  model=$(lsblk -dn -o MODEL "$path" 2>/dev/null | sed 's/[[:space:]]*$//')
  size=$(lsblk -dn -o SIZE "$path" 2>/dev/null | tr -d ' ')

  echo -e "\n${BL}======================================================${CL}"
  echo -e "${GN}${path}${CL} ${YW}${size:-?}${CL} ${model:-Unknown model}"
  echo -e "${BL}======================================================${CL}"

  # Overall SMART health verdict: keep only the text after the last colon of
  # the verdict line.
  health=$(smartctl -H "$path" 2>/dev/null | grep -iE "SMART overall-health|SMART Health Status" | sed 's/.*: *//')
  if [ -z "$health" ]; then
    echo -e " Health: ${YW}SMART not available for this device${CL}"
  elif echo "$health" | grep -qiE "PASSED|OK"; then
    echo -e " Health: ${GN}${health}${CL}"
  else
    echo -e " Health: ${RD}${health}${CL}"
  fi

  attrs=$(smartctl -A "$path" 2>/dev/null)

  if [[ "$dev" == nvme* ]]; then
    # NVMe output is "Key: value" lines; show the relevant ones indented.
    echo "$attrs" | grep -iE "Temperature:|Available Spare:|Percentage Used:|Data Units Written:|Power On Hours:|Unsafe Shutdowns:|Media and Data Integrity Errors:" |
      sed 's/^/ /'
    return 0
  fi

  local poh temp realloc pending offline crc wear
  poh=$(sata_attr "$attrs" "Power_On_Hours")
  temp=$(sata_attr "$attrs" "Temperature_Celsius")
  # Some drives expose their temperature only under this attribute name.
  if [[ -z "$temp" ]]; then
    temp=$(sata_attr "$attrs" "Airflow_Temperature_Cel")
  fi
  realloc=$(sata_attr "$attrs" "Reallocated_Sector_Ct")
  pending=$(sata_attr "$attrs" "Current_Pending_Sector")
  offline=$(sata_attr "$attrs" "Offline_Uncorrectable")
  crc=$(sata_attr "$attrs" "UDMA_CRC_Error_Count")
  wear=$(sata_attr "$attrs" "Wear_Leveling_Count")
  if [[ -z "$wear" ]]; then
    wear=$(sata_attr "$attrs" "Media_Wearout_Indicator")
  fi

  if [[ -n "$temp" ]]; then
    echo -e " Temperature: ${temp} C"
  fi
  if [[ -n "$poh" ]]; then
    echo -e " Power On Hours: ${poh}"
  fi
  if [[ -n "$wear" ]]; then
    echo -e " Wear Leveling/Wearout: ${wear}"
  fi

  print_attr "Reallocated Sectors: " "$realloc"
  print_attr "Pending Sectors: " "$pending"
  print_attr "Offline Uncorrectable:" "$offline"
  print_attr "UDMA CRC Errors: " "$crc"
}
# Redraw the banner, then emit one report per collected disk.
header_info
echo -e "${YW}Scanning ${#DISKS[@]} disk(s) for SMART health...${CL}"

for disk in "${DISKS[@]}"; do
  report_disk "$disk"
done

echo
# Offer an optional, non-destructive short self-test
if whiptail --backtitle "Proxmox VE Helper Scripts" --title "SMART Self-Test" \
  --yesno "Health report complete.\n\nDo you want to start a non-destructive SHORT SMART self-test on a disk?\n\n(The test runs in the background; check results later with: smartctl -a /dev/XXX)" 14 70; then

  # One radiolist row per disk: tag, description, initial state.
  TEST_MENU=()
  for disk in "${DISKS[@]}"; do
    TEST_MENU+=("$disk" "/dev/$disk" "OFF")
  done

  # whiptail reports the choice on stderr; swap the streams to capture it,
  # then drop any double quotes from the returned tag.
  sel=$(whiptail --backtitle "Proxmox VE Helper Scripts" --title "Select Disk for Short Self-Test" \
    --radiolist "\nSelect a disk:\n" 16 60 6 "${TEST_MENU[@]}" 3>&1 1>&2 2>&3)
  sel=${sel//\"/}

  # An empty selection (cancel, or nothing ticked) skips the test silently.
  if [[ -n "$sel" ]]; then
    msg_info "Starting short self-test on /dev/$sel"
    if ! smartctl -t short "/dev/$sel" &>/dev/null; then
      msg_error "Could not start self-test on /dev/$sel"
    else
      msg_ok "Short self-test started on /dev/$sel"
      echo -e "${YW}Check progress/result with: ${GN}smartctl -a /dev/$sel${CL}"
    fi
  fi
fi

echo -e "\n${GN}Disk health check complete.${CL}\n"
-158
View File
@@ -1,158 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) 2021-2026 community-scripts ORG
# Author: MickLesk (CanbiZ)
# License: MIT
# https://github.com/community-scripts/ProxmoxVE/raw/main/LICENSE
# Fetch the shared helper library over HTTPS and evaluate it in this shell.
# NOTE(review): the msg_* helpers and color variables used further down are
# presumably defined by this file together with load_functions — confirm.
source <(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/refs/heads/main/misc/core.func)
# Telemetry helpers are optional: stderr of the source is discarded and a
# failing source is ignored via "|| true".
source <(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/main/misc/api.func) 2>/dev/null || true
load_functions
# Start telemetry only when the init hook exists as a shell function.
declare -f init_tool_telemetry &>/dev/null && init_tool_telemetry "iommu-setup" "pve"
# Clear the terminal and print the tool's ASCII-art banner on stdout.
header_info() {
  clear
  # Quoted delimiter: the banner is emitted literally, so the backslashes in
  # the art are never interpreted.
  cat <<'EOF'
____ ____ __ _____ __ ____ _____ __
/ _/ / __ \/ |/ / |/ / / / / / ___/___ / /___ ______
/ / / / / / /|_/ / /|_/ / / / / \__ \/ _ \/ __/ / / / __ \
_/ / / /_/ / / / / / / / /_/ / ___/ / __/ /_/ /_/ / /_/ /
/___/ \____/_/ /_/_/ /_/\____/ /____/\___/\__/\__,_/ .___/
/_/
EOF
}
header_info

# Guards: each check prints an error and stops the script when it fails.

# 1) root privileges
[[ "$(id -u)" -eq 0 ]] || {
  msg_error "This script must be run as root."
  exit 1
}

# 2) the pveversion command must exist
command -v pveversion &>/dev/null || {
  msg_error "No Proxmox VE detected!"
  exit 1
}

# 3) pve-manager version must be 8.0-8.4 or any 9.x
pveversion | grep -Eq "pve-manager/(8\.[0-4]|9\.[0-9]+)(\.[0-9]+)*" || {
  msg_error "This version of Proxmox Virtual Environment is not supported."
  msg_error "Requires Proxmox Virtual Environment Version 8.0-8.4 or 9.x."
  exit 1
}

# 4) bare metal only
# systemd-detect-virt prints "none" but exits non-zero on bare metal, so a
# `|| echo none` fallback would duplicate the value; capture output as-is.
virt=$(systemd-detect-virt 2>/dev/null)
case "$virt" in
  "" | none) ;;
  *)
    msg_error "IOMMU/PCI passthrough must be configured on bare metal. Detected: $virt"
    exit 1
    ;;
esac
#######################################
# Check whether a kernel parameter is already present in a cmdline string.
# Arguments:
#   $1 - space-separated command line
#   $2 - parameter to look for (matched literally, as a whole token)
# Returns:
#   0 when the token is present, 1 otherwise.
#######################################
has_token() {
  # Padding both sides with a space turns whole-token matching into a plain
  # substring test; the quoted pattern part is never treated as a glob.
  [[ " $1 " == *" $2 "* ]]
}
# Detect CPU vendor and the matching kernel parameters.
# lscpu runs under LC_ALL=C so its field labels are not translated; with a
# non-English locale the "Vendor ID:" label can differ, the grep then finds
# nothing and a supported CPU would be rejected as "unknown".
cpu_vendor=$(LC_ALL=C lscpu | grep -oP 'Vendor ID:\s*\K\S+' | head -n 1)
case "$cpu_vendor" in
  GenuineIntel) IOMMU_PARAMS=("intel_iommu=on" "iommu=pt") ;;
  AuthenticAMD) IOMMU_PARAMS=("amd_iommu=on" "iommu=pt") ;;
  *)
    msg_error "Unsupported CPU vendor: ${cpu_vendor:-unknown}"
    exit 1
    ;;
esac
# Report current IOMMU state: active means the iommu_groups directory exists
# and has at least one entry.
iommu_active="no"
if [[ -d /sys/kernel/iommu_groups && -n "$(ls -A /sys/kernel/iommu_groups 2>/dev/null)" ]]; then
  iommu_active="yes"
fi

if [[ "$iommu_active" == "yes" ]]; then
  iommu_state=$(echo -e "${GN}yes${CL}")
else
  iommu_state=$(echo -e "${RD}no${CL}")
fi

echo -e "${BL}CPU vendor:${CL} ${cpu_vendor}"
echo -e "${BL}IOMMU active:${CL} ${iommu_state}"
echo -e "${BL}Kernel params:${CL} ${IOMMU_PARAMS[*]}"
echo

# Ask for confirmation; declining either dialog ends the script successfully.
if [[ "$iommu_active" == "yes" ]]; then
  if ! whiptail --backtitle "Proxmox VE Helper Scripts" --title "IOMMU Already Active" \
    --yesno "IOMMU already appears to be active on this host.\n\nDo you still want to (re)apply the kernel parameters and vfio modules?" 12 70; then
    echo -e "${GN}Nothing to do.${CL}"
    exit 0
  fi
elif ! whiptail --backtitle "Proxmox VE Helper Scripts" --title "Enable IOMMU / PCI(e) Passthrough" \
  --yesno "This will enable IOMMU for PCI(e) passthrough by:\n\n - adding '${IOMMU_PARAMS[*]}' to the kernel command line\n - loading the vfio kernel modules\n\nA reboot is required afterwards. A backup of the modified boot config is created.\n\nProceed?" 16 74; then
  exit 0
fi
# Determine the boot configuration in use
# proxmox-boot-tool managed systems (ZFS / UEFI) use /etc/kernel/cmdline,
# everything else uses GRUB via /etc/default/grub.
# GRUB is the default; it is overridden only when both the tool and the
# cmdline file are present.
BOOT_MODE="grub"
if command -v proxmox-boot-tool &>/dev/null && [[ -f /etc/kernel/cmdline ]]; then
  BOOT_MODE="systemd-boot"
fi
#######################################
# Merge the IOMMU kernel parameters into GRUB_CMDLINE_LINUX_DEFAULT and
# regenerate the GRUB configuration.
# Globals:
#   IOMMU_PARAMS (read) - parameters to add when not already present
# Arguments:
#   $1 - optional path of the GRUB defaults file (default: /etc/default/grub)
# Returns:
#   1 when the backup copy cannot be created (the file is left untouched),
#   otherwise the exit status of update-grub.
#######################################
apply_grub() {
  local file="${1:-/etc/default/grub}"
  local current="" merged escaped tok has_line=0

  # Never edit an existing file without a timestamped backup next to it.
  if [[ -e "$file" ]]; then
    cp -a "$file" "${file}.bak.$(date +%Y%m%d%H%M%S)" || return 1
  fi

  if grep -q '^GRUB_CMDLINE_LINUX_DEFAULT=' "$file"; then
    has_line=1
    # Use the last assignment and strip one surrounding pair of double quotes.
    current=$(sed -n 's/^GRUB_CMDLINE_LINUX_DEFAULT=//p' "$file" | tail -1)
    current="${current%\"}"
    current="${current#\"}"
  fi

  merged="$current"
  for tok in "${IOMMU_PARAMS[@]}"; do
    has_token "$merged" "$tok" || merged="${merged:+$merged }$tok"
  done

  if [[ "$has_line" -eq 1 ]]; then
    # Escape the characters that are special in a sed replacement using "|"
    # as delimiter, so an existing value containing &, \ or | is written
    # back verbatim instead of corrupting the line.
    escaped=$(printf '%s' "$merged" | sed 's/[\\&|]/\\&/g')
    sed -i "s|^GRUB_CMDLINE_LINUX_DEFAULT=.*|GRUB_CMDLINE_LINUX_DEFAULT=\"${escaped}\"|" "$file"
  else
    echo "GRUB_CMDLINE_LINUX_DEFAULT=\"${merged}\"" >>"$file"
  fi

  update-grub &>/dev/null
}
#######################################
# Merge the IOMMU kernel parameters into the kernel command-line file and
# refresh the boot entries through proxmox-boot-tool.
# Globals:
#   IOMMU_PARAMS (read) - parameters to add when not already present
# Arguments:
#   $1 - optional path of the cmdline file (default: /etc/kernel/cmdline)
# Returns:
#   1 when the backup copy cannot be created (the file is left untouched),
#   otherwise the exit status of "proxmox-boot-tool refresh".
#######################################
apply_systemd_boot() {
  local file="${1:-/etc/kernel/cmdline}"
  local current merged tok

  # Never rewrite the file without a timestamped backup next to it.
  cp -a "$file" "${file}.bak.$(date +%Y%m%d%H%M%S)" || return 1

  # Read the file with its newlines removed.
  current=$(tr -d '\n' <"$file")

  merged="$current"
  for tok in "${IOMMU_PARAMS[@]}"; do
    has_token "$merged" "$tok" || merged="${merged:+$merged }$tok"
  done

  printf '%s\n' "$merged" >"$file"
  proxmox-boot-tool refresh &>/dev/null
}
# Apply the kernel parameters through the handler for the detected boot mode.
# The handler's exit status is checked so that a failed boot-config refresh
# is reported as an error instead of being followed by a success message.
msg_info "Applying kernel parameters via ${BOOT_MODE}"
apply_status=0
if [ "$BOOT_MODE" = "systemd-boot" ]; then
  apply_systemd_boot || apply_status=$?
else
  apply_grub || apply_status=$?
fi
if [ "$apply_status" -ne 0 ]; then
  msg_error "Failed to apply kernel parameters (${BOOT_MODE})"
  exit 1
fi
msg_ok "Applied kernel parameters (${BOOT_MODE})"
# Load vfio modules at boot (vfio_virqfd was merged into the core in
# kernel 6.2+, so it is intentionally not added here)
msg_info "Configuring vfio modules"
for module in vfio vfio_iommu_type1 vfio_pci; do
  # Append only when no line of /etc/modules is exactly this module name.
  if ! grep -qxF "$module" /etc/modules 2>/dev/null; then
    echo "$module" >>/etc/modules
  fi
done
msg_ok "Configured vfio modules"

# Closing notes: reboot reminder and verification commands.
echo -e "\n${GN}IOMMU configuration written.${CL}"
echo -e "${YW}A reboot is required to activate IOMMU.${CL}"
echo -e "After rebooting, verify with: ${BL}dmesg | grep -e DMAR -e IOMMU${CL}"
echo -e "and list groups with: ${BL}find /sys/kernel/iommu_groups/ -type l${CL}\n"