Compare commits

..

1 Commits

Author SHA1 Message Date
CanbiZ (MickLesk)
c1eb00dabe fix(cluster): validate container IDs cluster-wide across all nodes
- Query /cluster/resources via pvesh to check all VMs/CTs on ALL nodes
- Check /etc/pve/nodes/*/qemu-server and /etc/pve/nodes/*/lxc dirs
- Handles pmxcfs sync delays that caused sporadic ID conflicts
- Remove duplicate validate_container_id/get_valid_container_id definitions
- Add max_attempts safeguard to prevent infinite loops
2026-02-14 14:02:40 +01:00
5 changed files with 112 additions and 188 deletions

View File

@@ -287,32 +287,6 @@ get_error_text() {
fi fi
} }
# ------------------------------------------------------------------------------
# build_error_string()
#
# - Assembles the error text that is handed to telemetry reporting
# - Always emits a header line: "exit_code=<N> | <explanation>"
#   where <explanation> is whatever explain_exit_code prints for <N>
# - When log text is supplied, appends "\n---\n" followed by that text verbatim
# - Arguments:
#     * $1: exit_code (defaults to 1 when omitted or empty)
#     * $2: log_text  (optional; e.g. the output of get_error_text)
# - Output: the assembled string on stdout, without a trailing newline
# ------------------------------------------------------------------------------
build_error_string() {
  local exit_code="${1:-1}"
  local log_tail="${2:-}"
  local summary
  summary=$(explain_exit_code "$exit_code")

  # The header is common to both shapes, so print it unconditionally.
  printf 'exit_code=%s | %s' "$exit_code" "$summary"

  # Separator and log section only exist when there is something to show.
  if [[ -n "$log_tail" ]]; then
    printf '\n---\n%s' "$log_tail"
  fi
}
# ============================================================================== # ==============================================================================
# SECTION 2: TELEMETRY FUNCTIONS # SECTION 2: TELEMETRY FUNCTIONS
# ============================================================================== # ==============================================================================
@@ -691,12 +665,13 @@ post_update_to_api() {
else else
exit_code=1 exit_code=1
fi fi
# Get log lines and build structured error string
local error_text="" local error_text=""
error_text=$(get_error_text) error_text=$(get_error_text)
local full_error if [[ -n "$error_text" ]]; then
full_error=$(build_error_string "$exit_code" "$error_text") error=$(json_escape "$error_text")
error=$(json_escape "$full_error") else
error=$(json_escape "$(explain_exit_code "$exit_code")")
fi
short_error=$(json_escape "$(explain_exit_code "$exit_code")") short_error=$(json_escape "$(explain_exit_code "$exit_code")")
error_category=$(categorize_error "$exit_code") error_category=$(categorize_error "$exit_code")
[[ -z "$error" ]] && error="Unknown error" [[ -z "$error" ]] && error="Unknown error"
@@ -839,52 +814,31 @@ EOF
categorize_error() { categorize_error() {
local code="$1" local code="$1"
case "$code" in case "$code" in
# Network errors (curl/wget) # Network errors
6 | 7 | 22 | 35) echo "network" ;; 6 | 7 | 22 | 28 | 35) echo "network" ;;
# Timeout errors # Storage errors
28 | 124 | 211) echo "timeout" ;; 214 | 217 | 219) echo "storage" ;;
# Storage errors (Proxmox storage) # Dependency/Package errors
214 | 217 | 219 | 224) echo "storage" ;; 100 | 101 | 102 | 127 | 160 | 161 | 162) echo "dependency" ;;
# Dependency/Package errors (APT, DPKG, pip, commands)
100 | 101 | 102 | 127 | 160 | 161 | 162 | 255) echo "dependency" ;;
# Permission errors # Permission errors
126 | 152) echo "permission" ;; 126 | 152) echo "permission" ;;
# Configuration errors (Proxmox config, invalid args) # Timeout errors
128 | 203 | 204 | 205 | 206 | 207 | 208) echo "config" ;; 124 | 28 | 211) echo "timeout" ;;
# Proxmox container/template errors # Configuration errors
200 | 209 | 210 | 212 | 213 | 215 | 216 | 218 | 220 | 221 | 222 | 223 | 225 | 231) echo "proxmox" ;; 203 | 204 | 205 | 206 | 207 | 208) echo "config" ;;
# Service/Systemd errors
150 | 151 | 153 | 154) echo "service" ;;
# Database errors (PostgreSQL, MySQL, MongoDB)
170 | 171 | 172 | 173 | 180 | 181 | 182 | 183 | 190 | 191 | 192 | 193) echo "database" ;;
# Node.js / JavaScript runtime errors
243 | 245 | 246 | 247 | 248 | 249) echo "runtime" ;;
# Python environment errors
# (already covered: 160-162 under dependency)
# Aborted by user # Aborted by user
130) echo "aborted" ;; 130) echo "aborted" ;;
# Resource errors (OOM, SIGKILL, SIGABRT) # Resource errors (OOM, etc)
134 | 137) echo "resource" ;; 137 | 134) echo "resource" ;;
# Signal/Process errors (SIGTERM, SIGPIPE, SIGSEGV) # Default
139 | 141 | 143) echo "signal" ;;
# Shell errors (general error, syntax error)
1 | 2) echo "shell" ;;
# Default - truly unknown
*) echo "unknown" ;; *) echo "unknown" ;;
esac esac
} }
@@ -947,9 +901,11 @@ post_tool_to_api() {
[[ ! "$exit_code" =~ ^[0-9]+$ ]] && exit_code=1 [[ ! "$exit_code" =~ ^[0-9]+$ ]] && exit_code=1
local error_text="" local error_text=""
error_text=$(get_error_text) error_text=$(get_error_text)
local full_error if [[ -n "$error_text" ]]; then
full_error=$(build_error_string "$exit_code" "$error_text") error=$(json_escape "$error_text")
error=$(json_escape "$full_error") else
error=$(json_escape "$(explain_exit_code "$exit_code")")
fi
error_category=$(categorize_error "$exit_code") error_category=$(categorize_error "$exit_code")
fi fi
@@ -1012,9 +968,11 @@ post_addon_to_api() {
[[ ! "$exit_code" =~ ^[0-9]+$ ]] && exit_code=1 [[ ! "$exit_code" =~ ^[0-9]+$ ]] && exit_code=1
local error_text="" local error_text=""
error_text=$(get_error_text) error_text=$(get_error_text)
local full_error if [[ -n "$error_text" ]]; then
full_error=$(build_error_string "$exit_code" "$error_text") error=$(json_escape "$error_text")
error=$(json_escape "$full_error") else
error=$(json_escape "$(explain_exit_code "$exit_code")")
fi
error_category=$(categorize_error "$exit_code") error_category=$(categorize_error "$exit_code")
fi fi
@@ -1109,9 +1067,11 @@ post_update_to_api_extended() {
fi fi
local error_text="" local error_text=""
error_text=$(get_error_text) error_text=$(get_error_text)
local full_error if [[ -n "$error_text" ]]; then
full_error=$(build_error_string "$exit_code" "$error_text") error=$(json_escape "$error_text")
error=$(json_escape "$full_error") else
error=$(json_escape "$(explain_exit_code "$exit_code")")
fi
error_category=$(categorize_error "$exit_code") error_category=$(categorize_error "$exit_code")
[[ -z "$error" ]] && error="Unknown error" [[ -z "$error" ]] && error="Unknown error"
fi fi

View File

@@ -277,8 +277,9 @@ install_ssh_keys_into_ct() {
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# validate_container_id() # validate_container_id()
# #
# - Validates if a container ID is available for use # - Validates if a container ID is available for use (CLUSTER-WIDE)
# - Checks if ID is already used by VM or LXC container # - Checks cluster resources via pvesh for VMs/CTs on ALL nodes
# - Falls back to local config file check if pvesh unavailable
# - Checks if ID is used in LVM logical volumes # - Checks if ID is used in LVM logical volumes
# - Returns 0 if ID is available, 1 if already in use # - Returns 0 if ID is available, 1 if already in use
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
@@ -290,11 +291,35 @@ validate_container_id() {
return 1 return 1
fi fi
# Check if config file exists for VM or LXC # CLUSTER-WIDE CHECK: Query all VMs/CTs across all nodes
# This catches IDs used on other nodes in the cluster
# NOTE: Works on single-node too - Proxmox always has internal cluster structure
# Falls back gracefully if pvesh unavailable or returns empty
if command -v pvesh &>/dev/null; then
local cluster_ids
cluster_ids=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null |
grep -oP '"vmid":\s*\K[0-9]+' 2>/dev/null || true)
if [[ -n "$cluster_ids" ]] && echo "$cluster_ids" | grep -qw "$ctid"; then
return 1
fi
fi
# LOCAL FALLBACK: Check if config file exists for VM or LXC
# This handles edge cases where pvesh might not return all info
if [[ -f "/etc/pve/qemu-server/${ctid}.conf" ]] || [[ -f "/etc/pve/lxc/${ctid}.conf" ]]; then if [[ -f "/etc/pve/qemu-server/${ctid}.conf" ]] || [[ -f "/etc/pve/lxc/${ctid}.conf" ]]; then
return 1 return 1
fi fi
# Check ALL nodes in cluster for config files (handles pmxcfs sync delays)
# NOTE: On single-node, /etc/pve/nodes/ contains just the one node - still works
if [[ -d "/etc/pve/nodes" ]]; then
for node_dir in /etc/pve/nodes/*/; do
if [[ -f "${node_dir}qemu-server/${ctid}.conf" ]] || [[ -f "${node_dir}lxc/${ctid}.conf" ]]; then
return 1
fi
done
fi
# Check if ID is used in LVM logical volumes # Check if ID is used in LVM logical volumes
if lvs --noheadings -o lv_name 2>/dev/null | grep -qE "(^|[-_])${ctid}($|[-_])"; then if lvs --noheadings -o lv_name 2>/dev/null | grep -qE "(^|[-_])${ctid}($|[-_])"; then
return 1 return 1
@@ -306,63 +331,30 @@ validate_container_id() {
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# get_valid_container_id() # get_valid_container_id()
# #
# - Returns a valid, unused container ID # - Returns a valid, unused container ID (CLUSTER-AWARE)
# - Uses pvesh /cluster/nextid as starting point (already cluster-aware)
# - If provided ID is valid, returns it # - If provided ID is valid, returns it
# - Otherwise increments from suggested ID until a free one is found # - Otherwise increments until a free one is found across entire cluster
# - Calls validate_container_id() to check availability # - Calls validate_container_id() to check availability
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
get_valid_container_id() { get_valid_container_id() {
local suggested_id="${1:-$(pvesh get /cluster/nextid)}" local suggested_id="${1:-$(pvesh get /cluster/nextid 2>/dev/null || echo 100)}"
while ! validate_container_id "$suggested_id"; do # Ensure we have a valid starting ID
suggested_id=$((suggested_id + 1)) if ! [[ "$suggested_id" =~ ^[0-9]+$ ]]; then
done suggested_id=$(pvesh get /cluster/nextid 2>/dev/null || echo 100)
fi
echo "$suggested_id"
} local max_attempts=1000
local attempts=0
# ------------------------------------------------------------------------------
# validate_container_id()
#
# - Decides whether a container/VM ID is free to use on this host
# - Rejects anything that is not a plain non-negative integer
# - Rejects IDs that already have a QEMU or LXC config file under /etc/pve
# - Rejects IDs that appear as a "-"/"_" delimited token in an LVM volume name
# - Arguments:
#     * $1: candidate ID
# - Returns 0 if the ID is available, 1 if it is invalid or already in use
# ------------------------------------------------------------------------------
validate_container_id() {
  local ctid="$1"
  local conf_path

  # Non-numeric input can never be a valid ID.
  [[ "$ctid" =~ ^[0-9]+$ ]] || return 1

  # A config file for either guest type means the ID is taken.
  for conf_path in "/etc/pve/qemu-server/${ctid}.conf" "/etc/pve/lxc/${ctid}.conf"; do
    if [[ -f "$conf_path" ]]; then
      return 1
    fi
  done

  # Leftover logical volumes (e.g. vm-<id>-disk-0) also block the ID.
  # lvs errors are silenced so a missing/failed lvs simply yields no match.
  if lvs --noheadings -o lv_name 2>/dev/null | grep -qE "(^|[-_])${ctid}($|[-_])"; then
    return 1
  fi

  return 0
}
# ------------------------------------------------------------------------------
# get_valid_container_id()
#
# - Returns a valid, unused container ID
# - If provided ID is valid, returns it
# - Otherwise increments from suggested ID until a free one is found
# - Calls validate_container_id() to check availability
# ------------------------------------------------------------------------------
get_valid_container_id() {
local suggested_id="${1:-$(pvesh get /cluster/nextid)}"
while ! validate_container_id "$suggested_id"; do while ! validate_container_id "$suggested_id"; do
suggested_id=$((suggested_id + 1)) suggested_id=$((suggested_id + 1))
attempts=$((attempts + 1))
if [[ $attempts -ge $max_attempts ]]; then
msg_error "Could not find available container ID after $max_attempts attempts"
exit 1
fi
done done
echo "$suggested_id" echo "$suggested_id"
@@ -4133,7 +4125,8 @@ EOF'
# Show combined log location # Show combined log location
if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then
msg_custom "📋" "${YW}" "Installation log: ${combined_log}" echo ""
echo -e "${GN}${CL} Installation log: ${BL}${combined_log}${CL}"
fi fi
# Dev mode: Keep container or open breakpoint shell # Dev mode: Keep container or open breakpoint shell
@@ -4156,21 +4149,19 @@ EOF'
exit $install_exit_code exit $install_exit_code
fi fi
# Prompt user for cleanup with 60s timeout # Prompt user for cleanup with 60s timeout (plain echo - no msg_info to avoid spinner)
echo "" echo ""
echo -en "${TAB}${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}" echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
if read -t 60 -r response; then if read -t 60 -r response; then
if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
# Remove container # Remove container
echo "" echo -e "\n${TAB}${HOLD}${YW}Removing container ${CTID}${CL}"
msg_info "Removing container ${CTID}"
pct stop "$CTID" &>/dev/null || true pct stop "$CTID" &>/dev/null || true
pct destroy "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true
msg_ok "Container ${CTID} removed" echo -e "${BFR}${CM}${GN}Container ${CTID} removed${CL}"
elif [[ "$response" =~ ^[Nn]$ ]]; then elif [[ "$response" =~ ^[Nn]$ ]]; then
echo "" echo -e "\n${TAB}${YW}Container ${CTID} kept for debugging${CL}"
msg_warn "Container ${CTID} kept for debugging"
# Dev mode: Setup MOTD/SSH for debugging access to broken container # Dev mode: Setup MOTD/SSH for debugging access to broken container
if [[ "${DEV_MODE_MOTD:-false}" == "true" ]]; then if [[ "${DEV_MODE_MOTD:-false}" == "true" ]]; then
@@ -4186,11 +4177,11 @@ EOF'
fi fi
else else
# Timeout - auto-remove # Timeout - auto-remove
echo "" echo -e "\n${YW}No response - auto-removing container${CL}"
msg_info "No response - removing container ${CTID}" echo -e "${TAB}${HOLD}${YW}Removing container ${CTID}${CL}"
pct stop "$CTID" &>/dev/null || true pct stop "$CTID" &>/dev/null || true
pct destroy "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true
msg_ok "Container ${CTID} removed" echo -e "${BFR}${CM}${GN}Container ${CTID} removed${CL}"
fi fi
# Force one final status update attempt after cleanup # Force one final status update attempt after cleanup

View File

@@ -522,9 +522,15 @@ silent() {
msg_custom "→" "${YWB}" "${cmd}" msg_custom "→" "${YWB}" "${cmd}"
if [[ -s "$logfile" ]]; then if [[ -s "$logfile" ]]; then
echo -e "\n${TAB}--- Last 10 lines of log ---" local log_lines=$(wc -l <"$logfile")
echo "--- Last 10 lines of silent log ---"
tail -n 10 "$logfile" tail -n 10 "$logfile"
echo -e "${TAB}-----------------------------------\n" echo "-----------------------------------"
# Show how to view full log if there are more lines
if [[ $log_lines -gt 10 ]]; then
msg_custom "📋" "${YW}" "View full log (${log_lines} lines): ${logfile}"
fi
fi fi
exit "$rc" exit "$rc"

View File

@@ -175,9 +175,9 @@ error_handler() {
fi fi
if [[ -n "$active_log" && -s "$active_log" ]]; then if [[ -n "$active_log" && -s "$active_log" ]]; then
echo -e "\n${TAB}--- Last 20 lines of log ---" echo "--- Last 20 lines of silent log ---"
tail -n 20 "$active_log" tail -n 20 "$active_log"
echo -e "${TAB}-----------------------------------\n" echo "-----------------------------------"
# Detect context: Container (INSTALL_LOG set + /root exists) vs Host (BUILD_LOG) # Detect context: Container (INSTALL_LOG set + /root exists) vs Host (BUILD_LOG)
if [[ -n "${INSTALL_LOG:-}" && -d /root ]]; then if [[ -n "${INSTALL_LOG:-}" && -d /root ]]; then
@@ -204,50 +204,23 @@ error_handler() {
fi fi
echo "" echo ""
if declare -f msg_custom >/dev/null 2>&1; then echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
echo -en "${TAB}${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
else
echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
fi
if read -t 60 -r response; then if read -t 60 -r response; then
if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
echo "" echo -e "\n${YW}Removing container ${CTID}${CL}"
if declare -f msg_info >/dev/null 2>&1; then
msg_info "Removing container ${CTID}"
else
echo -e "${YW}Removing container ${CTID}${CL}"
fi
pct stop "$CTID" &>/dev/null || true pct stop "$CTID" &>/dev/null || true
pct destroy "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true
if declare -f msg_ok >/dev/null 2>&1; then echo -e "${GN}${CL} Container ${CTID} removed"
msg_ok "Container ${CTID} removed"
else
echo -e "${GN}${CL} Container ${CTID} removed"
fi
elif [[ "$response" =~ ^[Nn]$ ]]; then elif [[ "$response" =~ ^[Nn]$ ]]; then
echo "" echo -e "\n${YW}Container ${CTID} kept for debugging${CL}"
if declare -f msg_warn >/dev/null 2>&1; then
msg_warn "Container ${CTID} kept for debugging"
else
echo -e "${YW}Container ${CTID} kept for debugging${CL}"
fi
fi fi
else else
# Timeout - auto-remove # Timeout - auto-remove
echo "" echo -e "\n${YW}No response - auto-removing container${CL}"
if declare -f msg_info >/dev/null 2>&1; then
msg_info "No response - removing container ${CTID}"
else
echo -e "${YW}No response - removing container ${CTID}${CL}"
fi
pct stop "$CTID" &>/dev/null || true pct stop "$CTID" &>/dev/null || true
pct destroy "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true
if declare -f msg_ok >/dev/null 2>&1; then echo -e "${GN}${CL} Container ${CTID} removed"
msg_ok "Container ${CTID} removed"
else
echo -e "${GN}${CL} Container ${CTID} removed"
fi
fi fi
# Force one final status update attempt after cleanup # Force one final status update attempt after cleanup
@@ -281,10 +254,6 @@ on_exit() {
# post_to_api was called ("installing" sent) but post_update_to_api was never called # post_to_api was called ("installing" sent) but post_update_to_api was never called
if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
if declare -f post_update_to_api >/dev/null 2>&1; then if declare -f post_update_to_api >/dev/null 2>&1; then
# Ensure log is accessible on host before reporting
if declare -f ensure_log_on_host >/dev/null 2>&1; then
ensure_log_on_host
fi
if [[ $exit_code -ne 0 ]]; then if [[ $exit_code -ne 0 ]]; then
post_update_to_api "failed" "$exit_code" post_update_to_api "failed" "$exit_code"
else else
@@ -304,10 +273,6 @@ on_exit() {
# - Exits with code 130 (128 + SIGINT=2) # - Exits with code 130 (128 + SIGINT=2)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
on_interrupt() { on_interrupt() {
# Ensure log is accessible on host before reporting
if declare -f ensure_log_on_host >/dev/null 2>&1; then
ensure_log_on_host
fi
# Report interruption to telemetry API (prevents stuck "installing" records) # Report interruption to telemetry API (prevents stuck "installing" records)
if declare -f post_update_to_api >/dev/null 2>&1; then if declare -f post_update_to_api >/dev/null 2>&1; then
post_update_to_api "failed" "130" post_update_to_api "failed" "130"
@@ -329,10 +294,6 @@ on_interrupt() {
# - Triggered by external process termination # - Triggered by external process termination
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
on_terminate() { on_terminate() {
# Ensure log is accessible on host before reporting
if declare -f ensure_log_on_host >/dev/null 2>&1; then
ensure_log_on_host
fi
# Report termination to telemetry API (prevents stuck "installing" records) # Report termination to telemetry API (prevents stuck "installing" records)
if declare -f post_update_to_api >/dev/null 2>&1; then if declare -f post_update_to_api >/dev/null 2>&1; then
post_update_to_api "failed" "143" post_update_to_api "failed" "143"

View File

@@ -207,9 +207,15 @@ silent() {
msg_custom "→" "${YWB}" "${cmd}" msg_custom "→" "${YWB}" "${cmd}"
if [[ -s "$logfile" ]]; then if [[ -s "$logfile" ]]; then
echo -e "\n${TAB}--- Last 10 lines of log ---" local log_lines=$(wc -l <"$logfile")
echo "--- Last 10 lines of log ---"
tail -n 10 "$logfile" tail -n 10 "$logfile"
echo -e "${TAB}----------------------------\n" echo "----------------------------"
# Show how to view full log if there are more lines
if [[ $log_lines -gt 10 ]]; then
msg_custom "📋" "${YW}" "View full log (${log_lines} lines): ${logfile}"
fi
fi fi
exit "$rc" exit "$rc"