fix(cluster): validate container IDs cluster-wide across all nodes

- Query /cluster/resources via pvesh to check all VMs/CTs on ALL nodes
- Check /etc/pve/nodes/*/qemu-server and /etc/pve/nodes/*/lxc dirs
- Handle pmxcfs sync delays that caused sporadic ID conflicts
- Remove duplicate validate_container_id/get_valid_container_id definitions
- Add max_attempts safeguard to prevent infinite loops
2026-02-16 18:23:27 +01:00 · 2026-02-14 14:02:40 +01:00
5 changed files with 112 additions and 188 deletions
--- a/misc/api.func
+++ b/misc/api.func
@@ -287,32 +287,6 @@ get_error_text() {
  fi
 }
 # ------------------------------------------------------------------------------
 # build_error_string()
 #
 # - Builds a structured error string for telemetry reporting
 # - Format: "exit_code=<N> | <explanation>\n---\n<last 20 log lines>"
 # - If no log lines available, returns just the explanation
 # - Arguments:
 #   * $1: exit_code (numeric)
 #   * $2: log_text (optional, output from get_error_text)
 # - Returns structured error string via stdout
 # ------------------------------------------------------------------------------
 build_error_string() {
  # $1: exit code (falls back to 1), $2: optional log excerpt to append
  local code="${1:-1}"
  local log_tail="${2:-}"
  # Human-readable description of the exit code, looked up via explain_exit_code
  local reason
  reason=$(explain_exit_code "$code")
  # Without log text the result is only the one-line header
  if [[ -z "$log_tail" ]]; then
    printf 'exit_code=%s | %s' "$code" "$reason"
    return
  fi
  # With log text: header, a '---' separator line, then the log excerpt verbatim
  printf 'exit_code=%s | %s\n---\n%s' "$code" "$reason" "$log_tail"
 }
 # ==============================================================================
 # SECTION 2: TELEMETRY FUNCTIONS
 # ==============================================================================
@@ -691,12 +665,13 @@ post_update_to_api() {
    else
      exit_code=1
    fi
    # Get log lines and build structured error string
    local error_text=""
    error_text=$(get_error_text)
-    local full_error
+    if [[ -n "$error_text" ]]; then
-    full_error=$(build_error_string "$exit_code" "$error_text")
+      error=$(json_escape "$error_text")
-    error=$(json_escape "$full_error")
+    else
      error=$(json_escape "$(explain_exit_code "$exit_code")")
    fi
    short_error=$(json_escape "$(explain_exit_code "$exit_code")")
    error_category=$(categorize_error "$exit_code")
    [[ -z "$error" ]] && error="Unknown error"
@@ -839,52 +814,31 @@ EOF
 categorize_error() {
  local code="$1"
  case "$code" in
-  # Network errors (curl/wget)
+  # Network errors
-  6 | 7 | 22 | 35) echo "network" ;;
+  6 | 7 | 22 | 28 | 35) echo "network" ;;
-  # Timeout errors
+  # Storage errors
-  28 | 124 | 211) echo "timeout" ;;
+  214 | 217 | 219) echo "storage" ;;
-  # Storage errors (Proxmox storage)
+  # Dependency/Package errors
-  214 | 217 | 219 | 224) echo "storage" ;;
+  100 | 101 | 102 | 127 | 160 | 161 | 162) echo "dependency" ;;
  # Dependency/Package errors (APT, DPKG, pip, commands)
  100 | 101 | 102 | 127 | 160 | 161 | 162 | 255) echo "dependency" ;;
  # Permission errors
  126 | 152) echo "permission" ;;
-  # Configuration errors (Proxmox config, invalid args)
+  # Timeout errors
-  128 | 203 | 204 | 205 | 206 | 207 | 208) echo "config" ;;
+  124 | 28 | 211) echo "timeout" ;;
-  # Proxmox container/template errors
+  # Configuration errors
-  200 | 209 | 210 | 212 | 213 | 215 | 216 | 218 | 220 | 221 | 222 | 223 | 225 | 231) echo "proxmox" ;;
+  203 | 204 | 205 | 206 | 207 | 208) echo "config" ;;
  # Service/Systemd errors
  150 | 151 | 153 | 154) echo "service" ;;
  # Database errors (PostgreSQL, MySQL, MongoDB)
  170 | 171 | 172 | 173 | 180 | 181 | 182 | 183 | 190 | 191 | 192 | 193) echo "database" ;;
  # Node.js / JavaScript runtime errors
  243 | 245 | 246 | 247 | 248 | 249) echo "runtime" ;;
  # Python environment errors
  # (already covered: 160-162 under dependency)
  # Aborted by user
  130) echo "aborted" ;;
-  # Resource errors (OOM, SIGKILL, SIGABRT)
+  # Resource errors (OOM, etc)
-  134 | 137) echo "resource" ;;
+  137 | 134) echo "resource" ;;
-  # Signal/Process errors (SIGTERM, SIGPIPE, SIGSEGV)
+  # Default
  139 | 141 | 143) echo "signal" ;;
  # Shell errors (general error, syntax error)  
  1 | 2) echo "shell" ;;
  # Default - truly unknown
  *) echo "unknown" ;;
  esac
 }
@@ -947,9 +901,11 @@ post_tool_to_api() {
    [[ ! "$exit_code" =~ ^[0-9]+$ ]] && exit_code=1
    local error_text=""
    error_text=$(get_error_text)
-    local full_error
+    if [[ -n "$error_text" ]]; then
-    full_error=$(build_error_string "$exit_code" "$error_text")
+      error=$(json_escape "$error_text")
-    error=$(json_escape "$full_error")
+    else
      error=$(json_escape "$(explain_exit_code "$exit_code")")
    fi
    error_category=$(categorize_error "$exit_code")
  fi
@@ -1012,9 +968,11 @@ post_addon_to_api() {
    [[ ! "$exit_code" =~ ^[0-9]+$ ]] && exit_code=1
    local error_text=""
    error_text=$(get_error_text)
-    local full_error
+    if [[ -n "$error_text" ]]; then
-    full_error=$(build_error_string "$exit_code" "$error_text")
+      error=$(json_escape "$error_text")
-    error=$(json_escape "$full_error")
+    else
      error=$(json_escape "$(explain_exit_code "$exit_code")")
    fi
    error_category=$(categorize_error "$exit_code")
  fi
@@ -1109,9 +1067,11 @@ post_update_to_api_extended() {
    fi
    local error_text=""
    error_text=$(get_error_text)
-    local full_error
+    if [[ -n "$error_text" ]]; then
-    full_error=$(build_error_string "$exit_code" "$error_text")
+      error=$(json_escape "$error_text")
-    error=$(json_escape "$full_error")
+    else
      error=$(json_escape "$(explain_exit_code "$exit_code")")
    fi
    error_category=$(categorize_error "$exit_code")
    [[ -z "$error" ]] && error="Unknown error"
  fi
--- a/misc/build.func
+++ b/misc/build.func
@@ -277,8 +277,9 @@ install_ssh_keys_into_ct() {
 # ------------------------------------------------------------------------------
 # validate_container_id()
 #
-# - Validates if a container ID is available for use
+# - Validates if a container ID is available for use (CLUSTER-WIDE)
-# - Checks if ID is already used by VM or LXC container
+# - Checks cluster resources via pvesh for VMs/CTs on ALL nodes
 # - Falls back to local config file check if pvesh unavailable
 # - Checks if ID is used in LVM logical volumes
 # - Returns 0 if ID is available, 1 if already in use
 # ------------------------------------------------------------------------------
@@ -290,11 +291,35 @@ validate_container_id() {
    return 1
  fi
-  # Check if config file exists for VM or LXC
+  # CLUSTER-WIDE CHECK: Query all VMs/CTs across all nodes
  # This catches IDs used on other nodes in the cluster
  # NOTE: Works on single-node too - Proxmox always has internal cluster structure
  # Falls back gracefully if pvesh unavailable or returns empty
  if command -v pvesh &>/dev/null; then
    local cluster_ids
    cluster_ids=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null | 
      grep -oP '"vmid":\s*\K[0-9]+' 2>/dev/null || true)
    if [[ -n "$cluster_ids" ]] && echo "$cluster_ids" | grep -qw "$ctid"; then
      return 1
    fi
  fi
  # LOCAL FALLBACK: Check if config file exists for VM or LXC
  # This handles edge cases where pvesh might not return all info
  if [[ -f "/etc/pve/qemu-server/${ctid}.conf" ]] || [[ -f "/etc/pve/lxc/${ctid}.conf" ]]; then
    return 1
  fi
  # Check ALL nodes in cluster for config files (handles pmxcfs sync delays)
  # NOTE: On single-node, /etc/pve/nodes/ contains just the one node - still works
  if [[ -d "/etc/pve/nodes" ]]; then
    for node_dir in /etc/pve/nodes/*/; do
      if [[ -f "${node_dir}qemu-server/${ctid}.conf" ]] || [[ -f "${node_dir}lxc/${ctid}.conf" ]]; then
        return 1
      fi
    done
  fi
  # Check if ID is used in LVM logical volumes
  if lvs --noheadings -o lv_name 2>/dev/null | grep -qE "(^|[-_])${ctid}($|[-_])"; then
    return 1
@@ -306,63 +331,30 @@ validate_container_id() {
 # ------------------------------------------------------------------------------
 # get_valid_container_id()
 #
-# - Returns a valid, unused container ID
+# - Returns a valid, unused container ID (CLUSTER-AWARE)
 # - Uses pvesh /cluster/nextid as starting point (already cluster-aware)
 # - If provided ID is valid, returns it
-# - Otherwise increments from suggested ID until a free one is found
+# - Otherwise increments until a free one is found across entire cluster
 # - Calls validate_container_id() to check availability
 # ------------------------------------------------------------------------------
 get_valid_container_id() {
-  local suggested_id="${1:-$(pvesh get /cluster/nextid)}"
+  local suggested_id="${1:-$(pvesh get /cluster/nextid 2>/dev/null || echo 100)}"
-
+
-  while ! validate_container_id "$suggested_id"; do
+  # Ensure we have a valid starting ID
-    suggested_id=$((suggested_id + 1))
+  if ! [[ "$suggested_id" =~ ^[0-9]+$ ]]; then
-  done
+    suggested_id=$(pvesh get /cluster/nextid 2>/dev/null || echo 100)
-
+  fi
-  echo "$suggested_id"
+
-}
+  local max_attempts=1000
-
+  local attempts=0
 # ------------------------------------------------------------------------------
 # validate_container_id()
 #
 # - Validates if a container ID is available for use
 # - Checks if ID is already used by VM or LXC container
 # - Checks if ID is used in LVM logical volumes
 # - Returns 0 if ID is available, 1 if already in use
 # ------------------------------------------------------------------------------
 validate_container_id() {
  local candidate="$1"
  # Only a string made up entirely of decimal digits is accepted as an ID
  [[ "$candidate" =~ ^[0-9]+$ ]] || return 1
  # The ID is taken if a VM or LXC config file with that number exists locally
  local conf
  for conf in "/etc/pve/qemu-server/${candidate}.conf" "/etc/pve/lxc/${candidate}.conf"; do
    [[ ! -f "$conf" ]] || return 1
  done
  # The ID is taken if an LVM logical volume name carries it as a
  # '-' / '_' delimited token (lvs errors are discarded, so a missing or
  # failing lvs counts as "no match")
  lvs --noheadings -o lv_name 2>/dev/null | grep -qE "(^|[-_])${candidate}($|[-_])" && return 1
  return 0
 }
 # ------------------------------------------------------------------------------
 # get_valid_container_id()
 #
 # - Returns a valid, unused container ID
 # - If provided ID is valid, returns it
 # - Otherwise increments from suggested ID until a free one is found
 # - Calls validate_container_id() to check availability
 # ------------------------------------------------------------------------------
 get_valid_container_id() {
  local suggested_id="${1:-$(pvesh get /cluster/nextid)}"
  while ! validate_container_id "$suggested_id"; do
    suggested_id=$((suggested_id + 1))
    attempts=$((attempts + 1))
    if [[ $attempts -ge $max_attempts ]]; then
      msg_error "Could not find available container ID after $max_attempts attempts"
      exit 1
    fi
  done
  echo "$suggested_id"
@@ -4133,7 +4125,8 @@ EOF'
    # Show combined log location
    if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then
-      msg_custom "📋" "${YW}" "Installation log: ${combined_log}"
+      echo ""
      echo -e "${GN}✔${CL} Installation log: ${BL}${combined_log}${CL}"
    fi
    # Dev mode: Keep container or open breakpoint shell
@@ -4156,21 +4149,19 @@ EOF'
      exit $install_exit_code
    fi
-    # Prompt user for cleanup with 60s timeout
+    # Prompt user for cleanup with 60s timeout (plain echo - no msg_info to avoid spinner)
    echo ""
-    echo -en "${TAB}❓${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
+    echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
    if read -t 60 -r response; then
      if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
        # Remove container
-        echo ""
+        echo -e "\n${TAB}${HOLD}${YW}Removing container ${CTID}${CL}"
        msg_info "Removing container ${CTID}"
        pct stop "$CTID" &>/dev/null || true
        pct destroy "$CTID" &>/dev/null || true
-        msg_ok "Container ${CTID} removed"
+        echo -e "${BFR}${CM}${GN}Container ${CTID} removed${CL}"
      elif [[ "$response" =~ ^[Nn]$ ]]; then
-        echo ""
+        echo -e "\n${TAB}${YW}Container ${CTID} kept for debugging${CL}"
        msg_warn "Container ${CTID} kept for debugging"
        # Dev mode: Setup MOTD/SSH for debugging access to broken container
        if [[ "${DEV_MODE_MOTD:-false}" == "true" ]]; then
@@ -4186,11 +4177,11 @@ EOF'
      fi
    else
      # Timeout - auto-remove
-      echo ""
+      echo -e "\n${YW}No response - auto-removing container${CL}"
-      msg_info "No response - removing container ${CTID}"
+      echo -e "${TAB}${HOLD}${YW}Removing container ${CTID}${CL}"
      pct stop "$CTID" &>/dev/null || true
      pct destroy "$CTID" &>/dev/null || true
-      msg_ok "Container ${CTID} removed"
+      echo -e "${BFR}${CM}${GN}Container ${CTID} removed${CL}"
    fi
    # Force one final status update attempt after cleanup
--- a/misc/core.func
+++ b/misc/core.func
@@ -522,9 +522,15 @@ silent() {
    msg_custom "→" "${YWB}" "${cmd}"
    if [[ -s "$logfile" ]]; then
-      echo -e "\n${TAB}--- Last 10 lines of log ---"
+      local log_lines=$(wc -l <"$logfile")
      echo "--- Last 10 lines of silent log ---"
      tail -n 10 "$logfile"
-      echo -e "${TAB}-----------------------------------\n"
+      echo "-----------------------------------"
      # Show how to view full log if there are more lines
      if [[ $log_lines -gt 10 ]]; then
        msg_custom "📋" "${YW}" "View full log (${log_lines} lines): ${logfile}"
      fi
    fi
    exit "$rc"
--- a/misc/error_handler.func
+++ b/misc/error_handler.func
@@ -175,9 +175,9 @@ error_handler() {
  fi
  if [[ -n "$active_log" && -s "$active_log" ]]; then
-    echo -e "\n${TAB}--- Last 20 lines of log ---"
+    echo "--- Last 20 lines of silent log ---"
    tail -n 20 "$active_log"
-    echo -e "${TAB}-----------------------------------\n"
+    echo "-----------------------------------"
    # Detect context: Container (INSTALL_LOG set + /root exists) vs Host (BUILD_LOG)
    if [[ -n "${INSTALL_LOG:-}" && -d /root ]]; then
@@ -204,50 +204,23 @@ error_handler() {
        fi
        echo ""
-        if declare -f msg_custom >/dev/null 2>&1; then
+        echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
          echo -en "${TAB}❓${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
        else
          echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
        fi
        if read -t 60 -r response; then
          if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
-            echo ""
+            echo -e "\n${YW}Removing container ${CTID}${CL}"
            if declare -f msg_info >/dev/null 2>&1; then
              msg_info "Removing container ${CTID}"
            else
              echo -e "${YW}Removing container ${CTID}${CL}"
            fi
            pct stop "$CTID" &>/dev/null || true
            pct destroy "$CTID" &>/dev/null || true
-            if declare -f msg_ok >/dev/null 2>&1; then
+            echo -e "${GN}✔${CL} Container ${CTID} removed"
              msg_ok "Container ${CTID} removed"
            else
              echo -e "${GN}✔${CL} Container ${CTID} removed"
            fi
          elif [[ "$response" =~ ^[Nn]$ ]]; then
-            echo ""
+            echo -e "\n${YW}Container ${CTID} kept for debugging${CL}"
            if declare -f msg_warn >/dev/null 2>&1; then
              msg_warn "Container ${CTID} kept for debugging"
            else
              echo -e "${YW}Container ${CTID} kept for debugging${CL}"
            fi
          fi
        else
          # Timeout - auto-remove
-          echo ""
+          echo -e "\n${YW}No response - auto-removing container${CL}"
          if declare -f msg_info >/dev/null 2>&1; then
            msg_info "No response - removing container ${CTID}"
          else
            echo -e "${YW}No response - removing container ${CTID}${CL}"
          fi
          pct stop "$CTID" &>/dev/null || true
          pct destroy "$CTID" &>/dev/null || true
-          if declare -f msg_ok >/dev/null 2>&1; then
+          echo -e "${GN}✔${CL} Container ${CTID} removed"
            msg_ok "Container ${CTID} removed"
          else
            echo -e "${GN}✔${CL} Container ${CTID} removed"
          fi
        fi
        # Force one final status update attempt after cleanup
@@ -281,10 +254,6 @@ on_exit() {
  # post_to_api was called ("installing" sent) but post_update_to_api was never called
  if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
    if declare -f post_update_to_api >/dev/null 2>&1; then
      # Ensure log is accessible on host before reporting
      if declare -f ensure_log_on_host >/dev/null 2>&1; then
        ensure_log_on_host
      fi
      if [[ $exit_code -ne 0 ]]; then
        post_update_to_api "failed" "$exit_code"
      else
@@ -304,10 +273,6 @@ on_exit() {
 # - Exits with code 130 (128 + SIGINT=2)
 # ------------------------------------------------------------------------------
 on_interrupt() {
  # Ensure log is accessible on host before reporting
  if declare -f ensure_log_on_host >/dev/null 2>&1; then
    ensure_log_on_host
  fi
  # Report interruption to telemetry API (prevents stuck "installing" records)
  if declare -f post_update_to_api >/dev/null 2>&1; then
    post_update_to_api "failed" "130"
@@ -329,10 +294,6 @@ on_interrupt() {
 # - Triggered by external process termination
 # ------------------------------------------------------------------------------
 on_terminate() {
  # Ensure log is accessible on host before reporting
  if declare -f ensure_log_on_host >/dev/null 2>&1; then
    ensure_log_on_host
  fi
  # Report termination to telemetry API (prevents stuck "installing" records)
  if declare -f post_update_to_api >/dev/null 2>&1; then
    post_update_to_api "failed" "143"
--- a/misc/vm-core.func
+++ b/misc/vm-core.func
@@ -207,9 +207,15 @@ silent() {
    msg_custom "→" "${YWB}" "${cmd}"
    if [[ -s "$logfile" ]]; then
-      echo -e "\n${TAB}--- Last 10 lines of log ---"
+      local log_lines=$(wc -l <"$logfile")
      echo "--- Last 10 lines of log ---"
      tail -n 10 "$logfile"
-      echo -e "${TAB}----------------------------\n"
+      echo "----------------------------"
      # Show how to view full log if there are more lines
      if [[ $log_lines -gt 10 ]]; then
        msg_custom "📋" "${YW}" "View full log (${log_lines} lines): ${logfile}"
      fi
    fi
    exit "$rc"