fix: route customization errors through recovery menu instead of error_handler

Previously, when a container was stopped or base package installation failed during 'Customizing LXC Container', the error handler's simple 'Remove broken container? (Y/n)' prompt appeared instead of the full recovery menu with retry/repair options (verbose rebuild, APT repair, OOM retry, DNS override).

Root cause: set -Eeuo pipefail and the ERR trap were still active during the customization phase (locale, timezone, base packages). The exit 1 triggered error_handler(), which has its own cleanup flow, bypassing the full recovery menu in build_container().

Changes:
- Disable strict mode (set +Eeuo pipefail) and the ERR trap before the customization phase (not just before lxc-attach)
- Replace exit 1 with install_exit_code=1 for base package failures
- Wrap the install phase (DNS override, SSH keys, lxc-attach, error-flag check) in an if-block that is skipped when customization failed
- Both customization and install errors now reach the full recovery menu
- Fix read in error_handler.func to read from the terminal via </dev/tty (same curl stdin issue)
- Eliminate the 'pop_var_context: head of shell_variables not a function context' bash error caused by the double exit (exit 1 -> ERR trap -> exit)
2026-05-15 04:54:58 +02:00 · 2026-02-28 08:41:31 +01:00
parent 10400e5c56
commit e80241745f
2 changed files with 66 additions and 56 deletions
--- a/misc/build.func
+++ b/misc/build.func
@@ -4012,6 +4012,16 @@ EOF
  #   install_gpu_userland "NVIDIA"
  # fi

+  # Disable error trap for entire customization & install phase.
+  # All errors are handled explicitly — recovery menu shown on failure.
+  # Without this, customization errors (e.g. container stopped during base package
+  # install) would trigger error_handler() with a simple "Remove broken container?"
+  # prompt instead of the full recovery menu with retry/repair options.
+  set +Eeuo pipefail
+  trap - ERR
+
+  local install_exit_code=0
+
  # Continue with standard container setup
  if [ "$var_os" == "alpine" ]; then
    sleep 3
@@ -4021,7 +4031,7 @@ http://dl-cdn.alpinelinux.org/alpine/latest-stable/community
 EOF'
    pct exec "$CTID" -- ash -c "apk add bash newt curl openssh nano mc ncurses jq >/dev/null" || {
      msg_error "Failed to install base packages in Alpine container"
-      exit 1
+      install_exit_code=1
    }
  else
    sleep 3
@@ -4047,67 +4057,67 @@ EOF'

    pct exec "$CTID" -- bash -c "apt-get update >/dev/null && apt-get install -y sudo curl mc gnupg2 jq >/dev/null" || {
      msg_error "apt-get base packages installation failed"
-      exit 1
+      install_exit_code=1
    }
  fi

-  msg_ok "Customized LXC Container"
+  # Only continue with installation if customization succeeded
+  if [[ $install_exit_code -eq 0 ]]; then
+    msg_ok "Customized LXC Container"

-  # Optional DNS override for retry scenarios (inside LXC, never on host)
-  if [[ "${DNS_RETRY_OVERRIDE:-false}" == "true" ]]; then
-    msg_info "Applying DNS retry override in LXC (8.8.8.8, 1.1.1.1)"
-    pct exec "$CTID" -- bash -c "printf 'nameserver 8.8.8.8\nnameserver 1.1.1.1\n' >/etc/resolv.conf" >/dev/null 2>&1 || true
-    msg_ok "DNS override applied in LXC"
-  fi
-
-  # Install SSH keys
-  install_ssh_keys_into_ct
-
-  # Start timer for duration tracking
-  start_install_timer
-
-  # Run application installer
-  # Disable error trap - container errors are handled internally via flag file
-  set +Eeuo pipefail # Disable ALL error handling temporarily
-  trap - ERR         # Remove ERR trap completely
-
-  # Signal handlers use this flag to stop the container on abort (SIGHUP/SIGINT/SIGTERM)
-  # Without this, SSH disconnects leave the container running as an orphan process
-  # that sends "configuring" status AFTER the host already reported "failed"
-  export CONTAINER_INSTALLING=true
-
-  # Capture lxc-attach terminal output to host-side log via tee.
-  # This is the ONLY reliable way to get install output when:
-  #   - install.func fails to load (DNS error) → no container-side logging
-  #   - install script crashes before logging starts
-  #   - $STD/silent() not used for some commands
-  # PIPESTATUS[0] gets the real exit code from lxc-attach (not from tee).
-  local _LXC_CAPTURE_LOG="/tmp/.install-capture-${SESSION_ID}.log"
-  lxc-attach -n "$CTID" -- bash -c "$(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/main/install/${var_install}.sh)" 2>&1 | tee "$_LXC_CAPTURE_LOG"
-  local lxc_exit=${PIPESTATUS[0]}
-
-  unset CONTAINER_INSTALLING
-
-  # Keep error handling DISABLED during failure detection and recovery
-  # Re-enabling it here would cause any pct exec/pull failure to trigger
-  # error_handler() on the host, bypassing the recovery menu entirely
-
-  # Check for error flag file in container (more reliable than lxc-attach exit code)
-  local install_exit_code=0
-  if [[ -n "${SESSION_ID:-}" ]]; then
-    local error_flag="/root/.install-${SESSION_ID}.failed"
-    if pct exec "$CTID" -- test -f "$error_flag" 2>/dev/null; then
-      install_exit_code=$(pct exec "$CTID" -- cat "$error_flag" 2>/dev/null || echo "1")
-      pct exec "$CTID" -- rm -f "$error_flag" 2>/dev/null || true
+    # Optional DNS override for retry scenarios (inside LXC, never on host)
+    if [[ "${DNS_RETRY_OVERRIDE:-false}" == "true" ]]; then
+      msg_info "Applying DNS retry override in LXC (8.8.8.8, 1.1.1.1)"
+      pct exec "$CTID" -- bash -c "printf 'nameserver 8.8.8.8\nnameserver 1.1.1.1\n' >/etc/resolv.conf" >/dev/null 2>&1 || true
+      msg_ok "DNS override applied in LXC"
    fi
-  fi

-  # Fallback to lxc-attach exit code if no flag file
-  if [[ $install_exit_code -eq 0 && $lxc_exit -ne 0 ]]; then
-    install_exit_code=$lxc_exit
-  fi
+    # Install SSH keys
+    install_ssh_keys_into_ct

-  # Installation failed?
+    # Start timer for duration tracking
+    start_install_timer
+
+    # Run application installer
+    # Error handling already disabled above (before customization phase)
+
+    # Signal handlers use this flag to stop the container on abort (SIGHUP/SIGINT/SIGTERM)
+    # Without this, SSH disconnects leave the container running as an orphan process
+    # that sends "configuring" status AFTER the host already reported "failed"
+    export CONTAINER_INSTALLING=true
+
+    # Capture lxc-attach terminal output to host-side log via tee.
+    # This is the ONLY reliable way to get install output when:
+    #   - install.func fails to load (DNS error) → no container-side logging
+    #   - install script crashes before logging starts
+    #   - $STD/silent() not used for some commands
+    # PIPESTATUS[0] gets the real exit code from lxc-attach (not from tee).
+    local _LXC_CAPTURE_LOG="/tmp/.install-capture-${SESSION_ID}.log"
+    lxc-attach -n "$CTID" -- bash -c "$(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/main/install/${var_install}.sh)" 2>&1 | tee "$_LXC_CAPTURE_LOG"
+    local lxc_exit=${PIPESTATUS[0]}
+
+    unset CONTAINER_INSTALLING
+
+    # Keep error handling DISABLED during failure detection and recovery
+    # Re-enabling it here would cause any pct exec/pull failure to trigger
+    # error_handler() on the host, bypassing the recovery menu entirely
+
+    # Check for error flag file in container (more reliable than lxc-attach exit code)
+    if [[ -n "${SESSION_ID:-}" ]]; then
+      local error_flag="/root/.install-${SESSION_ID}.failed"
+      if pct exec "$CTID" -- test -f "$error_flag" 2>/dev/null; then
+        install_exit_code=$(pct exec "$CTID" -- cat "$error_flag" 2>/dev/null || echo "1")
+        pct exec "$CTID" -- rm -f "$error_flag" 2>/dev/null || true
+      fi
+    fi
+
+    # Fallback to lxc-attach exit code if no flag file
+    if [[ $install_exit_code -eq 0 && ${lxc_exit:-0} -ne 0 ]]; then
+      install_exit_code=${lxc_exit:-0}
+    fi
+  fi # end: if [[ $install_exit_code -eq 0 ]] (customization succeeded)
+
+  # Installation or customization failed?
  if [[ $install_exit_code -ne 0 ]]; then
    # Prevent job-control signals from suspending the script during recovery.
    # In non-interactive shells (bash -c), background processes (spinner) can
--- a/misc/error_handler.func
+++ b/misc/error_handler.func
@@ -286,7 +286,7 @@ error_handler() {
        echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
      fi

-      if read -t 60 -r response; then
+      if read -t 60 -r response </dev/tty; then
        if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
          echo ""
          if declare -f msg_info >/dev/null 2>&1; then