From eef38514b11e1976a0dfe22cbc587560db236574 Mon Sep 17 00:00:00 2001 From: "CanbiZ (MickLesk)" <47820557+MickLesk@users.noreply.github.com> Date: Mon, 16 Feb 2026 18:06:44 +0100 Subject: [PATCH] feat(build): APT in-place repair, exit 1 subclassification, new exit codes - Add APT/DPKG in-place recovery: detects exit 100/101/102/255 and exit 1 with APT log patterns, offers to repair dpkg state and re-run install script without destroying the container - Add exit 1 subclassification: analyzes combined log to identify root cause (APT, OOM, network, command-not-found) and routes to appropriate recovery option - Add exit 10 hint: shows privileged mode / nesting suggestion - Add exit 127 hint: extracts missing command name from logs - Refactor recovery menu: use named option variables (APT_OPTION, OOM_OPTION, DNS_OPTION) instead of hardcoded option numbers, supports up to 6 dynamic options cleanly - Map missing exit codes in api.func: curl 27/36/45/47/55, signals 129 (SIGHUP) / 131 (SIGQUIT), npm 239 --- misc/api.func | 10 ++- misc/build.func | 193 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 152 insertions(+), 51 deletions(-) diff --git a/misc/api.func b/misc/api.func index e0f990495..746054427 100644 --- a/misc/api.func +++ b/misc/api.func @@ -114,9 +114,14 @@ explain_exit_code() { 22) echo "curl: HTTP error returned (404, 429, 500+)" ;; 23) echo "curl: Write error (disk full or permissions)" ;; 25) echo "curl: Upload failed" ;; + 27) echo "curl: Out of memory (memory allocation failed)" ;; 28) echo "curl: Operation timeout (network slow or server not responding)" ;; 30) echo "curl: FTP port command failed" ;; 35) echo "curl: SSL/TLS handshake failed (certificate error)" ;; + 36) echo "curl: FTP bad download resume" ;; + 45) echo "curl: Interface error (failed to bind to specified interface)" ;; + 47) echo "curl: Too many redirects" ;; + 55) echo "curl: Failed sending network data" ;; 56) echo "curl: Receive error (connection reset by peer)" ;; 75) echo "Temporary failure (retry later)" ;; 78) echo "curl: Remote file not found (404 on FTP/file)" ;; @@ -146,7 +151,9 @@ explain_exit_code() { 126) echo "Command invoked cannot execute (permission problem?)" ;; 127) echo "Command not found" ;; 128) echo "Invalid argument to exit" ;; + 129) echo "Killed by SIGHUP (terminal closed / hangup)" ;; 130) echo "Aborted by user (SIGINT)" ;; + 131) echo "Killed by SIGQUIT (core dumped)" ;; 134) echo "Process aborted (SIGABRT - possibly Node.js heap overflow)" ;; 137) echo "Killed (SIGKILL / Out of memory?)" ;; 139) echo "Segmentation fault (core dumped)" ;; @@ -209,7 +216,8 @@ explain_exit_code() { 225) echo "Proxmox: No template available for OS/Version" ;; 231) echo "Proxmox: LXC stack upgrade failed" ;; - # --- Node.js / npm / pnpm / yarn (243-249) --- + # --- Node.js / npm / pnpm / yarn (239-249) --- + 239) echo "npm/Node.js: Unexpected runtime error or dependency failure" ;; 243) echo "Node.js: Out of memory (JavaScript heap out of memory)" ;; 245) echo "Node.js: Invalid command-line option" ;; 246) echo "Node.js: Internal JavaScript Parse Error" ;; diff --git a/misc/build.func b/misc/build.func index 876ff3da6..8860242f8 100644 --- a/misc/build.func +++ b/misc/build.func @@ -4161,6 +4161,8 @@ EOF' # Detect error type for smart recovery options local is_oom=false local is_network_issue=false + local is_apt_issue=false + local is_cmd_not_found=false local error_explanation="" if declare -f explain_exit_code >/dev/null 2>&1; then error_explanation="$(explain_exit_code "$install_exit_code")" @@ -4171,9 +4173,30 @@ EOF' is_oom=true fi + # APT/DPKG detection: exit codes 100-102 (APT), 255 (DPKG with log evidence) + case "$install_exit_code" in + 100 | 101 | 102) is_apt_issue=true ;; + 255) + if [[ -f "$combined_log" ]] && grep -qiE 'dpkg|apt-get|apt\.conf|broken packages|unmet dependencies|E: Sub-process|E: Failed' "$combined_log"; then + is_apt_issue=true + fi + ;; + esac + + # Command not found detection + if [[ $install_exit_code -eq 127 ]]; then + is_cmd_not_found=true + fi + # Network-related detection (curl/apt/git fetch failures and transient network issues) case "$install_exit_code" in - 6 | 7 | 22 | 28 | 35 | 56 | 75 | 78 | 100) is_network_issue=true ;; + 6 | 7 | 22 | 28 | 35 | 56 | 75 | 78) is_network_issue=true ;; + 100) + # APT can fail due to network (Failed to fetch) + if [[ -f "$combined_log" ]] && grep -qiE 'Failed to fetch|Could not resolve|Connection failed|Network is unreachable|Temporary failure resolving' "$combined_log"; then + is_network_issue=true + fi + ;; 128) if [[ -f "$combined_log" ]] && grep -qiE 'RPC failed|early EOF|fetch-pack|HTTP/2 stream|Could not resolve host|Temporary failure resolving|Failed to fetch|Connection reset|Network is unreachable' "$combined_log"; then is_network_issue=true @@ -4181,37 +4204,79 @@ EOF' ;; esac + # Exit 1 subclassification: analyze logs to identify actual root cause + # Many exit 1 errors are actually APT, OOM, network, or command-not-found issues + if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then + if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then + is_apt_issue=true + fi + if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then + is_oom=true + fi + if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then + is_network_issue=true + fi + if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then + is_cmd_not_found=true + fi + fi + # Show error explanation if available if [[ -n "$error_explanation" ]]; then echo -e "${TAB}${RD}Error: ${error_explanation}${CL}" echo "" fi + # Show specific hints for known error types + if [[ $install_exit_code -eq 10 ]]; then + echo -e "${TAB}${INFO} This error usually means the container needs ${GN}privileged${CL} mode or Docker/nesting support." + echo -e "${TAB}${INFO} Recreate with: Advanced Install → Container Type: ${GN}Privileged${CL}" + echo "" + fi + + if [[ "$is_cmd_not_found" == true ]]; then + local missing_cmd="" + if [[ -f "$combined_log" ]]; then + missing_cmd=$(grep -oiE '[a-zA-Z0-9_.-]+: command not found' "$combined_log" | tail -1 | sed 's/: command not found//') + fi + if [[ -n "$missing_cmd" ]]; then + echo -e "${TAB}${INFO} Missing command: ${GN}${missing_cmd}${CL}" + fi + echo "" + fi + # Build recovery menu based on error type echo -e "${YW}What would you like to do?${CL}" echo "" echo -e " ${GN}1)${CL} Remove container and exit" echo -e " ${GN}2)${CL} Keep container for debugging" - echo -e " ${GN}3)${CL} Retry with verbose mode" + echo -e " ${GN}3)${CL} Retry with verbose mode (full rebuild)" + + local next_option=4 + local APT_OPTION="" OOM_OPTION="" DNS_OPTION="" + + if [[ "$is_apt_issue" == true ]]; then + echo -e " ${GN}${next_option})${CL} Repair APT/DPKG state and re-run install (in-place)" + APT_OPTION=$next_option + next_option=$((next_option + 1)) + fi - local max_option=3 if [[ "$is_oom" == true ]]; then local new_ram=$((RAM_SIZE * 2)) local new_cpu=$((CORE_COUNT * 2)) - echo -e " ${GN}4)${CL} Retry with more resources (RAM: ${RAM_SIZE}→${new_ram} MiB, CPU: ${CORE_COUNT}→${new_cpu} cores)" - max_option=4 + echo -e " ${GN}${next_option})${CL} Retry with more resources (RAM: ${RAM_SIZE}→${new_ram} MiB, CPU: ${CORE_COUNT}→${new_cpu} cores)" + OOM_OPTION=$next_option + next_option=$((next_option + 1)) fi if [[ "$is_network_issue" == true ]]; then - if [[ "$max_option" -eq 3 ]]; then - echo -e " ${GN}4)${CL} Retry with DNS override in LXC (8.8.8.8 / 1.1.1.1)" - max_option=4 - else - echo -e " ${GN}5)${CL} Retry with DNS override in LXC (8.8.8.8 / 1.1.1.1)" - max_option=5 - fi + echo -e " ${GN}${next_option})${CL} Retry with DNS override in LXC (8.8.8.8 / 1.1.1.1)" + DNS_OPTION=$next_option + next_option=$((next_option + 1)) fi + local max_option=$((next_option - 1)) + echo "" echo -en "${YW}Select option [1-${max_option}] (default: 1, auto-remove in 60s): ${CL}" @@ -4240,7 +4305,7 @@ EOF' exit $install_exit_code ;; 3) - # Retry with verbose mode + # Retry with verbose mode (full rebuild) echo -e "\n${TAB}${HOLD}${YW}Removing container ${CTID} for rebuild...${CL}" pct stop "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true @@ -4264,15 +4329,71 @@ EOF' build_container return $? ;; - 4) - if [[ "$is_oom" == true ]]; then - # Retry with more resources + *) + # Handle dynamic smart recovery options via named option variables + local handled=false + + if [[ -n "${APT_OPTION}" && "${response}" == "${APT_OPTION}" ]]; then + # APT/DPKG in-place repair: fix broken package state and re-run install script + handled=true + echo -e "\n${TAB}${HOLD}${YW}Repairing APT/DPKG state in container ${CTID}...${CL}" + pct exec "$CTID" -- bash -c " + DEBIAN_FRONTEND=noninteractive dpkg --configure -a 2>/dev/null || true + apt-get -f install -y 2>/dev/null || true + apt-get clean 2>/dev/null + apt-get update 2>/dev/null || true + " >/dev/null 2>&1 || true + echo -e "${BFR}${CM}${GN}APT/DPKG state repaired in container ${CTID}${CL}" + echo "" + export VERBOSE="yes" + export var_verbose="yes" + + echo -e "${YW}Re-running installation in existing container ${CTID}:${CL}" + echo -e " RAM: ${RAM_SIZE} MiB | CPU: ${CORE_COUNT} cores | Disk: ${DISK_SIZE} GB" + echo -e " Verbose: ${GN}enabled${CL}" + echo "" + msg_info "Re-running installation script..." + + # Re-run install script in existing container (don't destroy/recreate) + set +Eeuo pipefail + trap - ERR + lxc-attach -n "$CTID" -- bash -c "$(curl -fsSL https://raw.githubusercontent.com/community-scripts/ProxmoxVE/main/install/${var_install}.sh)" + local apt_retry_exit=$? + set -Eeuo pipefail + trap 'error_handler' ERR + + # Check for error flag from retry + local apt_retry_code=0 + if [[ -n "${SESSION_ID:-}" ]]; then + local retry_error_flag="/root/.install-${SESSION_ID}.failed" + if pct exec "$CTID" -- test -f "$retry_error_flag" 2>/dev/null; then + apt_retry_code=$(pct exec "$CTID" -- cat "$retry_error_flag" 2>/dev/null || echo "1") + pct exec "$CTID" -- rm -f "$retry_error_flag" 2>/dev/null || true + fi + fi + + if [[ $apt_retry_code -eq 0 && $apt_retry_exit -ne 0 ]]; then + apt_retry_code=$apt_retry_exit + fi + + if [[ $apt_retry_code -eq 0 ]]; then + msg_ok "Installation completed successfully after APT repair!" + post_update_to_api "done" "0" "force" + return 0 + else + msg_error "Installation still failed after APT repair (exit code: ${apt_retry_code})" + install_exit_code=$apt_retry_code + fi + fi + + if [[ -n "${OOM_OPTION}" && "${response}" == "${OOM_OPTION}" ]]; then + # Retry with doubled resources + handled=true echo -e "\n${TAB}${HOLD}${YW}Removing container ${CTID} for rebuild with more resources...${CL}" pct stop "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true echo -e "${BFR}${CM}${GN}Container ${CTID} removed${CL}" echo "" - # Get new container ID and increase resources local old_ctid="$CTID" local old_ram="$RAM_SIZE" local old_cpu="$CORE_COUNT" @@ -4284,7 +4405,6 @@ EOF' export VERBOSE="yes" export var_verbose="yes" - # Show rebuild summary echo -e "${YW}Rebuilding with increased resources:${CL}" echo -e " Container ID: ${old_ctid} → ${CTID}" echo -e " RAM: ${old_ram} → ${GN}${RAM_SIZE}${CL} MiB (x2)" @@ -4293,11 +4413,13 @@ EOF' echo -e " Verbose: ${GN}enabled${CL}" echo "" msg_info "Restarting installation..." - # Re-run build_container build_container return $? - elif [[ "$is_network_issue" == true && "$max_option" -eq 4 ]]; then + fi + + if [[ -n "${DNS_OPTION}" && "${response}" == "${DNS_OPTION}" ]]; then # Retry with DNS override in LXC + handled=true echo -e "\n${TAB}${HOLD}${YW}Removing container ${CTID} for rebuild with DNS override...${CL}" pct stop "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true @@ -4317,42 +4439,13 @@ EOF' msg_info "Restarting installation..." build_container return $? - else - echo -e "\n${TAB}${YW}Invalid option. Container ${CTID} kept.${CL}" - exit $install_exit_code fi - ;; - 5) - if [[ "$is_network_issue" == true && "$is_oom" == true ]]; then - # Retry with DNS override in LXC (option 5 when OOM option occupies 4) - echo -e "\n${TAB}${HOLD}${YW}Removing container ${CTID} for rebuild with DNS override...${CL}" - pct stop "$CTID" &>/dev/null || true - pct destroy "$CTID" &>/dev/null || true - echo -e "${BFR}${CM}${GN}Container ${CTID} removed${CL}" - echo "" - local old_ctid="$CTID" - export CTID=$(get_valid_container_id "$CTID") - export DNS_RETRY_OVERRIDE="true" - export VERBOSE="yes" - export var_verbose="yes" - echo -e "${YW}Rebuilding with DNS override in LXC:${CL}" - echo -e " Container ID: ${old_ctid} → ${CTID}" - echo -e " DNS: ${GN}8.8.8.8, 1.1.1.1${CL} (inside LXC only)" - echo -e " Verbose: ${GN}enabled${CL}" - echo "" - msg_info "Restarting installation..." - build_container - return $? - else + if [[ "$handled" == false ]]; then echo -e "\n${TAB}${YW}Invalid option. Container ${CTID} kept.${CL}" exit $install_exit_code fi ;; - *) - echo -e "\n${TAB}${YW}Invalid option. Container ${CTID} kept.${CL}" - exit $install_exit_code - ;; esac else # Timeout - auto-remove