mirror of
https://github.com/community-scripts/ProxmoxVE.git
synced 2026-02-25 22:45:56 +01:00
core: remove duplicate traps, consolidate error handling and harden signal traps (#12316)
* fix(zammad): configure Elasticsearch for LXC container startup
- Set discovery.type: single-node (required for single-node ES)
- Set xpack.security.enabled: false (not needed in local LXC)
- Set bootstrap.memory_lock: false (fails in unprivileged LXC)
- Add startup wait loop (up to 60s) to ensure ES is ready before
Zammad installation continues
Fixes #12301-related recurring Elasticsearch startup failures
* refactor(api): eliminate duplicate traps, harden error handling & telemetry
Phase 1 - Structural:
- Remove api_exit_script() and 5 inline traps from build.func
- error_handler.func is now the sole trap owner via catch_errors()
- Update api.func comment reference (api_exit_script -> on_exit)
Phase 2 - Quality:
- Add stop_spinner() + cursor restore to error_handler(), on_interrupt(),
on_terminate(), on_hangup() to prevent spinner/cursor artifacts
- Enhance _send_abort_telemetry() with error text (last 20 log lines),
duration calculation, and 2 retry attempts (was fire-and-forget)
- Harden json_escape() to also strip DEL (0x7F) character
* fix(build): show spinner during post_update_to_api to prevent Ctrl+Z abort
post_update_to_api can take up to 33 seconds worst-case (3 curl attempts
x 10s timeout + sleep delays). Without any terminal output during this
time, users think the script is stuck and press Ctrl+Z, which prevents
the recovery menu from ever appearing.
Add msg_info spinner before both post_update_to_api calls in the failure
path (initial report + final force retry after recovery menu).
* fix(build): prevent SIGTSTP from killing recovery dialog
- Replace msg_info/stop_spinner with plain echo for telemetry reporting
The background spinner process in non-interactive shells (bash -c)
can trigger SIGTSTP, stopping the entire process group before the
recovery dialog appears. Plain echo avoids this.
- Add trap '' TSTP at failure path entry to ignore suspension signals
Prevents Ctrl+Z or terminal-related SIGTSTP from interrupting the
recovery menu. Restored with trap - TSTP before exit.
- Root cause: msg_info starts a background process (spinner &) that
is not properly detached in non-interactive shells where job control
(set -m) is OFF. The disown builtin has no effect without job
control, leaving the spinner in the same process group. This can
cause terminal I/O conflicts during the 33-second post_update_to_api
retry window, resulting in [2]+ Stopped.
* fix(test): initialize colors and remove illegal local in test harness
- Call load_functions() after sourcing core.func to initialize
color/formatting/icon variables (RD, GN, YW, CL, TAB, etc.)
- Remove 'local' keyword from top-level scope (not inside function)
- Default REPO_SOURCE to ref_api instead of main
* chore: remove test-recovery-dialog.sh from branch
* Revert "fix(zammad): configure Elasticsearch for LXC container startup"
This reverts commit 10e450b72f.
* fix(build): show telemetry status only in verbose mode
Telemetry reporting is an implementation detail that doesn't help
the user during failure recovery. Wrap echo statements with
VERBOSE check so they only appear when verbose mode is enabled.
This commit is contained in:
committed by
GitHub
parent
719761de6c
commit
dd46dd2d87
@@ -312,7 +312,8 @@ json_escape() {
|
||||
s=${s//$'\r'/}
|
||||
s=${s//$'\t'/\\t}
|
||||
# Remove any remaining control characters (0x00-0x1F except those already handled)
|
||||
s=$(printf '%s' "$s" | tr -d '\000-\010\013\014\016-\037')
|
||||
# Also remove DEL (0x7F) and invalid high bytes that break JSON parsers
|
||||
s=$(printf '%s' "$s" | tr -d '\000-\010\013\014\016-\037\177')
|
||||
printf '%s' "$s"
|
||||
}
|
||||
|
||||
@@ -982,7 +983,7 @@ EOF
|
||||
fi
|
||||
|
||||
# All 3 attempts failed — do NOT set POST_UPDATE_DONE=true.
|
||||
# This allows the EXIT trap (api_exit_script) to retry with 3 fresh attempts.
|
||||
# This allows the EXIT trap (on_exit in error_handler.func) to retry.
|
||||
# No infinite loop risk: EXIT trap fires exactly once.
|
||||
}
|
||||
|
||||
|
||||
@@ -4098,6 +4098,11 @@ EOF'
|
||||
|
||||
# Installation failed?
|
||||
if [[ $install_exit_code -ne 0 ]]; then
|
||||
# Prevent SIGTSTP (Ctrl+Z) from suspending the script during recovery.
|
||||
# In non-interactive shells (bash -c), background processes (spinner) can
|
||||
# trigger terminal-related signals that stop the entire process group.
|
||||
trap '' TSTP
|
||||
|
||||
msg_error "Installation failed in container ${CTID} (exit code: ${install_exit_code})"
|
||||
|
||||
# Copy install log from container BEFORE API call so get_error_text() can read it
|
||||
@@ -4173,7 +4178,12 @@ EOF'
|
||||
fi
|
||||
|
||||
# Report failure to telemetry API (now with log available on host)
|
||||
# NOTE: Do NOT use msg_info/spinner here — the background spinner process
|
||||
# causes SIGTSTP in non-interactive shells (bash -c "$(curl ...)"), which
|
||||
# stops the entire process group and prevents the recovery dialog from appearing.
|
||||
$STD echo -e "${TAB}⏳ Reporting failure to telemetry..."
|
||||
post_update_to_api "failed" "$install_exit_code"
|
||||
$STD echo -e "${TAB}${CM:-✔} Failure reported"
|
||||
|
||||
# Defense-in-depth: Ensure error handling stays disabled during recovery.
|
||||
# Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail
|
||||
@@ -4537,8 +4547,12 @@ EOF'
|
||||
|
||||
# Force one final status update attempt after cleanup
|
||||
# This ensures status is updated even if the first attempt failed (e.g., HTTP 400)
|
||||
$STD echo -e "${TAB}⏳ Finalizing telemetry report..."
|
||||
post_update_to_api "failed" "$install_exit_code" "force"
|
||||
$STD echo -e "${TAB}${CM:-✔} Telemetry finalized"
|
||||
|
||||
# Restore default SIGTSTP handling before exit
|
||||
trap - TSTP
|
||||
exit $install_exit_code
|
||||
fi
|
||||
|
||||
@@ -5608,44 +5622,21 @@ ensure_log_on_host() {
|
||||
fi
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# api_exit_script()
|
||||
# ==============================================================================
|
||||
# TRAP MANAGEMENT
|
||||
# ==============================================================================
|
||||
# All traps (ERR, EXIT, INT, TERM, HUP) are set by catch_errors() in
|
||||
# error_handler.func — called at the top of this file after sourcing.
|
||||
#
|
||||
# - Exit trap handler for reporting to API telemetry
|
||||
# - Captures exit code and reports to PocketBase using centralized error descriptions
|
||||
# - Uses explain_exit_code() from api.func for consistent error messages
|
||||
# - ALWAYS sends telemetry FIRST before log collection to prevent pct pull
|
||||
# hangs from blocking status updates (container may be dead/unresponsive)
|
||||
# - For non-zero exit codes: posts "failed" status
|
||||
# - For zero exit codes where post_update_to_api was never called:
|
||||
# catches orphaned "installing" records (e.g., script exited cleanly
|
||||
# but description() was never reached)
|
||||
# ------------------------------------------------------------------------------
|
||||
api_exit_script() {
|
||||
local exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
# ALWAYS send telemetry FIRST - ensure status is reported even if
|
||||
# ensure_log_on_host hangs (e.g. pct pull on dead container)
|
||||
post_update_to_api "failed" "$exit_code" 2>/dev/null || true
|
||||
# Best-effort log collection (non-critical after telemetry is sent)
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
fi
|
||||
# Stop orphaned container if we're in the install phase
|
||||
if [[ "${CONTAINER_INSTALLING:-}" == "true" && -n "${CTID:-}" ]] && command -v pct &>/dev/null; then
|
||||
pct stop "$CTID" 2>/dev/null || true
|
||||
fi
|
||||
elif [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
|
||||
# Script exited with 0 but never sent a completion status
|
||||
# exit_code=0 is never an error — report as success
|
||||
post_update_to_api "done" "0"
|
||||
fi
|
||||
}
|
||||
|
||||
if command -v pveversion >/dev/null 2>&1; then
|
||||
trap 'api_exit_script' EXIT
|
||||
fi
|
||||
trap 'local _ec=$?; if [[ $_ec -ne 0 ]]; then post_update_to_api "failed" "$_ec" 2>/dev/null || true; if declare -f ensure_log_on_host &>/dev/null; then ensure_log_on_host 2>/dev/null || true; fi; fi' ERR
|
||||
trap 'post_update_to_api "failed" "129" 2>/dev/null || true; if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null; then pct stop "$CTID" 2>/dev/null || true; fi; exit 129' SIGHUP
|
||||
trap 'post_update_to_api "failed" "130" 2>/dev/null || true; if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null; then pct stop "$CTID" 2>/dev/null || true; fi; exit 130' SIGINT
|
||||
trap 'post_update_to_api "failed" "143" 2>/dev/null || true; if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null; then pct stop "$CTID" 2>/dev/null || true; fi; exit 143' SIGTERM
|
||||
# Do NOT set duplicate traps here. The handlers in error_handler.func
|
||||
# (on_exit, on_interrupt, on_terminate, on_hangup, error_handler) already:
|
||||
# - Send telemetry via post_update_to_api / _send_abort_telemetry
|
||||
# - Stop orphaned containers via _stop_container_if_installing
|
||||
# - Collect logs via ensure_log_on_host
|
||||
# - Clean up lock files and spinner processes
|
||||
#
|
||||
# Previously, inline traps here overwrote catch_errors() traps, causing:
|
||||
# - error_handler() never fired (no error output, no cleanup dialog)
|
||||
# - on_hangup() never fired (SSH disconnect → stuck records)
|
||||
# - Duplicated logic in two places (hard to debug)
|
||||
# ==============================================================================
|
||||
|
||||
@@ -199,11 +199,16 @@ error_handler() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Stop spinner and restore cursor FIRST — before any output
|
||||
# This prevents spinner text overlapping with error messages
|
||||
if declare -f stop_spinner >/dev/null 2>&1; then
|
||||
stop_spinner 2>/dev/null || true
|
||||
fi
|
||||
printf "\e[?25h"
|
||||
|
||||
local explanation
|
||||
explanation="$(explain_exit_code "$exit_code")"
|
||||
|
||||
printf "\e[?25h"
|
||||
|
||||
# ALWAYS report failure to API immediately - don't wait for container checks
|
||||
# This ensures we capture failures that occur before/after container exists
|
||||
if declare -f post_update_to_api &>/dev/null; then
|
||||
@@ -359,9 +364,39 @@ _send_abort_telemetry() {
|
||||
command -v curl &>/dev/null || return 0
|
||||
[[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0
|
||||
[[ -z "${RANDOM_UUID:-}" ]] && return 0
|
||||
curl -fsS -m 5 -X POST "${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}}" &>/dev/null || true
|
||||
|
||||
# Collect last 20 log lines for error diagnosis (best-effort)
|
||||
local error_text=""
|
||||
if [[ -n "${INSTALL_LOG:-}" && -s "${INSTALL_LOG}" ]]; then
|
||||
error_text=$(tail -n 20 "$INSTALL_LOG" 2>/dev/null | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g; s/\\/\\\\/g; s/"/\\"/g; s/\r//g' | tr '\n' '|' | sed 's/|$//' | tr -d '\000-\010\013\014\016-\037\177') || true
|
||||
fi
|
||||
|
||||
# Calculate duration if start time is available
|
||||
local duration=""
|
||||
if [[ -n "${DIAGNOSTICS_START_TIME:-}" ]]; then
|
||||
duration=$(($(date +%s) - DIAGNOSTICS_START_TIME))
|
||||
fi
|
||||
|
||||
# Build JSON payload with error context
|
||||
local payload
|
||||
payload="{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}"
|
||||
[[ -n "$error_text" ]] && payload="${payload},\"error\":\"${error_text}\""
|
||||
[[ -n "$duration" ]] && payload="${payload},\"duration\":${duration}"
|
||||
payload="${payload}}"
|
||||
|
||||
local api_url="${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}"
|
||||
|
||||
# 2 attempts (retry once on failure) — original had no retry
|
||||
local attempt
|
||||
for attempt in 1 2; do
|
||||
if curl -fsS -m 5 -X POST "$api_url" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$payload" &>/dev/null; then
|
||||
return 0
|
||||
fi
|
||||
[[ $attempt -eq 1 ]] && sleep 1
|
||||
done
|
||||
return 0
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
@@ -437,6 +472,12 @@ on_exit() {
|
||||
# - Exits with code 130 (128 + SIGINT=2)
|
||||
# ------------------------------------------------------------------------------
|
||||
on_interrupt() {
|
||||
# Stop spinner and restore cursor before any output
|
||||
if declare -f stop_spinner >/dev/null 2>&1; then
|
||||
stop_spinner 2>/dev/null || true
|
||||
fi
|
||||
printf "\e[?25h" 2>/dev/null || true
|
||||
|
||||
_send_abort_telemetry "130"
|
||||
_stop_container_if_installing
|
||||
if declare -f msg_error >/dev/null 2>&1; then
|
||||
@@ -456,6 +497,12 @@ on_interrupt() {
|
||||
# - Exits with code 143 (128 + SIGTERM=15)
|
||||
# ------------------------------------------------------------------------------
|
||||
on_terminate() {
|
||||
# Stop spinner and restore cursor before any output
|
||||
if declare -f stop_spinner >/dev/null 2>&1; then
|
||||
stop_spinner 2>/dev/null || true
|
||||
fi
|
||||
printf "\e[?25h" 2>/dev/null || true
|
||||
|
||||
_send_abort_telemetry "143"
|
||||
_stop_container_if_installing
|
||||
if declare -f msg_error >/dev/null 2>&1; then
|
||||
@@ -478,6 +525,11 @@ on_terminate() {
|
||||
# - Exits with code 129 (128 + SIGHUP=1)
|
||||
# ------------------------------------------------------------------------------
|
||||
on_hangup() {
|
||||
# Stop spinner (no cursor restore needed — terminal is already gone)
|
||||
if declare -f stop_spinner >/dev/null 2>&1; then
|
||||
stop_spinner 2>/dev/null || true
|
||||
fi
|
||||
|
||||
_send_abort_telemetry "129"
|
||||
_stop_container_if_installing
|
||||
exit 129
|
||||
|
||||
Reference in New Issue
Block a user