refactor(api): eliminate duplicate traps, harden error handling & telemetry

Phase 1 - Structural:
- Remove api_exit_script() and 5 inline traps from build.func
- error_handler.func is now the sole trap owner via catch_errors()
- Update api.func comment reference (api_exit_script -> on_exit)

Phase 2 - Quality:
- Add stop_spinner() + cursor restore to error_handler(), on_interrupt(),
  on_terminate(), on_hangup() to prevent spinner/cursor artifacts
- Enhance _send_abort_telemetry() with error text (last 20 log lines),
  duration calculation, and 2 retry attempts (was fire-and-forget)
- Harden json_escape() to also strip DEL (0x7F) character
This commit is contained in:
CanbiZ (MickLesk)
2026-02-25 12:58:06 +01:00
parent 10e450b72f
commit 866c4062e0
3 changed files with 77 additions and 47 deletions

View File

@@ -312,7 +312,8 @@ json_escape() {
s=${s//$'\r'/}
s=${s//$'\t'/\\t}
# Remove any remaining control characters (0x00-0x1F except those already handled)
s=$(printf '%s' "$s" | tr -d '\000-\010\013\014\016-\037')
# Also remove DEL (0x7F) and invalid high bytes that break JSON parsers
s=$(printf '%s' "$s" | tr -d '\000-\010\013\014\016-\037\177')
printf '%s' "$s"
}
@@ -982,7 +983,7 @@ EOF
fi
# All 3 attempts failed — do NOT set POST_UPDATE_DONE=true.
# This allows the EXIT trap (api_exit_script) to retry with 3 fresh attempts.
# This allows the EXIT trap (on_exit in error_handler.func) to retry.
# No infinite loop risk: EXIT trap fires exactly once.
}

View File

@@ -5608,44 +5608,21 @@ ensure_log_on_host() {
fi
}
# ------------------------------------------------------------------------------
# api_exit_script()
# ==============================================================================
# TRAP MANAGEMENT
# ==============================================================================
# All traps (ERR, EXIT, INT, TERM, HUP) are set by catch_errors() in
# error_handler.func — called at the top of this file after sourcing.
#
# - Exit trap handler for reporting to API telemetry
# - Captures exit code and reports to PocketBase using centralized error descriptions
# - Uses explain_exit_code() from api.func for consistent error messages
# - ALWAYS sends telemetry FIRST before log collection to prevent pct pull
# hangs from blocking status updates (container may be dead/unresponsive)
# - For non-zero exit codes: posts "failed" status
# - For zero exit codes where post_update_to_api was never called:
# catches orphaned "installing" records (e.g., script exited cleanly
# but description() was never reached)
# ------------------------------------------------------------------------------
api_exit_script() {
local exit_code=$?
if [ $exit_code -ne 0 ]; then
# ALWAYS send telemetry FIRST - ensure status is reported even if
# ensure_log_on_host hangs (e.g. pct pull on dead container)
post_update_to_api "failed" "$exit_code" 2>/dev/null || true
# Best-effort log collection (non-critical after telemetry is sent)
if declare -f ensure_log_on_host >/dev/null 2>&1; then
ensure_log_on_host 2>/dev/null || true
fi
# Stop orphaned container if we're in the install phase
if [[ "${CONTAINER_INSTALLING:-}" == "true" && -n "${CTID:-}" ]] && command -v pct &>/dev/null; then
pct stop "$CTID" 2>/dev/null || true
fi
elif [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
# Script exited with 0 but never sent a completion status
# exit_code=0 is never an error — report as success
post_update_to_api "done" "0"
fi
}
if command -v pveversion >/dev/null 2>&1; then
trap 'api_exit_script' EXIT
fi
trap 'local _ec=$?; if [[ $_ec -ne 0 ]]; then post_update_to_api "failed" "$_ec" 2>/dev/null || true; if declare -f ensure_log_on_host &>/dev/null; then ensure_log_on_host 2>/dev/null || true; fi; fi' ERR
trap 'post_update_to_api "failed" "129" 2>/dev/null || true; if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null; then pct stop "$CTID" 2>/dev/null || true; fi; exit 129' SIGHUP
trap 'post_update_to_api "failed" "130" 2>/dev/null || true; if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null; then pct stop "$CTID" 2>/dev/null || true; fi; exit 130' SIGINT
trap 'post_update_to_api "failed" "143" 2>/dev/null || true; if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null; then pct stop "$CTID" 2>/dev/null || true; fi; exit 143' SIGTERM
# Do NOT set duplicate traps here. The handlers in error_handler.func
# (on_exit, on_interrupt, on_terminate, on_hangup, error_handler) already:
# - Send telemetry via post_update_to_api / _send_abort_telemetry
# - Stop orphaned containers via _stop_container_if_installing
# - Collect logs via ensure_log_on_host
# - Clean up lock files and spinner processes
#
# Previously, inline traps here overwrote catch_errors() traps, causing:
# - error_handler() never fired (no error output, no cleanup dialog)
# - on_hangup() never fired (SSH disconnect → stuck records)
# - Duplicated logic in two places (hard to debug)
# ==============================================================================

View File

@@ -199,11 +199,16 @@ error_handler() {
return 0
fi
# Stop spinner and restore cursor FIRST — before any output
# This prevents spinner text overlapping with error messages
if declare -f stop_spinner >/dev/null 2>&1; then
stop_spinner 2>/dev/null || true
fi
printf "\e[?25h"
local explanation
explanation="$(explain_exit_code "$exit_code")"
printf "\e[?25h"
# ALWAYS report failure to API immediately - don't wait for container checks
# This ensures we capture failures that occur before/after container exists
if declare -f post_update_to_api &>/dev/null; then
@@ -359,9 +364,39 @@ _send_abort_telemetry() {
command -v curl &>/dev/null || return 0
[[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0
[[ -z "${RANDOM_UUID:-}" ]] && return 0
curl -fsS -m 5 -X POST "${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}" \
-H "Content-Type: application/json" \
-d "{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}}" &>/dev/null || true
# Collect last 20 log lines for error diagnosis (best-effort)
local error_text=""
if [[ -n "${INSTALL_LOG:-}" && -s "${INSTALL_LOG}" ]]; then
error_text=$(tail -n 20 "$INSTALL_LOG" 2>/dev/null | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g; s/\\/\\\\/g; s/"/\\"/g; s/\r//g' | tr '\n' '|' | sed 's/|$//' | tr -d '\000-\010\013\014\016-\037\177') || true
fi
# Calculate duration if start time is available
local duration=""
if [[ -n "${DIAGNOSTICS_START_TIME:-}" ]]; then
duration=$(($(date +%s) - DIAGNOSTICS_START_TIME))
fi
# Build JSON payload with error context
local payload
payload="{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}"
[[ -n "$error_text" ]] && payload="${payload},\"error\":\"${error_text}\""
[[ -n "$duration" ]] && payload="${payload},\"duration\":${duration}"
payload="${payload}}"
local api_url="${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}"
# 2 attempts (retry once on failure) — original had no retry
local attempt
for attempt in 1 2; do
if curl -fsS -m 5 -X POST "$api_url" \
-H "Content-Type: application/json" \
-d "$payload" &>/dev/null; then
return 0
fi
[[ $attempt -eq 1 ]] && sleep 1
done
return 0
}
# ------------------------------------------------------------------------------
@@ -437,6 +472,12 @@ on_exit() {
# - Exits with code 130 (128 + SIGINT=2)
# ------------------------------------------------------------------------------
on_interrupt() {
# Stop spinner and restore cursor before any output
if declare -f stop_spinner >/dev/null 2>&1; then
stop_spinner 2>/dev/null || true
fi
printf "\e[?25h" 2>/dev/null || true
_send_abort_telemetry "130"
_stop_container_if_installing
if declare -f msg_error >/dev/null 2>&1; then
@@ -456,6 +497,12 @@ on_interrupt() {
# - Exits with code 143 (128 + SIGTERM=15)
# ------------------------------------------------------------------------------
on_terminate() {
# Stop spinner and restore cursor before any output
if declare -f stop_spinner >/dev/null 2>&1; then
stop_spinner 2>/dev/null || true
fi
printf "\e[?25h" 2>/dev/null || true
_send_abort_telemetry "143"
_stop_container_if_installing
if declare -f msg_error >/dev/null 2>&1; then
@@ -478,6 +525,11 @@ on_terminate() {
# - Exits with code 129 (128 + SIGHUP=1)
# ------------------------------------------------------------------------------
on_hangup() {
# Stop spinner (no cursor restore needed — terminal is already gone)
if declare -f stop_spinner >/dev/null 2>&1; then
stop_spinner 2>/dev/null || true
fi
_send_abort_telemetry "129"
_stop_container_if_installing
exit 129