mirror of
https://github.com/community-scripts/ProxmoxVE.git
synced 2026-02-19 11:35:55 +01:00
Add timeouts and prioritize telemetry on exit
Prevent hangs when pulling logs from containers by wrapping pct pull calls with timeout (8s) and running ensure_log_on_host under timeout (10s). Always send telemetry (post_update_to_api) before attempting best-effort log collection, so that the status is reported even if log retrieval blocks. Update the EXIT/ERR/SIGHUP/SIGINT/SIGTERM traps and consolidate the error/interrupt handlers to use the new timeout-wrapped log collection. Changes are in misc/build.func and misc/error_handler.func.
This commit is contained in:
@@ -4106,9 +4106,9 @@ EOF'
|
||||
build_log_copied=true
|
||||
fi
|
||||
|
||||
# Copy and append INSTALL_LOG from container
|
||||
# Copy and append INSTALL_LOG from container (with timeout to prevent hangs)
|
||||
local temp_install_log="/tmp/.install-temp-${SESSION_ID}.log"
|
||||
if pct pull "$CTID" "/root/.install-${SESSION_ID}.log" "$temp_install_log" 2>/dev/null; then
|
||||
if timeout 8 pct pull "$CTID" "/root/.install-${SESSION_ID}.log" "$temp_install_log" 2>/dev/null; then
|
||||
{
|
||||
echo "================================================================================"
|
||||
echo "PHASE 2: APPLICATION INSTALLATION (Container)"
|
||||
@@ -5492,6 +5492,7 @@ EOF
|
||||
# - If INSTALL_LOG points to a container path (e.g. /root/.install-*),
|
||||
# tries to pull it from the container and create a combined log
|
||||
# - This allows get_error_text() to find actual error output for telemetry
|
||||
# - Uses timeout on pct pull to prevent hangs on dead/unresponsive containers
|
||||
# ------------------------------------------------------------------------------
|
||||
ensure_log_on_host() {
|
||||
# Already readable on host? Nothing to do.
|
||||
@@ -5521,9 +5522,9 @@ ensure_log_on_host() {
|
||||
echo ""
|
||||
} >>"$combined_log"
|
||||
fi
|
||||
# Pull INSTALL_LOG from container
|
||||
# Pull INSTALL_LOG from container (with timeout to prevent hangs on dead containers)
|
||||
local temp_log="/tmp/.install-temp-${SESSION_ID}.log"
|
||||
if pct pull "$CTID" "/root/.install-${SESSION_ID}.log" "$temp_log" 2>/dev/null; then
|
||||
if timeout 8 pct pull "$CTID" "/root/.install-${SESSION_ID}.log" "$temp_log" 2>/dev/null; then
|
||||
{
|
||||
echo "================================================================================"
|
||||
echo "PHASE 2: APPLICATION INSTALLATION (Container)"
|
||||
@@ -5546,8 +5547,8 @@ ensure_log_on_host() {
|
||||
# - Exit trap handler for reporting to API telemetry
|
||||
# - Captures exit code and reports to PocketBase using centralized error descriptions
|
||||
# - Uses explain_exit_code() from api.func for consistent error messages
|
||||
# - For signal exits (>128): sends telemetry FIRST before log collection
|
||||
# to prevent pct pull hangs from blocking status updates
|
||||
# - ALWAYS sends telemetry FIRST before log collection to prevent pct pull
|
||||
# hangs from blocking status updates (container may be dead/unresponsive)
|
||||
# - For non-zero exit codes: posts "failed" status
|
||||
# - For zero exit codes where post_update_to_api was never called:
|
||||
# catches orphaned "installing" records (e.g., script exited cleanly
|
||||
@@ -5556,14 +5557,12 @@ ensure_log_on_host() {
|
||||
api_exit_script() {
|
||||
local exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
if [ $exit_code -gt 128 ]; then
|
||||
# Signal exit: send telemetry IMMEDIATELY (container may be dying)
|
||||
post_update_to_api "failed" "$exit_code" 2>/dev/null || true
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
else
|
||||
# Normal error: collect logs first for better error details
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
post_update_to_api "failed" "$exit_code"
|
||||
# ALWAYS send telemetry FIRST - ensure status is reported even if
|
||||
# ensure_log_on_host hangs (e.g. pct pull on dead container)
|
||||
post_update_to_api "failed" "$exit_code" 2>/dev/null || true
|
||||
# Best-effort log collection with timeout (non-critical after telemetry is sent)
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
timeout 10 bash -c 'ensure_log_on_host' 2>/dev/null || true
|
||||
fi
|
||||
elif [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
|
||||
# Script exited with 0 but never sent a completion status
|
||||
@@ -5575,7 +5574,7 @@ api_exit_script() {
|
||||
if command -v pveversion >/dev/null 2>&1; then
|
||||
trap 'api_exit_script' EXIT
|
||||
fi
|
||||
trap 'local _ec=$?; if [[ $_ec -ne 0 ]]; then ensure_log_on_host 2>/dev/null || true; post_update_to_api "failed" "$_ec"; fi' ERR
|
||||
trap 'post_update_to_api "failed" "129" 2>/dev/null || true; ensure_log_on_host 2>/dev/null || true; exit 129' SIGHUP
|
||||
trap 'post_update_to_api "failed" "130" 2>/dev/null || true; ensure_log_on_host 2>/dev/null || true; exit 130' SIGINT
|
||||
trap 'post_update_to_api "failed" "143" 2>/dev/null || true; ensure_log_on_host 2>/dev/null || true; exit 143' SIGTERM
|
||||
trap 'local _ec=$?; if [[ $_ec -ne 0 ]]; then post_update_to_api "failed" "$_ec" 2>/dev/null || true; timeout 10 bash -c "ensure_log_on_host" 2>/dev/null || true; fi' ERR
|
||||
trap 'post_update_to_api "failed" "129" 2>/dev/null || true; timeout 10 bash -c "ensure_log_on_host" 2>/dev/null || true; exit 129' SIGHUP
|
||||
trap 'post_update_to_api "failed" "130" 2>/dev/null || true; timeout 10 bash -c "ensure_log_on_host" 2>/dev/null || true; exit 130' SIGINT
|
||||
trap 'post_update_to_api "failed" "143" 2>/dev/null || true; timeout 10 bash -c "ensure_log_on_host" 2>/dev/null || true; exit 143' SIGTERM
|
||||
|
||||
@@ -339,24 +339,17 @@ on_exit() {
|
||||
# post_to_api was called ("installing" sent) but post_update_to_api was never called
|
||||
if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
|
||||
if declare -f post_update_to_api >/dev/null 2>&1; then
|
||||
if [[ $exit_code -gt 128 ]]; then
|
||||
# Signal exit: send telemetry IMMEDIATELY (container may be dying, pct pull could hang)
|
||||
# ALWAYS send telemetry FIRST - ensure status is reported even if
|
||||
# ensure_log_on_host hangs (e.g. pct pull on dead/unresponsive container)
|
||||
if [[ $exit_code -ne 0 ]]; then
|
||||
post_update_to_api "failed" "$exit_code" 2>/dev/null || true
|
||||
# Then try log collection (non-critical, best-effort)
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
fi
|
||||
else
|
||||
# Normal exit: collect logs first for better error details
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
fi
|
||||
if [[ $exit_code -ne 0 ]]; then
|
||||
post_update_to_api "failed" "$exit_code"
|
||||
else
|
||||
# exit_code=0 is never an error — report as success
|
||||
post_update_to_api "done" "0"
|
||||
fi
|
||||
# exit_code=0 is never an error — report as success
|
||||
post_update_to_api "done" "0" 2>/dev/null || true
|
||||
fi
|
||||
# Best-effort log collection with timeout (non-critical after telemetry is sent)
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
timeout 10 bash -c 'ensure_log_on_host' 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
@@ -380,9 +373,9 @@ on_interrupt() {
|
||||
if declare -f post_update_to_api >/dev/null 2>&1; then
|
||||
post_update_to_api "failed" "130" 2>/dev/null || true
|
||||
fi
|
||||
# Best-effort log collection (non-critical after telemetry is sent)
|
||||
# Best-effort log collection with timeout (non-critical after telemetry is sent)
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
timeout 10 bash -c 'ensure_log_on_host' 2>/dev/null || true
|
||||
fi
|
||||
if declare -f msg_error >/dev/null 2>&1; then
|
||||
msg_error "Interrupted by user (SIGINT)" 2>/dev/null || true
|
||||
@@ -409,9 +402,9 @@ on_terminate() {
|
||||
if declare -f post_update_to_api >/dev/null 2>&1; then
|
||||
post_update_to_api "failed" "143" 2>/dev/null || true
|
||||
fi
|
||||
# Best-effort log collection (non-critical after telemetry is sent)
|
||||
# Best-effort log collection with timeout (non-critical after telemetry is sent)
|
||||
if declare -f ensure_log_on_host >/dev/null 2>&1; then
|
||||
ensure_log_on_host 2>/dev/null || true
|
||||
timeout 10 bash -c 'ensure_log_on_host' 2>/dev/null || true
|
||||
fi
|
||||
if declare -f msg_error >/dev/null 2>&1; then
|
||||
msg_error "Terminated by signal (SIGTERM)" 2>/dev/null || true
|
||||
|
||||
Reference in New Issue
Block a user