fix: improve error trace propagation for telemetry

- post_update_to_api: Attempts 2/3 now send medium_error (16KB truncated
  log) instead of short_error (generic description only). This is the
  primary fix — when attempt 1 fails (120KB payload too large/timeout),
  attempts 2/3 no longer discard all log data.

- _send_abort_telemetry: Increased container fallback from 20 to 200
  log lines (capped at 16KB). Added SILENT_LOGFILE as fallback source.
  Added exit code explanation header and error_category to payload.

- get_error_text/get_full_log: Added SILENT_LOGFILE as last-resort
  fallback when INSTALL_LOG, combined log, and BUILD_LOG are all
  empty/missing.
This commit is contained in:
CanbiZ (MickLesk)
2026-03-02 14:38:42 +01:00
parent 564a8136a5
commit ebc3512f50
2 changed files with 57 additions and 7 deletions

View File

@@ -393,6 +393,11 @@ get_error_text() {
logfile="$BUILD_LOG"
fi
# Try SILENT_LOGFILE as last resort (captures $STD command output)
if [[ -z "$logfile" || ! -s "$logfile" ]] && [[ -n "${SILENT_LOGFILE:-}" && -s "${SILENT_LOGFILE}" ]]; then
logfile="$SILENT_LOGFILE"
fi
if [[ -n "$logfile" && -s "$logfile" ]]; then
tail -n 20 "$logfile" 2>/dev/null | sed 's/\r$//' | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g'
fi
@@ -438,6 +443,13 @@ get_full_log() {
fi
fi
# Fall back to SILENT_LOGFILE (captures $STD command output)
if [[ -z "$logfile" || ! -s "$logfile" ]]; then
if [[ -n "${SILENT_LOGFILE:-}" && -s "${SILENT_LOGFILE}" ]]; then
logfile="$SILENT_LOGFILE"
fi
fi
if [[ -n "$logfile" && -s "$logfile" ]]; then
# Strip ANSI codes, carriage returns, and anonymize IP addresses (GDPR)
sed 's/\r$//' "$logfile" 2>/dev/null |
@@ -876,7 +888,7 @@ post_update_to_api() {
esac
# For failed/unknown status, resolve exit code and error description
local short_error=""
local short_error="" medium_error=""
if [[ "$pb_status" == "failed" ]] || [[ "$pb_status" == "unknown" ]]; then
if [[ "$raw_exit_code" =~ ^[0-9]+$ ]]; then
exit_code="$raw_exit_code"
@@ -896,6 +908,18 @@ post_update_to_api() {
short_error=$(json_escape "$(explain_exit_code "$exit_code")")
error_category=$(categorize_error "$exit_code")
[[ -z "$error" ]] && error="Unknown error"
# Build medium error for attempts 2 and 3: explanation + truncated log (≤16KB)
# This is the critical middle ground between full 120KB log and generic-only description
local medium_log=""
medium_log=$(get_full_log 16384) || true # 16KB max
if [[ -z "$medium_log" ]]; then
medium_log=$(get_error_text) || true
fi
local medium_full
medium_full=$(build_error_string "$exit_code" "$medium_log")
medium_error=$(json_escape "$medium_full")
[[ -z "$medium_error" ]] && medium_error="$short_error"
fi
# Calculate duration if timer was started
@@ -954,7 +978,7 @@ EOF
return 0
fi
# ── Attempt 2: Short error text (no full log) ──
# ── Attempt 2: Medium error text (truncated log ≤16KB instead of full 120KB) ──
sleep 1
local RETRY_PAYLOAD
RETRY_PAYLOAD=$(
@@ -974,7 +998,7 @@ EOF
"pve_version": "${pve_version}",
"method": "${METHOD:-default}",
"exit_code": ${exit_code},
"error": "${short_error}",
"error": "${medium_error}",
"error_category": "${error_category}",
"install_duration": ${duration},
"cpu_vendor": "${cpu_vendor}",
@@ -997,7 +1021,7 @@ EOF
return 0
fi
# ── Attempt 3: Minimal payload (bare minimum to set status) ──
# ── Attempt 3: Minimal payload with medium error (bare minimum to set status) ──
sleep 2
local MINIMAL_PAYLOAD
MINIMAL_PAYLOAD=$(
@@ -1009,7 +1033,7 @@ EOF
"nsapp": "${NSAPP:-unknown}",
"status": "${pb_status}",
"exit_code": ${exit_code},
"error": "${short_error}",
"error": "${medium_error}",
"error_category": "${error_category}",
"install_duration": ${duration}
}

View File

@@ -408,10 +408,29 @@ _send_abort_telemetry() {
[[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0
[[ -z "${RANDOM_UUID:-}" ]] && return 0
# Collect last 20 log lines for error diagnosis (best-effort)
# Collect last 200 log lines (capped at 16KB) for error diagnosis (best-effort)
# Container context has no get_full_log(), so we gather as much as possible
local error_text=""
local logfile=""
if [[ -n "${INSTALL_LOG:-}" && -s "${INSTALL_LOG}" ]]; then
error_text=$(tail -n 20 "$INSTALL_LOG" 2>/dev/null | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g; s/\\/\\\\/g; s/"/\\"/g; s/\r//g' | tr '\n' '|' | sed 's/|$//' | tr -d '\000-\010\013\014\016-\037\177') || true
logfile="${INSTALL_LOG}"
elif [[ -n "${SILENT_LOGFILE:-}" && -s "${SILENT_LOGFILE}" ]]; then
logfile="${SILENT_LOGFILE}"
fi
if [[ -n "$logfile" ]]; then
error_text=$(tail -n 200 "$logfile" 2>/dev/null | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g; s/\\/\\\\/g; s/"/\\"/g; s/\r//g' | tr '\n' '|' | sed 's/|$//' | head -c 16384 | tr -d '\000-\010\013\014\016-\037\177') || true
fi
# Prepend exit code explanation header (like build_error_string does on host)
local explanation=""
if declare -f explain_exit_code &>/dev/null; then
explanation=$(explain_exit_code "$exit_code" 2>/dev/null) || true
fi
if [[ -n "$explanation" && -n "$error_text" ]]; then
error_text="exit_code=${exit_code} | ${explanation}|---|${error_text}"
elif [[ -n "$explanation" && -z "$error_text" ]]; then
error_text="exit_code=${exit_code} | ${explanation}"
fi
# Calculate duration if start time is available
@@ -420,10 +439,17 @@ _send_abort_telemetry() {
duration=$(($(date +%s) - DIAGNOSTICS_START_TIME))
fi
# Categorize error if function is available (may not be in minimal container context)
local error_category=""
if declare -f categorize_error &>/dev/null; then
error_category=$(categorize_error "$exit_code" 2>/dev/null) || true
fi
# Build JSON payload with error context
local payload
payload="{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}"
[[ -n "$error_text" ]] && payload="${payload},\"error\":\"${error_text}\""
[[ -n "$error_category" ]] && payload="${payload},\"error_category\":\"${error_category}\""
[[ -n "$duration" ]] && payload="${payload},\"duration\":${duration}"
payload="${payload}}"