diff --git a/misc/api.func b/misc/api.func index e29975e2c..e895c84e6 100644 --- a/misc/api.func +++ b/misc/api.func @@ -348,10 +348,10 @@ explain_exit_code() { json_escape() { # Escape a string for safe JSON embedding using awk (handles any input size). # Pipeline: strip ANSI → remove control chars → escape \ " TAB → join lines with \n - printf '%s' "$1" \ - | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g' \ - | tr -d '\000-\010\013\014\016-\037\177\r' \ - | awk ' + printf '%s' "$1" | + sed 's/\x1b\[[0-9;]*[a-zA-Z]//g' | + tr -d '\000-\010\013\014\016-\037\177\r' | + awk ' BEGIN { ORS = "" } { gsub(/\\/, "\\\\") # backslash → \\ @@ -627,8 +627,8 @@ post_to_api() { [[ "${DEV_MODE:-}" == "true" ]] && echo "[DEBUG] post_to_api() DIAGNOSTICS=$DIAGNOSTICS RANDOM_UUID=$RANDOM_UUID NSAPP=$NSAPP" >&2 - # Set type for later status updates - TELEMETRY_TYPE="lxc" + # Set type for later status updates (preserve if already set, e.g. turnkey) + TELEMETRY_TYPE="${TELEMETRY_TYPE:-lxc}" local pve_version="" if command -v pveversion &>/dev/null; then @@ -692,6 +692,7 @@ EOF # Send initial "installing" record with retry. # This record MUST exist for all subsequent updates to succeed. local http_code="" attempt + local _post_success=false for attempt in 1 2 3; do if [[ "${DEV_MODE:-}" == "true" ]]; then http_code=$(curl -sS -w "%{http_code}" -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \ @@ -703,11 +704,19 @@ EOF -H "Content-Type: application/json" \ -d "$JSON_PAYLOAD" -o /dev/null 2>/dev/null) || http_code="000" fi - [[ "$http_code" =~ ^2[0-9]{2}$ ]] && break + if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then + _post_success=true + break + fi [[ "$attempt" -lt 3 ]] && sleep 1 done - POST_TO_API_DONE=true + # Only mark done if at least one attempt succeeded. + # If all 3 failed, POST_TO_API_DONE stays false so post_update_to_api + # and on_exit() know the initial record was never created. 
+ # The server has fallback logic to create a new record on status updates, + # so subsequent calls can still succeed even without the initial record. + POST_TO_API_DONE=${_post_success} } # ------------------------------------------------------------------------------ @@ -798,15 +807,19 @@ EOF # Send initial "installing" record with retry (must succeed for updates to work) local http_code="" attempt + local _post_success=false for attempt in 1 2 3; do http_code=$(curl -sS -w "%{http_code}" -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \ -H "Content-Type: application/json" \ -d "$JSON_PAYLOAD" -o /dev/null 2>/dev/null) || http_code="000" - [[ "$http_code" =~ ^2[0-9]{2}$ ]] && break + if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then + _post_success=true + break + fi [[ "$attempt" -lt 3 ]] && sleep 1 done - POST_TO_API_DONE=true + POST_TO_API_DONE=${_post_success} } # ------------------------------------------------------------------------------ @@ -1083,6 +1096,12 @@ EOF # - Used to group errors in dashboard # ------------------------------------------------------------------------------ categorize_error() { + # Allow build.func to override category based on log analysis (exit code 1 subclassification) + if [[ -n "${ERROR_CATEGORY_OVERRIDE:-}" ]]; then + echo "$ERROR_CATEGORY_OVERRIDE" + return + fi + local code="$1" case "$code" in # Network errors (curl/wget) diff --git a/misc/build.func b/misc/build.func index b8483ca0d..304f1c46b 100644 --- a/misc/build.func +++ b/misc/build.func @@ -222,9 +222,12 @@ update_motd_ip() { local current_ip="$(hostname -I | awk '{print $1}')" # Escape sed special chars in replacement strings (& \ |) - current_os="${current_os//\\/\\\\}"; current_os="${current_os//&/\\&}" - current_hostname="${current_hostname//\\/\\\\}"; current_hostname="${current_hostname//&/\\&}" - current_ip="${current_ip//\\/\\\\}"; current_ip="${current_ip//&/\\&}" + current_os="${current_os//\\/\\\\}" + current_os="${current_os//&/\\&}" + 
current_hostname="${current_hostname//\\/\\\\}" + current_hostname="${current_hostname//&/\\&}" + current_ip="${current_ip//\\/\\\\}" + current_ip="${current_ip//&/\\&}" # Update only if values actually changed if ! grep -q "OS:.*$current_os" "$PROFILE_FILE" 2>/dev/null; then @@ -4223,6 +4226,53 @@ EOF' fi fi + # Defense-in-depth: Ensure error handling stays disabled during recovery. + # Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail + # and trap 'error_handler' ERR. If any code path above called such a function, + # the grep/sed pipelines below would trigger error_handler on non-match (exit 1). + set +Eeuo pipefail + trap - ERR + + # --- Exit code 1 subclassification: analyze logs BEFORE telemetry call --- + # Exit code 1 is generic ("General error"). Analyze logs to determine the + # real error category so telemetry gets a useful classification instead of "shell". + local is_oom=false + local is_network_issue=false + local is_apt_issue=false + local is_cmd_not_found=false + local is_disk_full=false + + if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then + if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then + is_apt_issue=true + fi + if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then + is_oom=true + fi + if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then + is_network_issue=true + fi + if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then + is_cmd_not_found=true + fi + if grep -qiE 'ENOSPC|no space left on device|Disk quota exceeded|errno -28' "$combined_log"; then + is_disk_full=true + fi + fi + + # Set override for categorize_error() so telemetry gets the real category + if [[ "$is_apt_issue" == true ]]; then + export 
ERROR_CATEGORY_OVERRIDE="dependency" + elif [[ "$is_oom" == true ]]; then + export ERROR_CATEGORY_OVERRIDE="resource" + elif [[ "$is_network_issue" == true ]]; then + export ERROR_CATEGORY_OVERRIDE="network" + elif [[ "$is_disk_full" == true ]]; then + export ERROR_CATEGORY_OVERRIDE="storage" + elif [[ "$is_cmd_not_found" == true ]]; then + export ERROR_CATEGORY_OVERRIDE="dependency" + fi + # Report failure to telemetry API (now with log available on host) # NOTE: Do NOT use msg_info/spinner here — the background spinner process # causes SIGTSTP in non-interactive shells (bash -c "$(curl ...)"), which @@ -4231,13 +4281,13 @@ EOF' post_update_to_api "failed" "$install_exit_code" $STD echo -e "${TAB}${CM:-✔} Failure reported" # Defense-in-depth: Ensure error handling stays disabled during recovery. # Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail # and trap 'error_handler' ERR. If any code path above called such a function, # the grep/sed pipelines below would trigger error_handler on non-match (exit 1). set +Eeuo pipefail trap - ERR # Show combined log location if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then msg_custom "📋" "${YW}" "Installation log: ${combined_log}" @@ -4266,12 +4316,9 @@ EOF' # Prompt user for cleanup with 60s timeout echo "" - # Detect error type for smart recovery options - local is_oom=false - local is_network_issue=false - local is_apt_issue=false - local is_cmd_not_found=false - local is_disk_full=false + # Extend error detection for non-exit-1 codes (exit 1 was already analyzed above) + # The is_* flags were set above for exit code 1 log analysis; here we add + # exit-code-specific detections for other codes. 
local error_explanation="" if declare -f explain_exit_code >/dev/null 2>&1; then error_explanation="$(explain_exit_code "$install_exit_code")" @@ -4321,26 +4368,6 @@ EOF' ;; esac - # Exit 1 subclassification: analyze logs to identify actual root cause - # Many exit 1 errors are actually APT, OOM, network, or command-not-found issues - if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then - if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then - is_apt_issue=true - fi - if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then - is_oom=true - fi - if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then - is_network_issue=true - fi - if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then - is_cmd_not_found=true - fi - if grep -qiE 'ENOSPC|no space left on device|Disk quota exceeded|errno -28' "$combined_log"; then - is_disk_full=true - fi - fi - # Show error explanation if available if [[ -n "$error_explanation" ]]; then echo -e "${TAB}${RD}Error: ${error_explanation}${CL}" @@ -4542,6 +4569,7 @@ EOF' if [[ $apt_retry_code -eq 0 ]]; then msg_ok "Installation completed successfully after APT repair!" + INSTALL_COMPLETE=true post_update_to_api "done" "0" "force" return 0 else @@ -5716,6 +5744,7 @@ EOF systemctl start ping-instances.service fi + INSTALL_COMPLETE=true post_update_to_api "done" "none" } diff --git a/misc/error_handler.func b/misc/error_handler.func index 39e5e667f..bf81ea9bf 100644 --- a/misc/error_handler.func +++ b/misc/error_handler.func @@ -507,14 +507,23 @@ _stop_container_if_installing() { on_exit() { local exit_code=$? 
- # Report orphaned "installing" records to telemetry API - # Catches ALL exit paths: errors, signals, AND clean exits where - # post_to_api was called but post_update_to_api was never called - if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then - if [[ $exit_code -ne 0 ]]; then - _send_abort_telemetry "$exit_code" - elif declare -f post_update_to_api >/dev/null 2>&1; then - post_update_to_api "done" "0" 2>/dev/null || true + # Report orphaned telemetry records + # Two scenarios handled: + # 1. POST_TO_API_DONE=true but POST_UPDATE_DONE=false: Record was created but + # never got a final status update → send abort/done now. + # 2. POST_TO_API_DONE=false but DIAGNOSTICS=yes: Initial post failed (server + # unreachable/timeout), but the server has fallback create-on-update logic, + # so a status update can still create the record. Worth one last try. + if [[ "${POST_UPDATE_DONE:-}" != "true" ]]; then + if [[ "${POST_TO_API_DONE:-}" == "true" || "${DIAGNOSTICS:-no}" == "yes" ]]; then + if [[ $exit_code -ne 0 ]]; then + _send_abort_telemetry "$exit_code" + elif [[ "${INSTALL_COMPLETE:-}" == "true" ]] && declare -f post_update_to_api >/dev/null 2>&1; then + # Only report success if the install was explicitly marked complete. + # Without this guard, early bailouts (e.g. user cancelled) with exit 0 + # would be falsely reported as successful installations. + post_update_to_api "done" "0" 2>/dev/null || true + fi fi fi