Classify exit-1 errors & guard telemetry

Analyze logs for generic exit code 1 and export an ERROR_CATEGORY_OVERRIDE so telemetry receives a more accurate error category (dependency, resource, network, or storage) based on the detected cause (APT failure, OOM, network problem, full disk, or command not found). Preserve any existing TELEMETRY_TYPE when posting updates. Add defense-in-depth by disabling strict error traps before running grep/sed log analysis to avoid spurious error_handler invocations. Mark successful installs with INSTALL_COMPLETE and update the error handler to only report a successful "done" telemetry state when INSTALL_COMPLETE is explicitly set, preventing false-positive success reports when a script exits early with status 0.
This commit is contained in:
CanbiZ (MickLesk)
2026-03-24 09:57:43 +01:00
parent 9aa0390553
commit 86c658909a
3 changed files with 104 additions and 54 deletions
+58 -36
View File
@@ -222,9 +222,12 @@ update_motd_ip() {
local current_ip="$(hostname -I | awk '{print $1}')"
# Escape sed special chars in replacement strings (& \ |)
current_os="${current_os//\\/\\\\}"; current_os="${current_os//&/\\&}"
current_hostname="${current_hostname//\\/\\\\}"; current_hostname="${current_hostname//&/\\&}"
current_ip="${current_ip//\\/\\\\}"; current_ip="${current_ip//&/\\&}"
current_os="${current_os//\\/\\\\}"
current_os="${current_os//&/\\&}"
current_hostname="${current_hostname//\\/\\\\}"
current_hostname="${current_hostname//&/\\&}"
current_ip="${current_ip//\\/\\\\}"
current_ip="${current_ip//&/\\&}"
# Update only if values actually changed
if ! grep -q "OS:.*$current_os" "$PROFILE_FILE" 2>/dev/null; then
@@ -4223,6 +4226,53 @@ EOF'
fi
fi
# Defense-in-depth: Ensure error handling stays disabled during recovery.
# Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail
# and trap 'error_handler' ERR. If any code path above called such a function,
# the grep/sed pipelines below would trigger error_handler on non-match (exit 1).
set +Eeuo pipefail
trap - ERR
# --- Exit code 1 subclassification: analyze logs BEFORE telemetry call ---
# Exit code 1 is generic ("General error"). Analyze logs to determine the
# real error category so telemetry gets a useful classification instead of "shell".
local is_oom=false
local is_network_issue=false
local is_apt_issue=false
local is_cmd_not_found=false
local is_disk_full=false
if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then
if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then
is_apt_issue=true
fi
if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then
is_oom=true
fi
if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then
is_network_issue=true
fi
if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then
is_cmd_not_found=true
fi
if grep -qiE 'ENOSPC|no space left on device|Disk quota exceeded|errno -28' "$combined_log"; then
is_disk_full=true
fi
fi
# Set override for categorize_error() so telemetry gets the real category
if [[ "$is_apt_issue" == true ]]; then
export ERROR_CATEGORY_OVERRIDE="dependency"
elif [[ "$is_oom" == true ]]; then
export ERROR_CATEGORY_OVERRIDE="resource"
elif [[ "$is_network_issue" == true ]]; then
export ERROR_CATEGORY_OVERRIDE="network"
elif [[ "$is_disk_full" == true ]]; then
export ERROR_CATEGORY_OVERRIDE="storage"
elif [[ "$is_cmd_not_found" == true ]]; then
export ERROR_CATEGORY_OVERRIDE="dependency"
fi
# Report failure to telemetry API (now with log available on host)
# NOTE: Do NOT use msg_info/spinner here — the background spinner process
# causes SIGTSTP in non-interactive shells (bash -c "$(curl ...)"), which
@@ -4231,13 +4281,6 @@ EOF'
post_update_to_api "failed" "$install_exit_code"
$STD echo -e "${TAB}${CM:-} Failure reported"
# Defense-in-depth: Ensure error handling stays disabled during recovery.
# Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail
# and trap 'error_handler' ERR. If any code path above called such a function,
# the grep/sed pipelines below would trigger error_handler on non-match (exit 1).
set +Eeuo pipefail
trap - ERR
# Show combined log location
if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then
msg_custom "📋" "${YW}" "Installation log: ${combined_log}"
@@ -4266,12 +4309,9 @@ EOF'
# Prompt user for cleanup with 60s timeout
echo ""
# Detect error type for smart recovery options
local is_oom=false
local is_network_issue=false
local is_apt_issue=false
local is_cmd_not_found=false
local is_disk_full=false
# Extend error detection for non-exit-1 codes (exit 1 was already analyzed above)
# The is_* flags were set above for exit code 1 log analysis; here we add
# exit-code-specific detections for other codes.
local error_explanation=""
if declare -f explain_exit_code >/dev/null 2>&1; then
error_explanation="$(explain_exit_code "$install_exit_code")"
@@ -4321,26 +4361,6 @@ EOF'
;;
esac
# Exit 1 subclassification: analyze logs to identify actual root cause
# Many exit 1 errors are actually APT, OOM, network, or command-not-found issues
if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then
if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then
is_apt_issue=true
fi
if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then
is_oom=true
fi
if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then
is_network_issue=true
fi
if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then
is_cmd_not_found=true
fi
if grep -qiE 'ENOSPC|no space left on device|Disk quota exceeded|errno -28' "$combined_log"; then
is_disk_full=true
fi
fi
# Show error explanation if available
if [[ -n "$error_explanation" ]]; then
echo -e "${TAB}${RD}Error: ${error_explanation}${CL}"
@@ -4542,6 +4562,7 @@ EOF'
if [[ $apt_retry_code -eq 0 ]]; then
msg_ok "Installation completed successfully after APT repair!"
INSTALL_COMPLETE=true
post_update_to_api "done" "0" "force"
return 0
else
@@ -5716,6 +5737,7 @@ EOF
systemctl start ping-instances.service
fi
INSTALL_COMPLETE=true
post_update_to_api "done" "none"
}