mirror of
https://github.com/community-scripts/ProxmoxVE.git
synced 2026-04-26 12:20:40 +00:00
Classify exit-1 errors & guard telemetry
When an install fails with the generic exit code 1, analyze the combined log and export ERROR_CATEGORY_OVERRIDE so telemetry receives a more accurate error category (dependency, resource, network, or storage) derived from the detected cause (APT failure, OOM, network problem, full disk, or command not found). Preserve any existing TELEMETRY_TYPE in post_to_api so later status updates keep it. Add defense-in-depth by disabling strict error handling (set +Eeuo pipefail) and removing the ERR trap before running the grep/sed log analysis, to avoid spurious error_handler invocations. Mark successful installs with INSTALL_COMPLETE and update on_exit to report a successful "done" telemetry state only when INSTALL_COMPLETE is explicitly set, preventing false-positive success reports from early exits with status 0.
This commit is contained in:
+29
-10
@@ -348,10 +348,10 @@ explain_exit_code() {
|
|||||||
json_escape() {
|
json_escape() {
|
||||||
# Escape a string for safe JSON embedding using awk (handles any input size).
|
# Escape a string for safe JSON embedding using awk (handles any input size).
|
||||||
# Pipeline: strip ANSI → remove control chars → escape \ " TAB → join lines with \n
|
# Pipeline: strip ANSI → remove control chars → escape \ " TAB → join lines with \n
|
||||||
printf '%s' "$1" \
|
printf '%s' "$1" |
|
||||||
| sed 's/\x1b\[[0-9;]*[a-zA-Z]//g' \
|
sed 's/\x1b\[[0-9;]*[a-zA-Z]//g' |
|
||||||
| tr -d '\000-\010\013\014\016-\037\177\r' \
|
tr -d '\000-\010\013\014\016-\037\177\r' |
|
||||||
| awk '
|
awk '
|
||||||
BEGIN { ORS = "" }
|
BEGIN { ORS = "" }
|
||||||
{
|
{
|
||||||
gsub(/\\/, "\\\\") # backslash → \\
|
gsub(/\\/, "\\\\") # backslash → \\
|
||||||
@@ -627,8 +627,8 @@ post_to_api() {
|
|||||||
|
|
||||||
[[ "${DEV_MODE:-}" == "true" ]] && echo "[DEBUG] post_to_api() DIAGNOSTICS=$DIAGNOSTICS RANDOM_UUID=$RANDOM_UUID NSAPP=$NSAPP" >&2
|
[[ "${DEV_MODE:-}" == "true" ]] && echo "[DEBUG] post_to_api() DIAGNOSTICS=$DIAGNOSTICS RANDOM_UUID=$RANDOM_UUID NSAPP=$NSAPP" >&2
|
||||||
|
|
||||||
# Set type for later status updates
|
# Set type for later status updates (preserve if already set, e.g. turnkey)
|
||||||
TELEMETRY_TYPE="lxc"
|
TELEMETRY_TYPE="${TELEMETRY_TYPE:-lxc}"
|
||||||
|
|
||||||
local pve_version=""
|
local pve_version=""
|
||||||
if command -v pveversion &>/dev/null; then
|
if command -v pveversion &>/dev/null; then
|
||||||
@@ -692,6 +692,7 @@ EOF
|
|||||||
# Send initial "installing" record with retry.
|
# Send initial "installing" record with retry.
|
||||||
# This record MUST exist for all subsequent updates to succeed.
|
# This record MUST exist for all subsequent updates to succeed.
|
||||||
local http_code="" attempt
|
local http_code="" attempt
|
||||||
|
local _post_success=false
|
||||||
for attempt in 1 2 3; do
|
for attempt in 1 2 3; do
|
||||||
if [[ "${DEV_MODE:-}" == "true" ]]; then
|
if [[ "${DEV_MODE:-}" == "true" ]]; then
|
||||||
http_code=$(curl -sS -w "%{http_code}" -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
|
http_code=$(curl -sS -w "%{http_code}" -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
|
||||||
@@ -703,11 +704,19 @@ EOF
|
|||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d "$JSON_PAYLOAD" -o /dev/null 2>/dev/null) || http_code="000"
|
-d "$JSON_PAYLOAD" -o /dev/null 2>/dev/null) || http_code="000"
|
||||||
fi
|
fi
|
||||||
[[ "$http_code" =~ ^2[0-9]{2}$ ]] && break
|
if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then
|
||||||
|
_post_success=true
|
||||||
|
break
|
||||||
|
fi
|
||||||
[[ "$attempt" -lt 3 ]] && sleep 1
|
[[ "$attempt" -lt 3 ]] && sleep 1
|
||||||
done
|
done
|
||||||
|
|
||||||
POST_TO_API_DONE=true
|
# Only mark done if at least one attempt succeeded.
|
||||||
|
# If all 3 failed, POST_TO_API_DONE stays false so post_update_to_api
|
||||||
|
# and on_exit() know the initial record was never created.
|
||||||
|
# The server has fallback logic to create a new record on status updates,
|
||||||
|
# so subsequent calls can still succeed even without the initial record.
|
||||||
|
POST_TO_API_DONE=${_post_success}
|
||||||
}
|
}
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
@@ -798,15 +807,19 @@ EOF
|
|||||||
|
|
||||||
# Send initial "installing" record with retry (must succeed for updates to work)
|
# Send initial "installing" record with retry (must succeed for updates to work)
|
||||||
local http_code="" attempt
|
local http_code="" attempt
|
||||||
|
local _post_success=false
|
||||||
for attempt in 1 2 3; do
|
for attempt in 1 2 3; do
|
||||||
http_code=$(curl -sS -w "%{http_code}" -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
|
http_code=$(curl -sS -w "%{http_code}" -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d "$JSON_PAYLOAD" -o /dev/null 2>/dev/null) || http_code="000"
|
-d "$JSON_PAYLOAD" -o /dev/null 2>/dev/null) || http_code="000"
|
||||||
[[ "$http_code" =~ ^2[0-9]{2}$ ]] && break
|
if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then
|
||||||
|
_post_success=true
|
||||||
|
break
|
||||||
|
fi
|
||||||
[[ "$attempt" -lt 3 ]] && sleep 1
|
[[ "$attempt" -lt 3 ]] && sleep 1
|
||||||
done
|
done
|
||||||
|
|
||||||
POST_TO_API_DONE=true
|
POST_TO_API_DONE=${_post_success}
|
||||||
}
|
}
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
@@ -1083,6 +1096,12 @@ EOF
|
|||||||
# - Used to group errors in dashboard
|
# - Used to group errors in dashboard
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
categorize_error() {
|
categorize_error() {
|
||||||
|
# Allow build.func to override category based on log analysis (exit code 1 subclassification)
|
||||||
|
if [[ -n "${ERROR_CATEGORY_OVERRIDE:-}" ]]; then
|
||||||
|
echo "$ERROR_CATEGORY_OVERRIDE"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
local code="$1"
|
local code="$1"
|
||||||
case "$code" in
|
case "$code" in
|
||||||
# Network errors (curl/wget)
|
# Network errors (curl/wget)
|
||||||
|
|||||||
+58
-36
@@ -222,9 +222,12 @@ update_motd_ip() {
|
|||||||
local current_ip="$(hostname -I | awk '{print $1}')"
|
local current_ip="$(hostname -I | awk '{print $1}')"
|
||||||
|
|
||||||
# Escape sed special chars in replacement strings (& \ |)
|
# Escape sed special chars in replacement strings (& \ |)
|
||||||
current_os="${current_os//\\/\\\\}"; current_os="${current_os//&/\\&}"
|
current_os="${current_os//\\/\\\\}"
|
||||||
current_hostname="${current_hostname//\\/\\\\}"; current_hostname="${current_hostname//&/\\&}"
|
current_os="${current_os//&/\\&}"
|
||||||
current_ip="${current_ip//\\/\\\\}"; current_ip="${current_ip//&/\\&}"
|
current_hostname="${current_hostname//\\/\\\\}"
|
||||||
|
current_hostname="${current_hostname//&/\\&}"
|
||||||
|
current_ip="${current_ip//\\/\\\\}"
|
||||||
|
current_ip="${current_ip//&/\\&}"
|
||||||
|
|
||||||
# Update only if values actually changed
|
# Update only if values actually changed
|
||||||
if ! grep -q "OS:.*$current_os" "$PROFILE_FILE" 2>/dev/null; then
|
if ! grep -q "OS:.*$current_os" "$PROFILE_FILE" 2>/dev/null; then
|
||||||
@@ -4223,6 +4226,53 @@ EOF'
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Defense-in-depth: Ensure error handling stays disabled during recovery.
|
||||||
|
# Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail
|
||||||
|
# and trap 'error_handler' ERR. If any code path above called such a function,
|
||||||
|
# the grep/sed pipelines below would trigger error_handler on non-match (exit 1).
|
||||||
|
set +Eeuo pipefail
|
||||||
|
trap - ERR
|
||||||
|
|
||||||
|
# --- Exit code 1 subclassification: analyze logs BEFORE telemetry call ---
|
||||||
|
# Exit code 1 is generic ("General error"). Analyze logs to determine the
|
||||||
|
# real error category so telemetry gets a useful classification instead of "shell".
|
||||||
|
local is_oom=false
|
||||||
|
local is_network_issue=false
|
||||||
|
local is_apt_issue=false
|
||||||
|
local is_cmd_not_found=false
|
||||||
|
local is_disk_full=false
|
||||||
|
|
||||||
|
if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then
|
||||||
|
if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then
|
||||||
|
is_apt_issue=true
|
||||||
|
fi
|
||||||
|
if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then
|
||||||
|
is_oom=true
|
||||||
|
fi
|
||||||
|
if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then
|
||||||
|
is_network_issue=true
|
||||||
|
fi
|
||||||
|
if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then
|
||||||
|
is_cmd_not_found=true
|
||||||
|
fi
|
||||||
|
if grep -qiE 'ENOSPC|no space left on device|Disk quota exceeded|errno -28' "$combined_log"; then
|
||||||
|
is_disk_full=true
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Set override for categorize_error() so telemetry gets the real category
|
||||||
|
if [[ "$is_apt_issue" == true ]]; then
|
||||||
|
export ERROR_CATEGORY_OVERRIDE="dependency"
|
||||||
|
elif [[ "$is_oom" == true ]]; then
|
||||||
|
export ERROR_CATEGORY_OVERRIDE="resource"
|
||||||
|
elif [[ "$is_network_issue" == true ]]; then
|
||||||
|
export ERROR_CATEGORY_OVERRIDE="network"
|
||||||
|
elif [[ "$is_disk_full" == true ]]; then
|
||||||
|
export ERROR_CATEGORY_OVERRIDE="storage"
|
||||||
|
elif [[ "$is_cmd_not_found" == true ]]; then
|
||||||
|
export ERROR_CATEGORY_OVERRIDE="dependency"
|
||||||
|
fi
|
||||||
|
|
||||||
# Report failure to telemetry API (now with log available on host)
|
# Report failure to telemetry API (now with log available on host)
|
||||||
# NOTE: Do NOT use msg_info/spinner here — the background spinner process
|
# NOTE: Do NOT use msg_info/spinner here — the background spinner process
|
||||||
# causes SIGTSTP in non-interactive shells (bash -c "$(curl ...)"), which
|
# causes SIGTSTP in non-interactive shells (bash -c "$(curl ...)"), which
|
||||||
@@ -4231,13 +4281,6 @@ EOF'
|
|||||||
post_update_to_api "failed" "$install_exit_code"
|
post_update_to_api "failed" "$install_exit_code"
|
||||||
$STD echo -e "${TAB}${CM:-✔} Failure reported"
|
$STD echo -e "${TAB}${CM:-✔} Failure reported"
|
||||||
|
|
||||||
# Defense-in-depth: Ensure error handling stays disabled during recovery.
|
|
||||||
# Some functions (e.g. silent/$STD) unconditionally re-enable set -Eeuo pipefail
|
|
||||||
# and trap 'error_handler' ERR. If any code path above called such a function,
|
|
||||||
# the grep/sed pipelines below would trigger error_handler on non-match (exit 1).
|
|
||||||
set +Eeuo pipefail
|
|
||||||
trap - ERR
|
|
||||||
|
|
||||||
# Show combined log location
|
# Show combined log location
|
||||||
if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then
|
if [[ -n "$CTID" && -n "${SESSION_ID:-}" ]]; then
|
||||||
msg_custom "📋" "${YW}" "Installation log: ${combined_log}"
|
msg_custom "📋" "${YW}" "Installation log: ${combined_log}"
|
||||||
@@ -4266,12 +4309,9 @@ EOF'
|
|||||||
# Prompt user for cleanup with 60s timeout
|
# Prompt user for cleanup with 60s timeout
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# Detect error type for smart recovery options
|
# Extend error detection for non-exit-1 codes (exit 1 was already analyzed above)
|
||||||
local is_oom=false
|
# The is_* flags were set above for exit code 1 log analysis; here we add
|
||||||
local is_network_issue=false
|
# exit-code-specific detections for other codes.
|
||||||
local is_apt_issue=false
|
|
||||||
local is_cmd_not_found=false
|
|
||||||
local is_disk_full=false
|
|
||||||
local error_explanation=""
|
local error_explanation=""
|
||||||
if declare -f explain_exit_code >/dev/null 2>&1; then
|
if declare -f explain_exit_code >/dev/null 2>&1; then
|
||||||
error_explanation="$(explain_exit_code "$install_exit_code")"
|
error_explanation="$(explain_exit_code "$install_exit_code")"
|
||||||
@@ -4321,26 +4361,6 @@ EOF'
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# Exit 1 subclassification: analyze logs to identify actual root cause
|
|
||||||
# Many exit 1 errors are actually APT, OOM, network, or command-not-found issues
|
|
||||||
if [[ $install_exit_code -eq 1 && -f "$combined_log" ]]; then
|
|
||||||
if grep -qiE 'E: Unable to|E: Package|E: Failed to fetch|dpkg.*error|broken packages|unmet dependencies|dpkg --configure -a' "$combined_log"; then
|
|
||||||
is_apt_issue=true
|
|
||||||
fi
|
|
||||||
if grep -qiE 'Cannot allocate memory|Out of memory|oom-killer|Killed process|JavaScript heap' "$combined_log"; then
|
|
||||||
is_oom=true
|
|
||||||
fi
|
|
||||||
if grep -qiE 'Could not resolve|DNS|Connection refused|Network is unreachable|No route to host|Temporary failure resolving|Failed to fetch' "$combined_log"; then
|
|
||||||
is_network_issue=true
|
|
||||||
fi
|
|
||||||
if grep -qiE ': command not found|No such file or directory.*/s?bin/' "$combined_log"; then
|
|
||||||
is_cmd_not_found=true
|
|
||||||
fi
|
|
||||||
if grep -qiE 'ENOSPC|no space left on device|Disk quota exceeded|errno -28' "$combined_log"; then
|
|
||||||
is_disk_full=true
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Show error explanation if available
|
# Show error explanation if available
|
||||||
if [[ -n "$error_explanation" ]]; then
|
if [[ -n "$error_explanation" ]]; then
|
||||||
echo -e "${TAB}${RD}Error: ${error_explanation}${CL}"
|
echo -e "${TAB}${RD}Error: ${error_explanation}${CL}"
|
||||||
@@ -4542,6 +4562,7 @@ EOF'
|
|||||||
|
|
||||||
if [[ $apt_retry_code -eq 0 ]]; then
|
if [[ $apt_retry_code -eq 0 ]]; then
|
||||||
msg_ok "Installation completed successfully after APT repair!"
|
msg_ok "Installation completed successfully after APT repair!"
|
||||||
|
INSTALL_COMPLETE=true
|
||||||
post_update_to_api "done" "0" "force"
|
post_update_to_api "done" "0" "force"
|
||||||
return 0
|
return 0
|
||||||
else
|
else
|
||||||
@@ -5716,6 +5737,7 @@ EOF
|
|||||||
systemctl start ping-instances.service
|
systemctl start ping-instances.service
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
INSTALL_COMPLETE=true
|
||||||
post_update_to_api "done" "none"
|
post_update_to_api "done" "none"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+17
-8
@@ -507,14 +507,23 @@ _stop_container_if_installing() {
|
|||||||
on_exit() {
|
on_exit() {
|
||||||
local exit_code=$?
|
local exit_code=$?
|
||||||
|
|
||||||
# Report orphaned "installing" records to telemetry API
|
# Report orphaned telemetry records
|
||||||
# Catches ALL exit paths: errors, signals, AND clean exits where
|
# Two scenarios handled:
|
||||||
# post_to_api was called but post_update_to_api was never called
|
# 1. POST_TO_API_DONE=true but POST_UPDATE_DONE=false: Record was created but
|
||||||
if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
|
# never got a final status update → send abort/done now.
|
||||||
if [[ $exit_code -ne 0 ]]; then
|
# 2. POST_TO_API_DONE=false but DIAGNOSTICS=yes: Initial post failed (server
|
||||||
_send_abort_telemetry "$exit_code"
|
# unreachable/timeout), but the server has fallback create-on-update logic,
|
||||||
elif declare -f post_update_to_api >/dev/null 2>&1; then
|
# so a status update can still create the record. Worth one last try.
|
||||||
post_update_to_api "done" "0" 2>/dev/null || true
|
if [[ "${POST_UPDATE_DONE:-}" != "true" ]]; then
|
||||||
|
if [[ "${POST_TO_API_DONE:-}" == "true" || "${DIAGNOSTICS:-no}" == "yes" ]]; then
|
||||||
|
if [[ $exit_code -ne 0 ]]; then
|
||||||
|
_send_abort_telemetry "$exit_code"
|
||||||
|
elif [[ "${INSTALL_COMPLETE:-}" == "true" ]] && declare -f post_update_to_api >/dev/null 2>&1; then
|
||||||
|
# Only report success if the install was explicitly marked complete.
|
||||||
|
# Without this guard, early bailouts (e.g. user cancelled) with exit 0
|
||||||
|
# would be falsely reported as successful installations.
|
||||||
|
post_update_to_api "done" "0" 2>/dev/null || true
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user