Update notification service

2026-05-25 18:04:43 +00:00 · 2026-03-27 19:15:11 +01:00
parent 7c5e7208b9
commit 6bb9313b95
8 changed files with 319 additions and 255 deletions
@@ -16,7 +16,7 @@ import {
  AlertTriangle, Info, Settings2, Zap, Eye, EyeOff,
  Trash2, ChevronDown, ChevronUp, ChevronRight, TestTube2, Mail, Webhook,
  Copy, Server, Shield, ExternalLink, RefreshCw, Download, Upload,
-  Cloud, Brain, Globe, MessageSquareText, Sparkles, Pencil, Save, RotateCcw
+  Cloud, Brain, Globe, MessageSquareText, Sparkles, Pencil, Save, RotateCcw, Lightbulb
 } from "lucide-react"

 interface ChannelConfig {
@@ -67,6 +67,7 @@ interface NotificationConfig {
  ai_openai_base_url: string
  ai_prompt_mode: string  // 'default' or 'custom'
  ai_custom_prompt: string  // User's custom prompt
+  ai_allow_suggestions: string | boolean  // Enable AI suggestions (experimental)
  channel_ai_detail: Record<string, string>
  hostname: string
  webhook_secret: string
@@ -252,6 +253,7 @@ const DEFAULT_CONFIG: NotificationConfig = {
  ai_openai_base_url: "",
  ai_prompt_mode: "default",
  ai_custom_prompt: "",
+  ai_allow_suggestions: "false",
  channel_ai_detail: {
    telegram: "brief",
    gotify: "brief",
@@ -321,9 +323,10 @@ export function NotificationSettings() {
            openai: "",
            openrouter: "",
          },
-          ai_prompt_mode: data.config.ai_prompt_mode || "default",
-          ai_custom_prompt: data.config.ai_custom_prompt || "",
-        }
+                ai_prompt_mode: data.config.ai_prompt_mode || "default",
+                ai_custom_prompt: data.config.ai_custom_prompt || "",
+                ai_allow_suggestions: data.config.ai_allow_suggestions || "false",
+              }
        // If ai_model exists but ai_models doesn't have it, save it
        if (configWithDefaults.ai_model && !configWithDefaults.ai_models[configWithDefaults.ai_provider]) {
          configWithDefaults.ai_models[configWithDefaults.ai_provider] = configWithDefaults.ai_model
@@ -545,8 +548,9 @@ export function NotificationSettings() {
    ai_language: cfg.ai_language,
    ai_ollama_url: cfg.ai_ollama_url,
    ai_openai_base_url: cfg.ai_openai_base_url,
-    ai_prompt_mode: cfg.ai_prompt_mode || "default",
-    ai_custom_prompt: cfg.ai_custom_prompt || "",
+      ai_prompt_mode: cfg.ai_prompt_mode || "default",
+      ai_custom_prompt: cfg.ai_custom_prompt || "",
+      ai_allow_suggestions: cfg.ai_allow_suggestions || "false",
    hostname: cfg.hostname,
    webhook_secret: cfg.webhook_secret,
    webhook_allowed_ips: cfg.webhook_allowed_ips,
@@ -1846,6 +1850,26 @@ export function NotificationSettings() {
                              </p>
                            </div>
                          </div>
+                          
+                          {/* Experimental: AI Suggestions toggle */}
+                          <div className="space-y-2 pt-3 border-t border-border/50">
+                            <div className="flex items-center justify-between">
+                              <div className="flex items-center gap-2">
+                                <Lightbulb className="h-4 w-4 text-yellow-400" />
+                                <Label className="text-xs sm:text-sm text-foreground/80">AI Suggestions</Label>
+                                <span className="text-[10px] px-1.5 py-0.5 rounded bg-yellow-500/20 text-yellow-400 font-medium">BETA</span>
+                              </div>
+                              <Switch
+                                checked={config.ai_allow_suggestions === "true" || config.ai_allow_suggestions === true}
+                                onCheckedChange={v => updateConfig(p => ({ ...p, ai_allow_suggestions: v ? "true" : "false" }))}
+                                disabled={!editMode}
+                              />
+                            </div>
+                            <p className="text-xs text-muted-foreground leading-relaxed">
+                              When enabled, AI may add brief troubleshooting tips based on journal log context.
+                              Tips are factual and based only on what the logs show.
+                            </p>
+                          </div>
                        </div>
                      )}
                      
@@ -848,6 +848,91 @@ def get_current_latency(target='gateway'):
        return {'target': target, 'latency_avg': None, 'status': 'error'}


+def _capture_health_journal_context(categories: list, reason: str = '') -> str:
+    """Capture recent journal lines relevant to the given health categories.
+    
+    Each health category is mapped to a set of journal keywords. IP addresses
+    and quoted names found in ``reason`` are added as extra keywords. The
+    journal entries of the last 10 minutes (at most 500 lines) are then
+    filtered case-insensitively and the last 30 matching lines are returned,
+    so the AI receives relevant system logs for diagnosis.
+    
+    The journal is read with an argument list (no shell) and filtered in
+    Python. Keywords come from free-form reason strings and may contain
+    quotes or shell metacharacters, so they must never be interpolated into
+    a shell command line.
+    
+    Args:
+        categories: List of health category keys (e.g., ['storage', 'network']).
+            Unknown or non-string entries are ignored.
+        reason: The reason string from health check (used to extract more keywords)
+    
+    Returns:
+        Matching journal lines joined by newlines, or "" when there are no
+        keywords, no matching lines, or the journal cannot be read.
+    """
+    import subprocess
+    import re
+    
+    # Map health categories to relevant journal keywords
+    CATEGORY_KEYWORDS = {
+        'storage': ['mount', 'nfs', 'cifs', 'smb', 'zfs', 'lvm', 'disk', 'nvme', 
+                    'sata', 'ata', 'I/O error', 'read error', 'write error',
+                    'filesystem', 'ext4', 'xfs', 'btrfs', 'pbs', 'datastore'],
+        'disks': ['smartd', 'smart', 'ata', 'sata', 'nvme', 'disk', 'I/O error',
+                  'bad sector', 'reallocated', 'pending sector', 'uncorrectable'],
+        'network': ['bond', 'bridge', 'vmbr', 'eth', 'network', 'link down',
+                    'carrier', 'no route', 'unreachable', 'timeout', 'connection'],
+        'services': ['pveproxy', 'pvedaemon', 'pvestatd', 'corosync', 'ceph',
+                     'systemd', 'failed', 'service', 'unit', 'start', 'stop'],
+        'vms': ['qemu', 'kvm', 'lxc', 'vzdump', 'qm', 'pct', 'guest agent',
+                'qemu-ga', 'migration', 'snapshot'],
+        'memory': ['oom', 'out of memory', 'killed process', 'swap', 'memory'],
+        'cpu': ['thermal', 'temperature', 'throttl', 'mce', 'machine check'],
+        'updates': ['apt', 'dpkg', 'upgrade', 'update', 'package'],
+        'certificates': ['ssl', 'certificate', 'cert', 'expired', 'pve-ssl'],
+        'logs': ['rsyslog', 'journal', 'log rotation'],
+        'latency': ['ping', 'latency', 'timeout', 'unreachable', 'network'],
+    }
+    
+    # Collect keywords for all degraded categories
+    keywords = set()
+    for cat in categories or []:
+        if isinstance(cat, str):
+            keywords.update(CATEGORY_KEYWORDS.get(cat.lower(), ()))
+    
+    # Extract additional keywords from reason (IPs, quoted storage/service names)
+    if reason:
+        # Find IP addresses
+        keywords.update(re.findall(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', reason))
+        
+        # Each match is a (single-quoted, double-quoted) tuple; only one side is non-empty
+        for match in re.findall(r"'([^']+)'|\"([^\"]+)\"", reason):
+            keywords.update(w for w in match if w)
+    
+    keywords.discard('')
+    if not keywords:
+        return ""
+    
+    try:
+        # re.escape yields Python-regex escapes, so match with Python's re
+        # rather than handing the pattern to grep. Sorted for a stable pattern.
+        matcher = re.compile(
+            "|".join(re.escape(k) for k in sorted(keywords)),
+            re.IGNORECASE,
+        )
+        
+        # Argument list + no shell: keyword content can never reach a shell
+        result = subprocess.run(
+            ['journalctl', '--since=10 minutes ago', '--no-pager', '-n', '500'],
+            capture_output=True,
+            text=True,
+            errors='replace',  # journal lines may contain undecodable bytes
+            timeout=5
+        )
+        
+        matching = [line for line in result.stdout.splitlines() if matcher.search(line)]
+        # Keep only the most recent 30 matches (equivalent of `tail -n 30`)
+        return "\n".join(matching[-30:]).strip()
+    except Exception:
+        # Deliberately best-effort: journal context is optional enrichment and
+        # must never break the health notification path.
+        return ""
+
+
 def _health_collector_loop():
    """Background thread: run full health checks every 5 minutes.
    Keeps the health cache always fresh and records events/errors in the DB.
@@ -942,6 +1027,7 @@ def _health_collector_loop():
                    
                    if not skip_notification:
                        degraded.append({
+                            'cat_key': cat_key,  # Original key for journal capture
                            'category': cat_name,
                            'status': cur_status,
                            'reason': reason,
@@ -956,6 +1042,12 @@ def _health_collector_loop():
                    import socket as _sock
                    hostname = _sock.gethostname()
                
+                # Capture journal context for AI enrichment
+                # Extract category keys and reasons for keyword matching
+                cat_keys = [d.get('cat_key', d.get('category', '').lower()) for d in degraded]
+                all_reasons = ' '.join(d.get('reason', '') for d in degraded)
+                journal_context = _capture_health_journal_context(cat_keys, all_reasons)
+                
                if len(degraded) == 1:
                    d = degraded[0]
                    title = f"{hostname}: Health {d['status']} - {d['category']}"
@@ -977,7 +1069,11 @@ def _health_collector_loop():
                        severity=severity,
                        title=title,
                        message=body,
-                        data={'hostname': hostname, 'count': str(len(degraded))},
+                        data={
+                            'hostname': hostname,
+                            'count': str(len(degraded)),
+                            '_journal_context': journal_context,  # For AI enrichment
+                        },
                        source='health_monitor',
                    )
                except Exception as e:
@@ -175,7 +175,7 @@ class HealthMonitor:
        r'proxmenux-monitor.*failed at step exec',
        r'proxmenux-monitor\.appimage',
        
-        # ── PVE scheduler operational noise ──
+        # ── PVE scheduler operational noise ──
        # pvescheduler emits "could not update job state" every minute
        # when a scheduled job reference is stale.  This is cosmetic,
        # not a system problem.
@@ -2118,7 +2118,7 @@ class HealthMonitor:
                            except Exception:
                                pass
                        
-                        # ── Record disk observation (always, even if transient) ──
+                        # ── Record disk observation (always, even if transient) ──
                        # Signature must be stable across cycles: strip volatile
                        # data (hex values, counts, timestamps) to dedup properly.
                        # e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"
@@ -4580,10 +4580,18 @@ class HealthMonitor:
        Returns None if the module is not available.
        
        Respects storage exclusions: excluded storages are reported as INFO, not CRITICAL.
+        
+        During startup grace period (first 5 minutes after boot):
+        - Storage errors are reported as INFO instead of CRITICAL
+        - No persistent errors are recorded
+        This prevents false positives when NFS/PBS/remote storage is still mounting.
        """
        if not PROXMOX_STORAGE_AVAILABLE:
            return None
        
+        # Check if we're in startup grace period
+        in_grace_period = _is_startup_health_grace()
+        
        try:
            # Reload configuration to ensure we have the latest storage definitions
            proxmox_storage_monitor.reload_configuration()
@@ -4649,19 +4657,21 @@ class HealthMonitor:
                else:
                    reason = f"Storage '{storage_name}' has status: {status_detail}."
                
-                # Record a persistent CRITICAL error for each unavailable storage
-                health_persistence.record_error(
-                    error_key=error_key,
-                    category='storage',
-                    severity='CRITICAL',
-                    reason=reason,
-                    details={
-                        'storage_name': storage_name,
-                        'storage_type': storage.get('type', 'unknown'),
-                        'status_detail': status_detail,
-                        'dismissable': False
-                    }
-                )
+                # During grace period, don't record persistent errors (storage may still be mounting)
+                # After grace period, record as CRITICAL
+                if not in_grace_period:
+                    health_persistence.record_error(
+                        error_key=error_key,
+                        category='storage',
+                        severity='CRITICAL',
+                        reason=reason,
+                        details={
+                            'storage_name': storage_name,
+                            'storage_type': storage.get('type', 'unknown'),
+                            'status_detail': status_detail,
+                            'dismissable': False
+                        }
+                    )
                
                # Add to details dict with dismissable false for frontend
                storage_details[storage_name] = {
@@ -4672,13 +4682,22 @@ class HealthMonitor:
                }
            
            # Build checks from storage_details
+            # During grace period, report as INFO instead of CRITICAL
            checks = {}
            for st_name, st_info in storage_details.items():
-                checks[st_name] = {
-                    'status': 'CRITICAL',
-                    'detail': st_info.get('reason', 'Unavailable'),
-                    'dismissable': False
-                }
+                if in_grace_period:
+                    checks[st_name] = {
+                        'status': 'INFO',
+                        'detail': f"[Startup] {st_info.get('reason', 'Unavailable')} (checking...)",
+                        'dismissable': False,
+                        'grace_period': True
+                    }
+                else:
+                    checks[st_name] = {
+                        'status': 'CRITICAL',
+                        'detail': st_info.get('reason', 'Unavailable'),
+                        'dismissable': False
+                    }
            
            # Add excluded unavailable storages as INFO (not as errors)
            for st in excluded_unavailable:
@@ -4702,12 +4721,22 @@ class HealthMonitor:
            
            # Determine overall status based on non-excluded issues only
            if real_unavailable:
-                return {
-                    'status': 'CRITICAL',
-                    'reason': f'{len(real_unavailable)} Proxmox storage(s) unavailable',
-                    'details': storage_details,
-                    'checks': checks
-                }
+                # During grace period, return INFO instead of CRITICAL
+                if in_grace_period:
+                    return {
+                        'status': 'INFO',
+                        'reason': f'{len(real_unavailable)} storage(s) not yet available (startup)',
+                        'details': storage_details,
+                        'checks': checks,
+                        'grace_period': True
+                    }
+                else:
+                    return {
+                        'status': 'CRITICAL',
+                        'reason': f'{len(real_unavailable)} Proxmox storage(s) unavailable',
+                        'details': storage_details,
+                        'checks': checks
+                    }
            else:
                # Only excluded storages are unavailable - this is OK
                return {
@@ -1093,7 +1093,7 @@ class HealthPersistence:
        conn.commit()
        conn.close()
    
-    # ─── System Capabilities Cache ────────────────────��──────────
+    # ─── System Capabilities Cache ───────────────────────────────
    
    def get_capability(self, cap_key: str) -> Optional[str]:
        """
@@ -79,7 +79,7 @@ class _SharedState:
 _shared_state = _SharedState()


-# ─── Event Object ─────────────────────────────────────────────────
+# ─── Event Object ──────────────��──────────────────────────────────

 class NotificationEvent:
    """Represents a detected event ready for notification dispatch.
@@ -2538,7 +2538,7 @@ class PollingCollector:
        except Exception as e:
            print(f"[PollingCollector] AI model check failed: {e}")
    
-    # ── Persistence helpers ──────────────────────────────��─────
+    # ── Persistence helpers ────────────────────────────────────
    
    def _load_last_notified(self):
        """Load per-error notification timestamps from DB on startup."""
@@ -763,8 +763,10 @@ class NotificationManager:
                ch_title, ch_body = title, body
                
                # ── Per-channel settings ──
+                # Email defaults to 'detailed' (technical report), others to 'standard'
                detail_level_key = f'{ch_name}.ai_detail_level'
-                detail_level = self._config.get(detail_level_key, 'standard')
+                default_detail = 'detailed' if ch_name == 'email' else 'standard'
+                detail_level = self._config.get(detail_level_key, default_detail)
                
                rich_key = f'{ch_name}.rich_format'
                use_rich_format = self._config.get(rich_key, 'false') == 'true'
@@ -1382,241 +1382,146 @@ AI_DETAIL_TOKENS = {
    'detailed': 3000,  # Complete technical reports with all details
 }

-# System prompt template - informative, no recommendations
-AI_SYSTEM_PROMPT = """You are a system notification formatter for ProxMenux Monitor, a Proxmox VE monitoring tool.
+# System prompt template - optimized hybrid version
+AI_SYSTEM_PROMPT = """You are a notification FORMATTER for ProxMenux Monitor (Proxmox VE).
+Your job: translate and reformat alerts into {language}. You are NOT an analyst — do not interpret or diagnose.

-Your task is to translate and lightly reformat incoming server alert messages into {language}.
+═══ WHAT TO TRANSLATE ═══
+Translate: labels, descriptions, status words, units (GB→Go in French, etc.)
+DO NOT translate: hostnames, IPs, paths, VM/CT IDs, device names (/dev/sdX), technical identifiers

-═══ CORE ROLE ═══
-You are a formatter, not an analyst.
-Translate, clean, and present the message clearly.
-Do NOT reinterpret the event, do NOT add meaning, and do NOT rebuild the message from scratch.
-
-═══ ABSOLUTE RULES ═══
-1. Translate BOTH title and body into {language}.
-
-2. Translate human-readable text only.
-   Do NOT translate:
-   - hostnames
-   - device paths (/dev/sdX, /dev/nvmeXnX)
-   - filesystem paths
-   - IDs, VMIDs, CTIDs, UUIDs
-   - timestamps, dates, archive names, PBS paths
-   - version numbers
-   - technical units (B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, %, ms, s)
-
-3. Plain text only.
-   No markdown: no **bold**, no *italic*, no `code`, no headers (#), no markdown lists (- or *).
-   The bullet character "•" is allowed only where explicitly required.
-
-4. Tone: factual, concise, technical.
-   No greetings, no closings, no apologies, no conversational filler.
-
-5. Do NOT add recommendations, action items, remediation, or suggestions.
-
-6. Present ONLY the facts already present in the input.
-   Do NOT invent, assume, explain, soften, or escalate anything.
-
-7. Do NOT change severity or status meaning.
-   For example:
-   - "failed" must stay a failure
-   - "warning" must stay a warning
-   - "degraded" must stay degraded
-
-8. Preserve structure whenever possible.
-   Keep the same fields, lines, and data already present in the input.
-   Do NOT remove important lines such as storage, archive path, totals, durations, target node, reason, or summaries.
-
-9. Reordering must be minimal.
-   Only reorder lines if it clearly improves readability without changing meaning.
-
-10. PLAIN NARRATIVE LINES:
-    If a line is already a complete sentence, translate it as a sentence.
-    Do NOT prepend labels like "Message:", "Note:", or "Details:" unless they already exist in the input.
-
-11. Detail level to apply: {detail_level}
-    - brief    → compact output, keep only essential lines, but never remove critical facts
-    - standard → preserve structure with moderate cleanup
-    - detailed → preserve all available technical details
-
-12. DEDUPLICATION:
-    Remove ONLY exact duplicates or obviously duplicated repeated lines.
-    Do NOT merge distinct facts just because they look similar.
-    Do NOT summarize multiple separate events into one.
-
-13. Keep the "hostname: " prefix in the title.
-    Translate only the descriptive part.
-    Example: "pve01: Updates available" → "pve01: Actualizaciones disponibles"
-
-14. EMPTY VALUES:
-    If a list field is empty, "none", "0", or equivalent, write the translated word for "none".
-    Never leave a declared field blank.
-
-15. UNKNOWN INPUT:
-    If the message format is unfamiliar, preserve it as closely as possible and translate faithfully.
-    Do NOT force it into another template.
-
-═══ PROXMOX CONTEXT ═══
-Silently replace raw Proxmox technical references with the clearer forms below.
-Do NOT explain them. Just use the friendly equivalent directly.
-
-Service / process mappings:
- "pve-container@XXXX.service"  → "Container CT XXXX"
- "qemu-server@XXXX.service"    → "Virtual Machine VM XXXX"
- "pvesr-XXXX"                  → "storage replication job for XXXX"
- "vzdump"                      → "backup process"
- "pveproxy"                    → "Proxmox web proxy"
- "pvedaemon"                   → "Proxmox daemon"
- "pvestatd"                    → "Proxmox statistics service"
- "pvescheduler"                → "Proxmox task scheduler"
- "pve-cluster"                 → "Proxmox cluster service"
- "corosync"                    → "cluster communication service"
- "ceph-osd@N"                  → "Ceph storage disk N"
- "ceph-mon"                    → "Ceph monitor service"
-
-Systemd-style patterns:
- "systemd[1]: pve-container@9000.service: Failed"
-  → "Container CT 9000 service failed"
- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
-  → "Virtual Machine VM 100 failed to start"
- "systemd[1]: Started pve-container@9000.service"
-  → "Container CT 9000 started"
-
-Kernel / storage patterns:
- "ata8.00: exception Emask ..."
-  → "ATA controller error on port 8"
- "blk_update_request: I/O error, dev sdX, sector NNNN"
-  → "I/O error on disk /dev/sdX at sector NNNN"
- "SCSI error: return code = 0x08000002"
-  → "SCSI communication error"
-
-Apply these mappings in titles, field values, and body text when the raw technical string appears.
+═══ CORE RULES ═══
+1. Plain text only — NO markdown, no **bold**, no `code`, no bullet lists (use "• " for packages only)
+2. Preserve severity: "failed" stays "failed", "warning" stays "warning" — never soften errors
+3. Preserve structure: keep same fields and line order, only translate content
+4. Detail level "{detail_level}": brief (2-3 lines) | standard (short paragraph) | detailed (full report)
+5. DEDUPLICATION: merge duplicate facts from multiple sources into one clear statement
+6. EMPTY LISTS: write translated "none" after label, never leave blank
+7. Keep "hostname:" prefix in title — translate only the descriptive part
+8. DO NOT add recommendations or suggestions ("you should...", "try...", "consider...")
+{suggestions_addon}9. Present facts from message AND journal context — describe what happened, do NOT speculate
+10. OUTPUT ONLY the final result — no "Original:", no before/after comparisons
+11. Unknown input: preserve as closely as possible, translate what you can

+═══ PROXMOX MAPPINGS (use directly, never explain) ═══
+pve-container@XXXX → "CT XXXX" | qemu-server@XXXX → "VM XXXX" | vzdump → "backup"
+pveproxy/pvedaemon/pvestatd → "Proxmox service" | corosync → "cluster service"
+"ata8.00: exception Emask..." → "ATA error on port 8"
+"blk_update_request: I/O error, dev sdX" → "I/O error on /dev/sdX"
 {emoji_instructions}
+═══ MESSAGE FORMATS ═══

-═══ MESSAGE-TYPE GUIDANCE ═══
+BACKUP: List each VM/CT with status/size/duration/storage. End with summary.
+  - Partial failure (some OK, some failed) = "Backup partially failed", not "failed"
+  - NEVER collapse multi-VM backup into one line — show each VM separately
+  - ALWAYS include storage path and summary line

-BACKUP (backup_complete / backup_fail / backup_start):
- Preserve per-VM / per-CT detail if present.
- Preserve size, duration, storage/archive path, and final summary if present.
- If both successes and failures are present in the same backup job, use a title equivalent to "Backup partially failed".
- Do NOT collapse multi-guest backup results into a single generic sentence.
+UPDATES: Counts on own lines. Packages use "• " under header. No redundant summary.

-UPDATES (update_summary):
- Keep each count on its own line.
- Keep the important packages block if present.
- Use "• " for package items.
- Do NOT add a redundant summary line repeating totals already shown.
+DISK/SMART: Device + specific error. Deduplicate repeated info.

-PVE UPDATE (pve_update):
- Preserve current version, new version, and package list if present.
- Keep the announcement concise.
+HEALTH: Category + severity + what changed. Duration if resolved.

-DISK / SMART / STORAGE (disk_io_error / storage_unavailable):
- Preserve device, specific error, failing attribute, and counts if present.
- Do NOT repeat the same disk fact twice.
+VM/CT LIFECYCLE: Confirm event with key facts (1-2 lines).

-RESOURCES (cpu_high / ram_high / temp_high / load_high):
- Preserve current value, threshold, and context if present.
-
-SECURITY (auth_fail / ip_block):
- Keep source IP, user, service, jail, and failure count on separate clear lines if present.
-
-VM / CT LIFECYCLE (vm_*, ct_*, migration_*, replication_*):
- Keep name, ID, state, reason, and target node if present.
- Keep lifecycle messages compact unless detail_level is detailed.
-
-CLUSTER / HEALTH:
- Preserve node name, quorum, category, severity, duration, and reason if present.
-
-═══ OUTPUT FORMAT ═══
+═══ OUTPUT FORMAT (CRITICAL - parsers rely on exact structure) ═══
 [TITLE]
-translated title here
+translated title here (NO [TITLE] text in actual title)
 [BODY]
-translated body here
+translated body here (NO [BODY] text in actual body)

-CRITICAL OUTPUT RULES:
- Write [TITLE] on its own line
- Write the title on the next line
- Write [BODY] on its own line
- Write the body starting on the next line
- Do NOT replace these markers with "Title:" or "Body:"
- Do NOT include any extra text before or after the formatted result
- Do NOT add blank lines between [TITLE] and the title
- Do NOT add blank lines between [BODY] and the first body line"""
+CRITICAL RULES:
+- [TITLE] and [BODY] are PARSING MARKERS ONLY — they must NOT appear in your actual content
+- Write [TITLE] on line 1, title text on line 2 (no blank line between)
+- Write [BODY] on line 3, body text starting line 4 (no blank line between)
+- Do NOT write "Title:", "Body:", "[TITLE]", "[BODY]" inside the translated text
+- Do NOT include markers in emojis line: WRONG "🔽[TITLE] server shutdown" → RIGHT "🔽 server shutdown"
+- Output ONLY the formatted result — no explanations, no "Original:", no commentary"""
+
+# Addon for experimental suggestions mode
+AI_SUGGESTIONS_ADDON = """   When journal context shows a clear problem, you MAY add ONE brief tip at the end,
+   prefixed with "Tip:" (translated). Keep tips factual, based only on what logs show.
+"""

 # Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
 AI_EMOJI_INSTRUCTIONS = """
 ═══ EMOJI RULES ═══
-Place ONE emoji at the START of every non-empty line (title and each body line).
-Never skip a line. Never put the emoji at the end.
-A blank line must be completely empty — no emoji, no spaces.
- 
-    TITLE emoji — one per event type:
-    ✅  success / resolved / complete / reconnected
-    ❌  failed / FAILED / error
-    💥  crash / I/O error / hardware fault
-    🆘  new critical health issue
-    📦  backup started / updates available (update_summary)
-    🆕  new PVE version available (pve_update)
-    🔺  escalated / severity increased
-    📋  health digest / persistent issues
-    🚚  migration started
-    🔌  network down / node disconnected
-    🚨  auth failure / security alert
-    🚷  IP banned / blocked
-    🔑  permission change
-    💢  split-brain
-    💣  OOM kill
-    🚀  VM or CT started
-    ⏹️  VM or CT stopped
-    🔽  VM or CT shutdown
-    🔄  restarted / reboot / proxmox updates
-    🔥  high CPU / firewall issue
-    💧  high memory
-    🌡️  high temperature
-    ⚠️  warning / degraded / high load / system problem
-    📉  low disk space
-    🚫  storage unavailable
-    🐢  high latency
-    📸  snapshot created
-    ⏻  system shutdown
-    
-    BODY LINE emoji — one per line based on content:
-    🏷️  VM name / CT name / ID line (first line of VM/CT lifecycle events)
-    ✔️  status ok / success / action confirmed
-    ❌  status error / failed
-    💽  size (individual VM/CT backup)
-    💾  total backup size (summary line only)
-    ⏱️  duration
-    🗄️  storage location / PBS path
-    📦  total updates count
-    🔒  security updates / jail
-    🔄  proxmox updates
-    ⚙️  kernel updates / service name
-    🗂️  important packages header
-    🌐  source IP
-    👤  user
-    📝  reason / details
-    🌡️  temperature
-    🔥  CPU usage
-    💧  memory usage
-    📊  summary line / statistics
-    👥  quorum / cluster nodes
-    💿  disk device
-    📂  filesystem / mount point
-    📌  category / package item (pve_update)
-    🚦  severity
-    🖥️  node name
-    🎯  target node
-    🔹  current version (pve_update)
-    🟢  new version (pve_update)
+Use 1-2 emojis at START of lines where they add clarity. Combine when meaningful (💾✅ backup ok).
+Not every line needs emoji — use them to highlight, not as filler. Blank lines = completely empty.

+TITLE: ✅success ❌failed 💥crash 🆘critical 📦updates 🆕pve-update 🚚migration ⏹️stop 
+       🔽shutdown ⚠️warning 💢split-brain 🔌disconnect 🚨auth-fail 🚷banned 📋digest
+       🚀 = something STARTS (VM/CT start, backup start, server boot, task begin)
+       Combine: 💾🚀backup-start  🖥️🚀system-boot  🚀VM/CT-start

-    BLANK LINES:
-    Insert one blank line only between logical sections inside the body.
-    Do not add a blank line before the first body line or after the last one.
-    """
+BODY:  🏷️VM/CT name ✔️ok ❌error 💽size 💾total ⏱️duration 🗄️storage 📊summary
+       📦updates 🔒security 🔄proxmox ⚙️kernel 🗂️packages 💿disk 📝reason
+       🌐IP 👤user 🌡️temp 🔥CPU 💧RAM 🎯target 🔹current 🟢new 📌item
+
+BLANK LINES: Insert between logical sections (VM entries, before summary, before packages block).
+
+═══ EXAMPLES (follow these formats) ═══
+
+BACKUP START:
+[TITLE]
+💾🚀 pve01: Backup started
+[BODY]
+Backup job starting on storage PBS.
+🏷️ VMs: web01 (100), db (101)
+
+BACKUP COMPLETE:
+[TITLE]
+💾✅ pve01: Backup complete
+[BODY]
+Backup job finished on storage local-bak.
+
+🏷️ VM web01 (ID: 100)
+✔️ Status: ok
+💽 Size: 12.3 GiB
+⏱️ Duration: 00:04:21
+🗄️ Storage: vm/100/2026-03-17T22:00:08Z
+
+📊 Total: 1 backup | 💾 12.3 GiB | ⏱️ 00:04:21
+
+BACKUP PARTIAL FAIL:
+[TITLE]
+💾❌ pve01: Backup partially failed
+[BODY]
+Backup job finished with errors.
+
+🏷️ VM web01 (ID: 100)
+✔️ Status: ok
+💽 Size: 12.3 GiB
+
+🏷️ VM broken (ID: 102)
+❌ Status: error
+
+📊 Total: 2 backups | ❌ 1 failed
+
+UPDATES:
+[TITLE]
+📦 amd: Updates available
+[BODY]
+📦 Total updates: 24
+🔒 Security updates: 6
+🔄 Proxmox updates: 0
+
+🗂️ Important packages:
+• none
+
+VM/CT START:
+[TITLE]
+🚀 pve01: VM arch-linux (100) started
+[BODY]
+🏷️ Virtual machine arch-linux (ID: 100)
+✔️ Now running
+
+HEALTH DEGRADED:
+[TITLE]
+⚠️ amd: Health warning — Disk I/O
+[BODY]
+💿 Device: /dev/sda
+⚠️ 1 sector unreadable (pending)"""


 # No emoji instructions for email/plain text channels
@@ -1721,10 +1626,18 @@ class AIEnhancer:
            # Default prompt: use detail level and emoji settings
            max_tokens = AI_DETAIL_TOKENS.get(detail_level, 200)
            emoji_instructions = AI_EMOJI_INSTRUCTIONS if use_emojis else AI_NO_EMOJI_INSTRUCTIONS
+            
+            # Check if experimental suggestions mode is enabled
+            allow_suggestions = self.config.get('ai_allow_suggestions', 'false')
+            if isinstance(allow_suggestions, str):
+                allow_suggestions = allow_suggestions.lower() == 'true'
+            suggestions_addon = AI_SUGGESTIONS_ADDON if allow_suggestions else ''
+            
            system_prompt = AI_SYSTEM_PROMPT.format(
                language=language_name,
                detail_level=detail_level,
-                emoji_instructions=emoji_instructions
+                emoji_instructions=emoji_instructions,
+                suggestions_addon=suggestions_addon
            )
        
        # Build user message
@@ -120,7 +120,7 @@ class _StartupGraceState:
        with self._lock:
            return time.time() - self._startup_time
    
-    # ─── Shutdown Tracking ────────────────────────────────────────��──────────
+    # ─── Shutdown Tracking ───────────────────────────────────────────────────
    
    def mark_shutdown(self):
        """