// jcloude/libs/mariadb_io_monitor/anomaly_detector.go

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"
)
// Cooldown periods - prevent rapid state flapping
const (
	WarningCooldown  = 30 * time.Second // Stay in WARNING for at least 30s
	CriticalCooldown = 60 * time.Second // Stay in CRITICAL for at least 60s
)
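// For example, a single CRITICAL sample at t=0s keeps the CRITICAL state
// active until t=60s even if every later sample is clean, and a fresh
// CRITICAL sample at t=45s restarts the window (see UpdateAnomalyState
// below). This hysteresis trades responsiveness for stable flag
// transitions.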
// PSI (Pressure Stall Information) Thresholds
const (
	// CRITICAL: Full PSI - all non-idle tasks stalled (queries blocked)
	PSI_IO_Full_Critical     = 5.0 // % - queries waiting on disk I/O
	PSI_Memory_Full_Critical = 3.0 // % - should rarely happen with proper buffer pool
	PSI_CPU_Full_Critical    = 8.0 // % - all queries blocked on CPU (rare)

	// WARNING: Some PSI - early pressure detection for query performance
	PSI_IO_Some_Warning     = 15.0 // % - catch I/O pressure early (NVMe should be fast)
	PSI_IO_Full_Warning     = 2.0  // % - any full I/O stall is concerning for DB
	PSI_Memory_Some_Warning = 25.0 // % - memory contention beyond buffer pool
	PSI_CPU_Some_Warning    = 40.0 // % - query concurrency issues
)
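// The avg10 values consumed below are assumed to come from the kernel's
// PSI files (/proc/pressure/io, /proc/pressure/memory, /proc/pressure/cpu,
// Linux 4.20+), which look like:
//
//	some avg10=1.23 avg60=0.98 avg300=0.50 total=123456
//	full avg10=0.12 avg60=0.08 avg300=0.05 total=12345
//
// readPSIAvg10 is an illustrative sketch of extracting the avg10 pair;
// the monitor's real collector lives elsewhere and may differ.
func readPSIAvg10(path string) (someAvg10, fullAvg10 float64, err error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, 0, err
	}
	for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue // malformed or missing line (e.g. no "full" on old kernels)
		}
		// fields[0] is "some" or "full"; fields[1] is "avg10=<percent>"
		val, perr := strconv.ParseFloat(strings.TrimPrefix(fields[1], "avg10="), 64)
		if perr != nil {
			continue
		}
		switch fields[0] {
		case "some":
			someAvg10 = val
		case "full":
			fullAvg10 = val
		}
	}
	return someAvg10, fullAvg10, nil
}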
// CPU Thresholds
const (
	IOWait_Warning  = 15.0 // % - early sign of disk becoming bottleneck
	IOWait_Critical = 30.0 // % - severe I/O bottleneck, queries delayed
)
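// IOwaitPercent is assumed to be derived from deltas of the aggregate
// "cpu" line in /proc/stat between two samples. A minimal sketch of that
// computation, using a hypothetical cpuTimes type (not part of the
// original monitor):
type cpuTimes [8]uint64 // user, nice, system, idle, iowait, irq, softirq, steal

func iowaitPercent(prev, curr cpuTimes) float64 {
	var deltaTotal uint64
	for i := range curr {
		deltaTotal += curr[i] - prev[i]
	}
	if deltaTotal == 0 {
		return 0 // no elapsed jiffies between samples
	}
	deltaIOWait := curr[4] - prev[4] // index 4 = iowait jiffies
	return float64(deltaIOWait) / float64(deltaTotal) * 100.0
}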
// Memory Thresholds
const (
	Memory_Usage_Warning  = 90.0 // % - non-buffer memory filling up
	Memory_Usage_Critical = 98.0 // % - risk of OOM killer, potential buffer pool eviction
)
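// ActualUsedBytes is assumed to exclude reclaimable page cache (e.g.
// MemTotal - MemAvailable from /proc/meminfo); on a DB host where most
// RAM sits in the InnoDB buffer pool, counting cache as "used" would
// push these thresholds to fire constantly.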
// Swap Activity Thresholds
const (
	Swap_Rate_Warning  = 50.0  // pages/sec - LOW threshold, databases should not swap
	Swap_Rate_Critical = 200.0 // pages/sec - heavy thrashing, queries grinding to a halt
)
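// The page rates above are assumed to be deltas of the cumulative
// pswpin/pswpout counters in /proc/vmstat divided by the sample
// interval. A sketch of that derivation (hypothetical helper):
func swapPagesPerSecond(prevPages, currPages uint64, interval time.Duration) float64 {
	if interval <= 0 || currPages < prevPages {
		return 0 // no elapsed time, or counter reset
	}
	return float64(currPages-prevPages) / interval.Seconds()
}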
// Disk I/O Thresholds - NVMe SSD specific
const (
	Disk_Latency_Warning    = 10.0 // milliseconds - slow for NVMe, affects query performance
	Disk_Latency_Critical   = 25.0 // milliseconds - very slow for NVMe, serious bottleneck
	Disk_QueueDepth_Warning = 16.0 // Queue building up on fast NVMe = saturation
)
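// The latency figures are assumed to be iostat-style averages from
// /proc/diskstats deltas: time spent on an operation class divided by
// the operations completed in the interval. QueueDepth would similarly
// be the weighted-time-in-queue delta over elapsed wall time (iostat's
// aqu-sz). Hypothetical helper sketching the latency arithmetic:
func avgLatencyMillis(deltaTimeMillis, deltaOps uint64) float64 {
	if deltaOps == 0 {
		return 0 // no completed I/Os in the interval
	}
	return float64(deltaTimeMillis) / float64(deltaOps)
}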
// CheckWarningConditions checks if any WARNING-level thresholds are exceeded
func (m *Monitor) CheckWarningConditions(sample StatsSample) bool {
	// PSI Warning levels - early I/O pressure detection
	if sample.PSI.IO_SomeAvg10 > PSI_IO_Some_Warning {
		fmt.Println("PSI I/O Some Warning triggered")
		return true
	}
	// Any "full" I/O stall is a warning for databases
	if sample.PSI.IO_FullAvg10 > PSI_IO_Full_Warning {
		fmt.Println("PSI I/O Full Warning triggered")
		return true
	}
	if sample.PSI.Memory_SomeAvg10 > PSI_Memory_Some_Warning {
		fmt.Println("PSI memory warning triggered")
		return true
	}
	if sample.PSI.CPU_SomeAvg10 > PSI_CPU_Some_Warning {
		fmt.Println("PSI CPU warning triggered")
		return true
	}
	// CPU IOWait warning - lower threshold for DB workloads
	if sample.CPU.IOwaitPercent > IOWait_Warning {
		fmt.Println("CPU iowait warning triggered")
		return true
	}
	// Memory usage warning - adjusted for 70% InnoDB buffer pool allocation
	if sample.Memory.TotalBytes > 0 {
		memUsagePercent := float64(sample.Memory.ActualUsedBytes) / float64(sample.Memory.TotalBytes) * 100.0
		if memUsagePercent > Memory_Usage_Warning {
			fmt.Println("Memory usage warning triggered")
			return true
		}
	}
	// Swap activity warning - databases should NOT swap
	if sample.Swap.PageInPerSecond > Swap_Rate_Warning || sample.Swap.PageOutPerSecond > Swap_Rate_Warning {
		fmt.Println("Swap activity warning triggered")
		return true
	}
	// Disk latency warning - NVMe specific, lower threshold
	if sample.Disk.ReadLatencyMillis > Disk_Latency_Warning || sample.Disk.WriteLatencyMillis > Disk_Latency_Warning {
		fmt.Println("Disk latency warning triggered")
		return true
	}
	// Queue depth warning - indicates NVMe saturation
	if sample.Disk.QueueDepth > Disk_QueueDepth_Warning {
		fmt.Println("Disk queue depth warning triggered")
		return true
	}
	return false
}
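// Illustrative usage (the StatsSample definition lives elsewhere in this
// package; values here are chosen to cross the thresholds above):
//
//	var s StatsSample
//	s.PSI.IO_SomeAvg10 = 20.0   // > PSI_IO_Some_Warning (15.0)
//	m.CheckWarningConditions(s) // true
//	s.PSI.IO_FullAvg10 = 6.0     // > PSI_IO_Full_Critical (5.0)
//	m.CheckCriticalConditions(s) // true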
// CheckCriticalConditions checks if any CRITICAL-level thresholds are exceeded
func (m *Monitor) CheckCriticalConditions(sample StatsSample) bool {
	// PSI Critical levels - queries are blocked
	if sample.PSI.IO_FullAvg10 > PSI_IO_Full_Critical {
		return true
	}
	if sample.PSI.Memory_FullAvg10 > PSI_Memory_Full_Critical {
		return true
	}
	if sample.PSI.CPU_FullAvg10 > PSI_CPU_Full_Critical {
		return true
	}
	// CPU IOWait critical - severe disk bottleneck
	if sample.CPU.IOwaitPercent > IOWait_Critical {
		return true
	}
	// Memory usage critical - risk of OOM affecting buffer pool
	if sample.Memory.TotalBytes > 0 {
		memUsagePercent := float64(sample.Memory.ActualUsedBytes) / float64(sample.Memory.TotalBytes) * 100.0
		if memUsagePercent > Memory_Usage_Critical {
			return true
		}
	}
	// Heavy swap thrashing - catastrophic for database
	if sample.Swap.PageInPerSecond > Swap_Rate_Critical || sample.Swap.PageOutPerSecond > Swap_Rate_Critical {
		return true
	}
	// Severe disk latency - NVMe performing like HDD
	if sample.Disk.ReadLatencyMillis > Disk_Latency_Critical || sample.Disk.WriteLatencyMillis > Disk_Latency_Critical {
		return true
	}
	return false
}
// UpdateAnomalyState checks current conditions and updates Monitor flags with cooldown logic
func (m *Monitor) UpdateAnomalyState(sample StatsSample) {
	now := time.Now()
	// Check current conditions
	currentlyWarning := m.CheckWarningConditions(sample)
	currentlyCritical := m.CheckCriticalConditions(sample)
	// Update timestamps if conditions are met
	if currentlyCritical {
		m.lastCriticalTime = now
	}
	if currentlyWarning {
		m.lastWarningTime = now
	}
	// Apply cooldown logic - stay in state for minimum duration
	criticalActive := now.Sub(m.lastCriticalTime) < CriticalCooldown
	warningActive := now.Sub(m.lastWarningTime) < WarningCooldown
	defer func() {
		fmt.Printf("Anomaly State Updated - WARNING Active: %v, CRITICAL Active: %v\n", warningActive, criticalActive)
	}()
	// Set flags based on active states
	// CRITICAL implies both logging AND eBPF collection
	if criticalActive {
		m.enableEventLoggingInDisk = true
		m.enableEbpfEventCollection = true
		return
	}
	// WARNING implies only eBPF collection (no logging yet)
	if warningActive {
		m.enableEventLoggingInDisk = false
		m.enableEbpfEventCollection = true
		return
	}
	// Normal state - clear both flags
	m.enableEventLoggingInDisk = false
	m.enableEbpfEventCollection = false
}
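// Illustrative wiring (hypothetical; the real sampling loop lives
// elsewhere in the monitor): collect a StatsSample on a fixed interval
// and let UpdateAnomalyState drive the logging/eBPF flags.
//
//	ticker := time.NewTicker(2 * time.Second)
//	defer ticker.Stop()
//	for range ticker.C {
//		sample := collectStatsSample() // hypothetical collector
//		m.UpdateAnomalyState(sample)
//	}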