187 lines
5.8 KiB
Go
187 lines
5.8 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
)
|
|
|
|
// Minimum dwell times for the anomaly states. UpdateAnomalyState treats a
// state as active until this much time has passed since its condition was
// last observed, which keeps the state from flapping on noisy samples.
const (
	// CriticalCooldown keeps CRITICAL active for at least 60s after the most
	// recent critical sample.
	CriticalCooldown = 60 * time.Second

	// WarningCooldown keeps WARNING active for at least 30s after the most
	// recent warning sample.
	WarningCooldown = 30 * time.Second
)
|
|
|
|
// PSI thresholds, expressed in percent. CheckWarningConditions and
// CheckCriticalConditions compare them (strictly greater-than) against the
// matching Avg10 fields of the sample's PSI data.
const (
	// WARNING tier: mostly "some" pressure, used for early detection of
	// degraded query performance.

	// PSI_IO_Some_Warning catches I/O pressure early (NVMe should be fast).
	PSI_IO_Some_Warning = 15.0

	// PSI_IO_Full_Warning: any full I/O stall is concerning for a database.
	PSI_IO_Full_Warning = 2.0

	// PSI_Memory_Some_Warning flags memory contention beyond the buffer pool.
	PSI_Memory_Some_Warning = 25.0

	// PSI_CPU_Some_Warning flags query concurrency issues.
	PSI_CPU_Some_Warning = 40.0

	// CRITICAL tier: "full" pressure, i.e. all non-idle tasks stalled, which
	// means queries are blocked.

	// PSI_IO_Full_Critical: queries are waiting on disk I/O.
	PSI_IO_Full_Critical = 5.0

	// PSI_Memory_Full_Critical should rarely be reached with a properly sized
	// buffer pool.
	PSI_Memory_Full_Critical = 3.0

	// PSI_CPU_Full_Critical: all queries blocked on CPU (rare).
	PSI_CPU_Full_Critical = 8.0
)
|
|
|
|
// CPU iowait thresholds, in percent. They are compared against the sample's
// CPU.IOwaitPercent value.
const (
	// IOWait_Critical marks a severe I/O bottleneck where queries are delayed.
	IOWait_Critical = 30.0

	// IOWait_Warning is an early sign of the disk becoming the bottleneck.
	IOWait_Warning = 15.0
)
|
|
|
|
// Memory usage thresholds, in percent of total memory. The checks derive the
// percentage as ActualUsedBytes / TotalBytes * 100 before comparing.
const (
	// Memory_Usage_Critical signals risk of the OOM killer and potential
	// buffer pool eviction.
	Memory_Usage_Critical = 98.0

	// Memory_Usage_Warning signals that non-buffer memory is filling up.
	Memory_Usage_Warning = 90.0
)
|
|
|
|
// Swap activity thresholds, in pages per second. Each is applied to both the
// page-in and the page-out rate of a sample.
const (
	// Swap_Rate_Critical indicates heavy thrashing, with queries grinding to
	// a halt.
	Swap_Rate_Critical = 200.0

	// Swap_Rate_Warning is deliberately low: databases should not swap.
	Swap_Rate_Warning = 50.0
)
|
|
|
|
// Disk I/O thresholds, tuned for NVMe SSDs.
const (
	// Disk_QueueDepth_Warning: a queue building up on fast NVMe means
	// saturation. This group defines no critical counterpart for queue depth.
	Disk_QueueDepth_Warning = 16.0

	// Disk_Latency_Warning, in milliseconds: slow for NVMe, affects query
	// performance. Applied to both read and write latency.
	Disk_Latency_Warning = 10.0

	// Disk_Latency_Critical, in milliseconds: very slow for NVMe, a serious
	// bottleneck. Applied to both read and write latency.
	Disk_Latency_Critical = 25.0
)
|
|
|
|
// CheckWarningConditions checks if any WARNING-level thresholds are exceeded
|
|
func (m *Monitor) CheckWarningConditions(sample StatsSample) bool {
|
|
// PSI Warning levels - early I/O pressure detection
|
|
if sample.PSI.IO_SomeAvg10 > PSI_IO_Some_Warning {
|
|
return true
|
|
}
|
|
// Any "full" I/O stall is a warning for databases
|
|
if sample.PSI.IO_FullAvg10 > PSI_IO_Full_Warning {
|
|
fmt.Println("PSI I/O Full Warning triggered")
|
|
return true
|
|
}
|
|
if sample.PSI.Memory_SomeAvg10 > PSI_Memory_Some_Warning {
|
|
return true
|
|
}
|
|
if sample.PSI.CPU_SomeAvg10 > PSI_CPU_Some_Warning {
|
|
return true
|
|
}
|
|
|
|
// CPU IOWait warning - lower threshold for DB workloads
|
|
if sample.CPU.IOwaitPercent > IOWait_Warning {
|
|
return true
|
|
}
|
|
|
|
// Memory usage warning - adjusted for 70% InnoDB buffer pool allocation
|
|
if sample.Memory.TotalBytes > 0 {
|
|
memUsagePercent := float64(sample.Memory.ActualUsedBytes) / float64(sample.Memory.TotalBytes) * 100.0
|
|
if memUsagePercent > Memory_Usage_Warning {
|
|
fmt.Println("Memory usage warning triggered")
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Swap activity warning - databases should NOT swap
|
|
if sample.Swap.PageInPerSecond > Swap_Rate_Warning || sample.Swap.PageOutPerSecond > Swap_Rate_Warning {
|
|
fmt.Println("Swap activity warning triggered")
|
|
return true
|
|
}
|
|
|
|
// Disk latency warning - NVMe specific, lower threshold
|
|
if sample.Disk.ReadLatencyMillis > Disk_Latency_Warning || sample.Disk.WriteLatencyMillis > Disk_Latency_Warning {
|
|
fmt.Println("Disk latency warning triggered")
|
|
return true
|
|
}
|
|
|
|
// Queue depth warning - indicates NVMe saturation
|
|
if sample.Disk.QueueDepth > Disk_QueueDepth_Warning {
|
|
fmt.Println("Disk queue depth warning triggered")
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// CheckCriticalConditions checks if any CRITICAL-level thresholds are exceeded
|
|
func (m *Monitor) CheckCriticalConditions(sample StatsSample) bool {
|
|
// PSI Critical levels - queries are blocked
|
|
if sample.PSI.IO_FullAvg10 > PSI_IO_Full_Critical {
|
|
return true
|
|
}
|
|
if sample.PSI.Memory_FullAvg10 > PSI_Memory_Full_Critical {
|
|
return true
|
|
}
|
|
if sample.PSI.CPU_FullAvg10 > PSI_CPU_Full_Critical {
|
|
return true
|
|
}
|
|
|
|
// CPU IOWait critical - severe disk bottleneck
|
|
if sample.CPU.IOwaitPercent > IOWait_Critical {
|
|
return true
|
|
}
|
|
|
|
// Memory usage critical - risk of OOM affecting buffer pool
|
|
if sample.Memory.TotalBytes > 0 {
|
|
memUsagePercent := float64(sample.Memory.ActualUsedBytes) / float64(sample.Memory.TotalBytes) * 100.0
|
|
if memUsagePercent > Memory_Usage_Critical {
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Heavy swap thrashing - catastrophic for database
|
|
if sample.Swap.PageInPerSecond > Swap_Rate_Critical || sample.Swap.PageOutPerSecond > Swap_Rate_Critical {
|
|
return true
|
|
}
|
|
|
|
// Severe disk latency - NVMe performing like HDD
|
|
if sample.Disk.ReadLatencyMillis > Disk_Latency_Critical || sample.Disk.WriteLatencyMillis > Disk_Latency_Critical {
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// UpdateAnomalyState checks current conditions and updates Monitor flags with cooldown logic
|
|
func (m *Monitor) UpdateAnomalyState(sample StatsSample) {
|
|
now := time.Now()
|
|
|
|
// Check current conditions
|
|
currentlyWarning := m.CheckWarningConditions(sample)
|
|
currentlyCritical := m.CheckCriticalConditions(sample)
|
|
|
|
// Update timestamps if conditions are met
|
|
if currentlyCritical {
|
|
m.lastCriticalTime = now
|
|
}
|
|
if currentlyWarning {
|
|
m.lastWarningTime = now
|
|
}
|
|
|
|
// Apply cooldown logic - stay in state for minimum duration
|
|
criticalActive := now.Sub(m.lastCriticalTime) < CriticalCooldown
|
|
warningActive := now.Sub(m.lastWarningTime) < WarningCooldown
|
|
|
|
defer func() {
|
|
fmt.Printf("Anomaly State Updated - WARNING Active: %v, CRITICAL Active: %v\n", warningActive, criticalActive)
|
|
}()
|
|
|
|
// Set flags based on active states
|
|
// CRITICAL implies both logging AND eBPF collection
|
|
if criticalActive {
|
|
m.enableEventLoggingInDisk = true
|
|
m.enableEbpfEventCollection = true
|
|
return
|
|
}
|
|
|
|
// WARNING implies only eBPF collection (no logging yet)
|
|
if warningActive {
|
|
m.enableEventLoggingInDisk = false
|
|
m.enableEbpfEventCollection = true
|
|
return
|
|
}
|
|
|
|
// Normal state - clear both flags
|
|
m.enableEventLoggingInDisk = false
|
|
m.enableEbpfEventCollection = false
|
|
}
|