jcloude/libs/mariadb_io_monitor/stats_sampler.go
2025-12-23 19:17:16 +08:00

546 lines
13 KiB
Go

package main
import (
"bufio"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
)
// ProcessInfo represents a process with its resource usage
type ProcessInfo struct {
Name string
Usage uint64 // bytes for memory, bytes for disk I/O
}
// StatsSample represents a single snapshot of system statistics
type StatsSample struct {
Timestamp time.Time
CPU struct {
UserPercent float64
SystemPercent float64
IOwaitPercent float64
IdlePercent float64
}
Memory struct {
TotalBytes uint64
CacheBytes uint64
ActualUsedBytes uint64
ReclaimableBytes uint64
NonReclaimableBytes uint64
}
Swap struct {
UsedBytes uint64
PageInPerSecond float64
PageOutPerSecond float64
}
Disk struct {
ReadIOPS float64
WriteIOPS float64
ReadThroughputBytes float64
WriteThroughputBytes float64
ReadLatencyMillis float64
WriteLatencyMillis float64
QueueDepth float64
}
PSI struct {
IO_SomeAvg10 float64
IO_FullAvg10 float64
Memory_SomeAvg10 float64
Memory_FullAvg10 float64
CPU_SomeAvg10 float64
CPU_FullAvg10 float64
}
TopMemoryProcesses []ProcessInfo // Top 5 memory consuming processes
TopCPUProcesses []ProcessInfo // Top 5 CPU consuming processes (Usage = percentage * 100)
}
// diskAgg aggregates disk statistics across all devices
type diskAgg struct {
readIOs, readSectors, readTicks uint64
writeIOs, writeSectors, writeTicks uint64
ioTicks uint64
}
// Sampler collects and stores system statistics samples
type Sampler struct {
buf *CircularBuffer[StatsSample]
lastCPU struct {
user, system, iowait, idle, total uint64
valid bool
}
lastDisk struct {
diskAgg
valid bool
}
lastSwapIn uint64
lastSwapOut uint64
lastSwapSet bool
}
// Start begins periodic collection of system statistics
func (m *Monitor) StartStatsSampler() {
tick := time.NewTicker(SampleInterval)
m.wg.Add(1)
defer m.wg.Done()
defer tick.Stop()
for {
select {
case <-m.ctx.Done():
fmt.Println("Sampler stopping:", m.ctx.Err())
return
case <-tick.C:
m.sampler.collect()
}
}
}
func (s *Sampler) collect() {
now := time.Now()
// CPU
u, sys, io, idle, tot, _ := parseCPU()
var userPct, sysPct, ioPct, idlePct float64
if s.lastCPU.valid && tot > s.lastCPU.total {
td := float64(tot - s.lastCPU.total)
userPct = 100 * float64(u-s.lastCPU.user) / td
sysPct = 100 * float64(sys-s.lastCPU.system) / td
ioPct = 100 * float64(io-s.lastCPU.iowait) / td
idlePct = 100 * float64(idle-s.lastCPU.idle) / td
}
s.lastCPU.user = u
s.lastCPU.system = sys
s.lastCPU.iowait = io
s.lastCPU.idle = idle
s.lastCPU.total = tot
s.lastCPU.valid = true
// Memory
mem, _ := parseMemInfo()
total := mem["MemTotal"]
cache := mem["Cached"] + mem["Buffers"]
reclaimable := mem["SReclaimable"]
slab := mem["Slab"]
nonReclaimable := uint64(0)
if slab > reclaimable {
nonReclaimable = slab - reclaimable
}
actualUsed := total - mem["MemFree"] - cache - reclaimable
// Swap
swpIn, swpOut, _ := parseVMStat()
swapUsed := mem["SwapTotal"] - mem["SwapFree"]
var swapInPS, swapOutPS float64
if s.lastSwapSet {
swapInPS = float64(swpIn-s.lastSwapIn) / SampleInterval.Seconds()
swapOutPS = float64(swpOut-s.lastSwapOut) / SampleInterval.Seconds()
}
s.lastSwapIn = swpIn
s.lastSwapOut = swpOut
s.lastSwapSet = true
// Disk
disk, _ := parseDiskstats()
var readIOPS, writeIOPS, readBps, writeBps, readLat, writeLat, qdepth float64
if s.lastDisk.valid {
dt := SampleInterval.Seconds()
readDelta := disk.readIOs - s.lastDisk.readIOs
writeDelta := disk.writeIOs - s.lastDisk.writeIOs
readIOPS = float64(readDelta) / dt
writeIOPS = float64(writeDelta) / dt
readBps = float64(disk.readSectors-s.lastDisk.readSectors) * SectorSize / dt
writeBps = float64(disk.writeSectors-s.lastDisk.writeSectors) * SectorSize / dt
if readDelta > 0 {
readLat = float64(disk.readTicks-s.lastDisk.readTicks) / float64(readDelta)
}
if writeDelta > 0 {
writeLat = float64(disk.writeTicks-s.lastDisk.writeTicks) / float64(writeDelta)
}
ioTicks := disk.ioTicks - s.lastDisk.ioTicks
if ioTicks > 0 {
qdepth = (float64(ioTicks) / 1000.0) * (readIOPS + writeIOPS)
}
}
s.lastDisk.diskAgg = disk
s.lastDisk.valid = true
// PSI
ioSome, ioFull, _ := parsePSI("io")
memSome, memFull, _ := parsePSI("memory")
cpuSome, cpuFull, _ := parsePSI("cpu")
// Build sample
sample := StatsSample{Timestamp: now}
sample.CPU.UserPercent = userPct
sample.CPU.SystemPercent = sysPct
sample.CPU.IOwaitPercent = ioPct
sample.CPU.IdlePercent = idlePct
sample.Memory.TotalBytes = total
sample.Memory.CacheBytes = cache
sample.Memory.ActualUsedBytes = actualUsed
sample.Memory.ReclaimableBytes = reclaimable
sample.Memory.NonReclaimableBytes = nonReclaimable
sample.Swap.UsedBytes = swapUsed
sample.Swap.PageInPerSecond = swapInPS
sample.Swap.PageOutPerSecond = swapOutPS
sample.Disk.ReadIOPS = readIOPS
sample.Disk.WriteIOPS = writeIOPS
sample.Disk.ReadThroughputBytes = readBps
sample.Disk.WriteThroughputBytes = writeBps
sample.Disk.ReadLatencyMillis = readLat
sample.Disk.WriteLatencyMillis = writeLat
sample.Disk.QueueDepth = qdepth
sample.PSI.IO_SomeAvg10 = ioSome
sample.PSI.IO_FullAvg10 = ioFull
sample.PSI.Memory_SomeAvg10 = memSome
sample.PSI.Memory_FullAvg10 = memFull
sample.PSI.CPU_SomeAvg10 = cpuSome
sample.PSI.CPU_FullAvg10 = cpuFull
// Top Processes
sample.TopMemoryProcesses = getTopMemoryProcesses()
sample.TopCPUProcesses = getTopCPUProcesses()
s.buf.Add(sample)
}
// parseCPU reads CPU statistics from /proc/stat
func parseCPU() (user, system, iowait, idle, total uint64, err error) {
f, err := os.Open("/proc/stat")
if err != nil {
return
}
defer f.Close()
sc := bufio.NewScanner(f)
if sc.Scan() {
fields := strings.Fields(sc.Text())
if len(fields) < 6 || fields[0] != "cpu" {
err = fmt.Errorf("unexpected /proc/stat format")
return
}
parse := func(i int) uint64 {
if i >= len(fields) {
return 0
}
v, _ := strconv.ParseUint(fields[i], 10, 64)
return v
}
user = parse(1)
system = parse(3)
idle = parse(4)
iowait = parse(5)
for i := 1; i < len(fields); i++ {
v, _ := strconv.ParseUint(fields[i], 10, 64)
total += v
}
}
return
}
// parseMemInfo reads memory statistics from /proc/meminfo
func parseMemInfo() (map[string]uint64, error) {
f, err := os.Open("/proc/meminfo")
if err != nil {
return nil, err
}
defer f.Close()
m := make(map[string]uint64)
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.Fields(sc.Text())
if len(parts) < 2 {
continue
}
key := strings.TrimSuffix(parts[0], ":")
val, _ := strconv.ParseUint(parts[1], 10, 64)
m[key] = val * 1024 // convert kB → bytes
}
return m, sc.Err()
}
// parseVMStat reads swap statistics from /proc/vmstat
func parseVMStat() (pageIn, pageOut uint64, err error) {
f, err := os.Open("/proc/vmstat")
if err != nil {
return
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.Fields(sc.Text())
if len(parts) != 2 {
continue
}
switch parts[0] {
case "pswpin":
pageIn, _ = strconv.ParseUint(parts[1], 10, 64)
case "pswpout":
pageOut, _ = strconv.ParseUint(parts[1], 10, 64)
}
}
return
}
// parseDiskstats reads disk I/O statistics from /proc/diskstats
func parseDiskstats() (diskAgg, error) {
var agg diskAgg
f, err := os.Open("/proc/diskstats")
if err != nil {
return agg, err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
fields := strings.Fields(sc.Text())
if len(fields) < 14 {
continue
}
name := fields[2]
// Skip loop and ram devices
if strings.HasPrefix(name, "loop") || strings.HasPrefix(name, "ram") {
continue
}
readIOs, _ := strconv.ParseUint(fields[3], 10, 64)
readSectors, _ := strconv.ParseUint(fields[5], 10, 64)
readTicks, _ := strconv.ParseUint(fields[6], 10, 64)
writeIOs, _ := strconv.ParseUint(fields[7], 10, 64)
writeSectors, _ := strconv.ParseUint(fields[9], 10, 64)
writeTicks, _ := strconv.ParseUint(fields[10], 10, 64)
ioTicks, _ := strconv.ParseUint(fields[12], 10, 64)
agg.readIOs += readIOs
agg.readSectors += readSectors
agg.readTicks += readTicks
agg.writeIOs += writeIOs
agg.writeSectors += writeSectors
agg.writeTicks += writeTicks
agg.ioTicks += ioTicks
}
return agg, sc.Err()
}
// parsePSI reads pressure stall information for a given resource
func parsePSI(resource string) (someAvg10, fullAvg10 float64, err error) {
path := filepath.Join("/proc/pressure", resource)
f, err := os.Open(path)
if err != nil {
return 0, 0, fmt.Errorf("missing PSI: %s", resource)
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := sc.Text()
fields := strings.Fields(line)
if len(fields) < 2 {
continue
}
switch fields[0] {
case "some":
for _, f := range fields[1:] {
if strings.HasPrefix(f, "avg10=") {
someAvg10, _ = strconv.ParseFloat(strings.TrimPrefix(f, "avg10="), 64)
}
}
case "full":
for _, f := range fields[1:] {
if strings.HasPrefix(f, "avg10=") {
fullAvg10, _ = strconv.ParseFloat(strings.TrimPrefix(f, "avg10="), 64)
}
}
}
}
return
}
// getTopMemoryProcesses returns the top 5 memory consuming processes
func getTopMemoryProcesses() []ProcessInfo {
procDir, err := os.Open("/proc")
if err != nil {
return nil
}
defer procDir.Close()
entries, err := procDir.Readdirnames(-1)
if err != nil {
return nil
}
var processes []ProcessInfo
for _, entry := range entries {
// Check if entry is a PID (numeric)
if _, err := strconv.Atoi(entry); err != nil {
continue
}
// Read process name
commPath := filepath.Join("/proc", entry, "comm")
commData, err := os.ReadFile(commPath)
if err != nil {
continue
}
name := strings.TrimSpace(string(commData))
// Read memory usage from status
statusPath := filepath.Join("/proc", entry, "status")
statusFile, err := os.Open(statusPath)
if err != nil {
continue
}
var rssKB uint64
scanner := bufio.NewScanner(statusFile)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "VmRSS:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
rssKB, _ = strconv.ParseUint(fields[1], 10, 64)
break
}
}
}
statusFile.Close()
if rssKB > 0 {
processes = append(processes, ProcessInfo{
Name: name,
Usage: rssKB * 1024, // Convert kB to bytes
})
}
}
// Sort by usage (descending)
sort.Slice(processes, func(i, j int) bool {
return processes[i].Usage > processes[j].Usage
})
// Return top 5
if len(processes) > 5 {
processes = processes[:5]
}
return processes
}
// getTopCPUProcesses returns the top 5 CPU consuming processes
func getTopCPUProcesses() []ProcessInfo {
procDir, err := os.Open("/proc")
if err != nil {
return nil
}
defer procDir.Close()
entries, err := procDir.Readdirnames(-1)
if err != nil {
return nil
}
// Get system uptime for CPU calculation
uptimeData, err := os.ReadFile("/proc/uptime")
if err != nil {
return nil
}
uptimeFields := strings.Fields(string(uptimeData))
if len(uptimeFields) < 1 {
return nil
}
uptime, _ := strconv.ParseFloat(uptimeFields[0], 64)
var processes []ProcessInfo
for _, entry := range entries {
// Check if entry is a PID (numeric)
if _, err := strconv.Atoi(entry); err != nil {
continue
}
// Read process name
commPath := filepath.Join("/proc", entry, "comm")
commData, err := os.ReadFile(commPath)
if err != nil {
continue
}
name := strings.TrimSpace(string(commData))
// Read CPU times from /proc/[pid]/stat
statPath := filepath.Join("/proc", entry, "stat")
statData, err := os.ReadFile(statPath)
if err != nil {
continue
}
// Parse stat file
statStr := string(statData)
// Find the last ')' to handle process names with spaces/parentheses
lastParen := strings.LastIndex(statStr, ")")
if lastParen == -1 {
continue
}
fields := strings.Fields(statStr[lastParen+1:])
if len(fields) < 20 {
continue
}
// Fields after process name: state utime stime cutime cstime...
// utime is at index 11, stime at 12, starttime at 19 (in original stat)
// After removing name part, utime is at index 11, stime at 12, starttime at 19
utime, _ := strconv.ParseUint(fields[11], 10, 64)
stime, _ := strconv.ParseUint(fields[12], 10, 64)
starttime, _ := strconv.ParseUint(fields[19], 10, 64)
// Calculate CPU percentage
// Total CPU time in clock ticks
totalTime := float64(utime + stime)
// Process uptime in seconds
clockTicks := 100.0 // sysconf(_SC_CLK_TCK) is typically 100
processUptime := uptime - (float64(starttime) / clockTicks)
if processUptime <= 0 {
continue
}
// CPU percentage = (total_time / clock_ticks / process_uptime) * 100 * num_cpu
cpuPercent := (totalTime / clockTicks / processUptime) * 100.0
// Store as integer (percentage * 100 to preserve precision)
cpuUsage := uint64(cpuPercent * 100)
if cpuUsage > 0 {
processes = append(processes, ProcessInfo{
Name: name,
Usage: cpuUsage,
})
}
}
// Sort by usage (descending)
sort.Slice(processes, func(i, j int) bool {
return processes[i].Usage > processes[j].Usage
})
// Return top 5
if len(processes) > 5 {
processes = processes[:5]
}
return processes
}