checkpointing, heavy dev

This commit is contained in:
2025-07-24 14:32:50 +02:00
parent a3bc63d2d9
commit c2040a5c08
89 changed files with 741883 additions and 477 deletions

View File

@@ -1,11 +1,17 @@
// Package statcollector provides system information collection
package statcollector
import (
"log/slog"
"os"
"os/exec"
"regexp"
"strconv"
"strings"
"time"
"git.eeqj.de/sneak/hdmistat/internal/netmon"
"github.com/dustin/go-humanize"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/host"
@@ -14,6 +20,17 @@ import (
"github.com/shirou/gopsutil/v3/process"
)
const (
// Process collection constants
// maxProcesses caps how many processes Collect records before it stops
// iterating the process list.
maxProcesses = 100
// processTimeout is how long Collect waits for a single process's
// CPUPercent sample before skipping that process.
processTimeout = 50 * time.Millisecond
// processStableTime is the minimum process age; processes created more
// recently than this are skipped by Collect.
processStableTime = 100 * time.Millisecond
// msToSecondsDivisor divides a process CreateTime value to obtain whole
// seconds for time.Unix (presumably CreateTime is in milliseconds since
// the epoch, per the constant's name — confirm against gopsutil).
msToSecondsDivisor = 1000
// Network constants
// bitsPerMegabit converts the Mb/s figure parsed from ethtool output into
// bits per second (decimal megabits: 1000 * 1000).
bitsPerMegabit = 1000 * 1000
)
// SystemInfo represents overall system information
type SystemInfo struct {
Hostname string
@@ -40,13 +57,23 @@ type DiskInfo struct {
// NetworkInfo represents network interface information
type NetworkInfo struct {
Name string
IPAddresses []string
LinkSpeed uint64
BytesSent uint64
BytesRecv uint64
PacketsSent uint64
PacketsRecv uint64
Name string
IPAddresses []string
LinkSpeed uint64
BytesSent uint64
BytesRecv uint64
BitsSentRate uint64 // bits per second
BitsRecvRate uint64 // bits per second
}
// FormatSentRate renders the interface's transmit rate (BitsSentRate, in
// bits per second) as an SI-prefixed string carrying the unit "bit/s".
func (n *NetworkInfo) FormatSentRate() string {
	rate := float64(n.BitsSentRate)
	return humanize.SI(rate, "bit/s")
}
// FormatRecvRate renders the interface's receive rate (BitsRecvRate, in
// bits per second) as an SI-prefixed string carrying the unit "bit/s".
func (n *NetworkInfo) FormatRecvRate() string {
	rate := float64(n.BitsRecvRate)
	return humanize.SI(rate, "bit/s")
}
// ProcessInfo represents process information
@@ -67,15 +94,25 @@ type Collector interface {
// SystemCollector implements Collector
type SystemCollector struct {
logger *slog.Logger
lastNetStats map[string]psnet.IOCountersStat
netMonitor *netmon.Monitor
lastCollectTime time.Time
}
// NewSystemCollector creates a new system collector
func NewSystemCollector(logger *slog.Logger) *SystemCollector {
nm := netmon.New(logger)
nm.Start()
return &SystemCollector{
logger: logger,
lastNetStats: make(map[string]psnet.IOCountersStat),
logger: logger,
netMonitor: nm,
}
}
// Stop shuts down the collector's network monitor, if one is attached.
// It does nothing when netMonitor is nil.
func (c *SystemCollector) Stop() {
	if c.netMonitor == nil {
		return
	}
	c.netMonitor.Stop()
}
@@ -100,7 +137,13 @@ func (c *SystemCollector) Collect() (*SystemInfo, error) {
if err != nil {
c.logger.Warn("getting uptime", "error", err)
} else {
info.Uptime = time.Duration(uptimeSecs) * time.Second
if uptimeSecs > 0 {
// Convert uint64 to int64 safely to avoid overflow
maxInt64 := ^uint64(0) >> 1
if uptimeSecs <= maxInt64 {
info.Uptime = time.Duration(int64(uptimeSecs)) * time.Second
}
}
}
// Memory
@@ -160,37 +203,52 @@ func (c *SystemCollector) Collect() (*SystemInfo, error) {
}
}
// Network
// Network - get rates from network monitor
netStats := c.netMonitor.GetStats()
// Also get interface details for IP addresses
interfaces, err := psnet.Interfaces()
if err != nil {
c.logger.Warn("getting network interfaces", "error", err)
} else {
ioCounters, _ := psnet.IOCounters(true)
ioMap := make(map[string]psnet.IOCountersStat)
for _, counter := range ioCounters {
ioMap[counter.Name] = counter
}
// Create a map of interface names to IPs and link speeds
ifaceIPs := make(map[string][]string)
ifaceSpeeds := make(map[string]uint64)
for _, iface := range interfaces {
if iface.Name == "lo" || strings.HasPrefix(iface.Name, "docker") {
continue
}
netInfo := NetworkInfo{
Name: iface.Name,
}
// Get IP addresses
var ips []string
for _, addr := range iface.Addrs {
netInfo.IPAddresses = append(netInfo.IPAddresses, addr.Addr)
ips = append(ips, addr.Addr)
}
ifaceIPs[iface.Name] = ips
// Try to get link speed with ethtool
if speed := c.getLinkSpeed(iface.Name); speed > 0 {
ifaceSpeeds[iface.Name] = speed
}
}
// Combine network monitor stats with interface details
for _, stat := range netStats {
netInfo := NetworkInfo{
Name: stat.Name,
BytesSent: stat.BytesSent,
BytesRecv: stat.BytesRecv,
BitsSentRate: stat.BitsSentRate,
BitsRecvRate: stat.BitsRecvRate,
}
// Get stats
if stats, ok := ioMap[iface.Name]; ok {
netInfo.BytesSent = stats.BytesSent
netInfo.BytesRecv = stats.BytesRecv
netInfo.PacketsSent = stats.PacketsSent
netInfo.PacketsRecv = stats.PacketsRecv
// Add IP addresses if available
if ips, ok := ifaceIPs[stat.Name]; ok {
netInfo.IPAddresses = ips
}
// Add link speed if available
if speed, ok := ifaceSpeeds[stat.Name]; ok {
netInfo.LinkSpeed = speed
}
info.Network = append(info.Network, netInfo)
@@ -202,9 +260,43 @@ func (c *SystemCollector) Collect() (*SystemInfo, error) {
if err != nil {
c.logger.Warn("getting processes", "error", err)
} else {
// Limit to top processes to avoid hanging
processCount := 0
for _, p := range processes {
name, _ := p.Name()
cpuPercent, _ := p.CPUPercent()
if processCount >= maxProcesses {
break
}
// Skip kernel threads and very short-lived processes
name, err := p.Name()
if err != nil || name == "" {
continue
}
// Use CreateTime to skip very new processes that might not have stable stats
createTime, err := p.CreateTime()
if err != nil || time.Since(time.Unix(createTime/msToSecondsDivisor, 0)) < processStableTime {
continue
}
// Get CPU percent with timeout - this is the call that can hang
cpuPercent := 0.0
cpuChan := make(chan float64, 1)
go func() {
cpu, _ := p.CPUPercent()
cpuChan <- cpu
}()
select {
case cpu := <-cpuChan:
cpuPercent = cpu
case <-time.After(processTimeout):
// Skip this process if CPU sampling takes too long
c.logger.Debug("skipping process due to CPU timeout", "pid", p.Pid, "name", name)
continue
}
memInfo, _ := p.MemoryInfo()
username, _ := p.Username()
@@ -216,9 +308,36 @@ func (c *SystemCollector) Collect() (*SystemInfo, error) {
MemoryVMS: memInfo.VMS,
Username: username,
})
processCount++
}
}
c.lastCollectTime = time.Now()
return info, nil
}
// ethtoolSpeedRegex matches the link-speed line of ethtool output, e.g.
// "Speed: 1000Mb/s" or "Speed: 10000Mb/s", capturing the numeric Mb/s value.
// It is compiled once at package scope because getLinkSpeed runs for each
// interface on every collection pass.
var ethtoolSpeedRegex = regexp.MustCompile(`Speed:\s+(\d+)Mb/s`)

// getLinkSpeed returns the link speed of ifaceName in bits per second, as
// reported by the external ethtool command. It returns 0 when ethtool cannot
// be run or exits with an error, or when its output contains no parseable
// speed (for example "Speed: Unknown!").
//
// NOTE(review): ethtool is executed without a timeout, so a hung ethtool
// would block the caller; consider exec.CommandContext with a deadline.
func (c *SystemCollector) getLinkSpeed(ifaceName string) uint64 {
	output, err := exec.Command("ethtool", ifaceName).Output()
	if err != nil {
		return 0
	}

	return parseLinkSpeed(output)
}

// parseLinkSpeed extracts the first "Speed: <N>Mb/s" value from ethtool
// output and converts it to bits per second. It returns 0 when no speed line
// is present, the number does not fit in a uint64, or the conversion to
// bits per second would overflow.
func parseLinkSpeed(output []byte) uint64 {
	// Largest Mb/s value that can be scaled to bits/s without wrapping.
	const maxMbps = ^uint64(0) / bitsPerMegabit

	matches := ethtoolSpeedRegex.FindSubmatch(output)
	if len(matches) < 2 {
		return 0
	}

	mbps, err := strconv.ParseUint(string(matches[1]), 10, 64)
	if err != nil || mbps > maxMbps {
		return 0
	}

	return mbps * bitsPerMegabit
}