fix: 优化告警触发次数记录逻辑,使用sync.Map存储触发时间,清理过期记录

This commit is contained in:
TsMask
2025-05-23 11:45:37 +08:00
parent 687d1dcf64
commit 1228f5ea5b

View File

@@ -7,6 +7,7 @@ import (
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"be.ems/src/framework/constants" "be.ems/src/framework/constants"
@@ -20,6 +21,12 @@ import (
wsService "be.ems/src/modules/ws/service" wsService "be.ems/src/modules/ws/service"
) )
// Package-level state for debouncing resource-usage alarms: a threshold breach
// is only reported once it has been observed triggerMax times within
// triggerWindow for the same network element.
var (
// triggerMax is the number of recorded triggers inside triggerWindow at which
// a breach is reported (the check is len(records) >= triggerMax).
triggerMax int64 = 3
// triggerCount stores, per "<NeType>_<NeId>" key, the []time.Time of recent
// threshold breaches for that network element.
triggerCount sync.Map
// triggerWindow is how long a recorded trigger stays valid; older entries are
// dropped before counting.
triggerWindow time.Duration = 5 * time.Second
)
var NewProcessor = &NeAlarmStateCheckCMDProcessor{ var NewProcessor = &NeAlarmStateCheckCMDProcessor{
neConfigBackupService: neService.NewNeConfigBackup, neConfigBackupService: neService.NewNeConfigBackup,
neInfoService: neService.NewNeInfo, neInfoService: neService.NewNeInfo,
@@ -27,8 +34,6 @@ var NewProcessor = &NeAlarmStateCheckCMDProcessor{
alarmService: neDataService.NewAlarm, alarmService: neDataService.NewAlarm,
wsSendService: wsService.NewWSSend, wsSendService: wsService.NewWSSend,
count: 0, count: 0,
triggerMax: 4,
triggerCount: 0,
} }
// NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查 // NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查
@@ -39,8 +44,6 @@ type NeAlarmStateCheckCMDProcessor struct {
alarmService *neDataService.Alarm // 告警信息服务 alarmService *neDataService.Alarm // 告警信息服务
wsSendService *wsService.WSSend // ws发送服务 wsSendService *wsService.WSSend // ws发送服务
count int // 执行次数 count int // 执行次数
triggerMax int // 阈值连续触发次数大于才会产生告警
triggerCount int // 阈值连续触发次数
} }
// alarmParams 告警参数 // alarmParams 告警参数
@@ -142,7 +145,7 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
// 进行新增 // 进行新增
newAlarm, err := s.alarmNew(neInfo, params) newAlarm, err := s.alarmNew(neInfo, params)
params.AddInfo = addInfo // 恢复附加信息 params.AddInfo = addInfo // 恢复附加信息
s.triggerCount = 0 // 重置连续触发次数 triggerCount.Clear() // 重置连续触发次数
if err != nil { if err != nil {
result[neTypeAndId] = err.Error() result[neTypeAndId] = err.Error()
continue continue
@@ -233,8 +236,25 @@ func (s NeAlarmStateCheckCMDProcessor) serverState(state map[string]any, cpuUseG
warnMsg = append(warnMsg, fmt.Sprintf("disk usage %.2f%%", sysDiskUsage)) warnMsg = append(warnMsg, fmt.Sprintf("disk usage %.2f%%", sysDiskUsage))
} }
if len(warnMsg) > 0 { if len(warnMsg) > 0 {
s.triggerCount++ currentTime := time.Now()
if s.triggerCount > s.triggerMax { neTypeAndId := fmt.Sprintf("%s_%s", neState.NeType, neState.NeId)
validTimes := []time.Time{}
if v, ok := triggerCount.Load(neTypeAndId); ok {
times := v.([]time.Time)
// Drop expired records: triggers older than triggerWindow are no longer counted
for _, t := range times {
if currentTime.Sub(t) <= triggerWindow {
validTimes = append(validTimes, t)
}
}
validTimes = append(validTimes, currentTime)
triggerCount.Store(neTypeAndId, validTimes)
} else {
// 事件第一次触发,初始化记录
validTimes = append(validTimes, currentTime)
triggerCount.Store(neTypeAndId, validTimes)
}
if int64(len(validTimes)) >= triggerMax {
return fmt.Errorf("greater than %s", strings.Join(warnMsg, ", ")) return fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
} }
} }