fix: 优化告警触发次数记录逻辑,使用sync.Map存储触发时间,清理过期记录
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"be.ems/src/framework/constants"
|
"be.ems/src/framework/constants"
|
||||||
@@ -20,6 +21,12 @@ import (
|
|||||||
wsService "be.ems/src/modules/ws/service"
|
wsService "be.ems/src/modules/ws/service"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
triggerMax int64 = 3 // Threshold: an alarm is raised once the number of triggers within triggerWindow reaches this value (>=)
|
||||||
|
triggerCount sync.Map // Trigger records per NE: key is "<NeType>_<NeId>", value is a []time.Time of recent trigger timestamps
|
||||||
|
triggerWindow time.Duration = 5 * time.Second // 事件触发的时间窗口
|
||||||
|
)
|
||||||
|
|
||||||
var NewProcessor = &NeAlarmStateCheckCMDProcessor{
|
var NewProcessor = &NeAlarmStateCheckCMDProcessor{
|
||||||
neConfigBackupService: neService.NewNeConfigBackup,
|
neConfigBackupService: neService.NewNeConfigBackup,
|
||||||
neInfoService: neService.NewNeInfo,
|
neInfoService: neService.NewNeInfo,
|
||||||
@@ -27,8 +34,6 @@ var NewProcessor = &NeAlarmStateCheckCMDProcessor{
|
|||||||
alarmService: neDataService.NewAlarm,
|
alarmService: neDataService.NewAlarm,
|
||||||
wsSendService: wsService.NewWSSend,
|
wsSendService: wsService.NewWSSend,
|
||||||
count: 0,
|
count: 0,
|
||||||
triggerMax: 4,
|
|
||||||
triggerCount: 0,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查
|
// NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查
|
||||||
@@ -39,8 +44,6 @@ type NeAlarmStateCheckCMDProcessor struct {
|
|||||||
alarmService *neDataService.Alarm // 告警信息服务
|
alarmService *neDataService.Alarm // 告警信息服务
|
||||||
wsSendService *wsService.WSSend // ws发送服务
|
wsSendService *wsService.WSSend // ws发送服务
|
||||||
count int // 执行次数
|
count int // 执行次数
|
||||||
triggerMax int // 阈值连续触发次数大于才会产生告警
|
|
||||||
triggerCount int // 阈值连续触发次数
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// alarmParams 告警参数
|
// alarmParams 告警参数
|
||||||
@@ -142,7 +145,7 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
|
|||||||
// 进行新增
|
// 进行新增
|
||||||
newAlarm, err := s.alarmNew(neInfo, params)
|
newAlarm, err := s.alarmNew(neInfo, params)
|
||||||
params.AddInfo = addInfo // 恢复附加信息
|
params.AddInfo = addInfo // 恢复附加信息
|
||||||
s.triggerCount = 0 // 重置连续触发次数
|
triggerCount.Clear() // 重置连续触发次数
|
||||||
if err != nil {
|
if err != nil {
|
||||||
result[neTypeAndId] = err.Error()
|
result[neTypeAndId] = err.Error()
|
||||||
continue
|
continue
|
||||||
@@ -233,8 +236,25 @@ func (s NeAlarmStateCheckCMDProcessor) serverState(state map[string]any, cpuUseG
|
|||||||
warnMsg = append(warnMsg, fmt.Sprintf("disk usage %.2f%%", sysDiskUsage))
|
warnMsg = append(warnMsg, fmt.Sprintf("disk usage %.2f%%", sysDiskUsage))
|
||||||
}
|
}
|
||||||
if len(warnMsg) > 0 {
|
if len(warnMsg) > 0 {
|
||||||
s.triggerCount++
|
currentTime := time.Now()
|
||||||
if s.triggerCount > s.triggerMax {
|
neTypeAndId := fmt.Sprintf("%s_%s", neState.NeType, neState.NeId)
|
||||||
|
validTimes := []time.Time{}
|
||||||
|
if v, ok := triggerCount.Load(neTypeAndId); ok {
|
||||||
|
times := v.([]time.Time)
|
||||||
|
// Drop expired records: triggers older than triggerWindow are no longer counted
|
||||||
|
for _, t := range times {
|
||||||
|
if currentTime.Sub(t) <= triggerWindow {
|
||||||
|
validTimes = append(validTimes, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
validTimes = append(validTimes, currentTime)
|
||||||
|
triggerCount.Store(neTypeAndId, validTimes)
|
||||||
|
} else {
|
||||||
|
// 事件第一次触发,初始化记录
|
||||||
|
validTimes = append(validTimes, currentTime)
|
||||||
|
triggerCount.Store(neTypeAndId, validTimes)
|
||||||
|
}
|
||||||
|
if int64(len(validTimes)) >= triggerMax {
|
||||||
return fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
|
return fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user