fix: 优化告警状态检查逻辑,增加连续触发次数管理
This commit is contained in:
@@ -3,6 +3,7 @@ package ne_alarm_state_check
|
|||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/tsmask/go-oam"
|
"github.com/tsmask/go-oam"
|
||||||
@@ -20,12 +21,16 @@ import (
|
|||||||
var NewProcessor = &NeAlarmStateCheckProcessor{
|
var NewProcessor = &NeAlarmStateCheckProcessor{
|
||||||
neInfoService: neService.NewNeInfo,
|
neInfoService: neService.NewNeInfo,
|
||||||
count: 0,
|
count: 0,
|
||||||
|
triggerMax: 3,
|
||||||
|
triggerCount: sync.Map{},
|
||||||
}
|
}
|
||||||
|
|
||||||
// NeAlarmStateCheckProcessor 网元告警状态检查
|
// NeAlarmStateCheckProcessor 网元告警状态检查
|
||||||
type NeAlarmStateCheckProcessor struct {
|
type NeAlarmStateCheckProcessor struct {
|
||||||
neInfoService *neService.NeInfo // 网元信息服务
|
neInfoService *neService.NeInfo // 网元信息服务
|
||||||
count int // 执行次数
|
count int // 执行次数
|
||||||
|
triggerMax int64 // 阈值:连续触发次数大于该值才会产生告警
|
||||||
|
triggerCount sync.Map // 阈值连续触发次数,存储每个事件的触发记录
|
||||||
}
|
}
|
||||||
|
|
||||||
// alarmParams 告警参数
|
// alarmParams 告警参数
|
||||||
@@ -75,9 +80,29 @@ func (s *NeAlarmStateCheckProcessor) Execute(data any) (any, error) {
|
|||||||
isOnline := parse.Boolean(neInfo.ServerState["online"])
|
isOnline := parse.Boolean(neInfo.ServerState["online"])
|
||||||
|
|
||||||
// 告警状态
|
// 告警状态
|
||||||
alarmStatus := oam.ALARM_STATUS_ACTIVE
|
alarmStatus := oam.ALARM_STATUS_CLEAR
|
||||||
if isOnline {
|
if !isOnline {
|
||||||
alarmStatus = oam.ALARM_STATUS_CLEAR
|
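// Count offline detections for this NE: the first detection stores 0 and skips, later ones increment and skip; the alarm becomes ACTIVE only once the counter has reached triggerMax.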
|
||||||
|
onlineKey := "ONLINE:" + neInfo.RmUID
|
||||||
|
if v, ok := s.triggerCount.Load(onlineKey); ok {
|
||||||
|
count := parse.Number(v)
|
||||||
|
if count < s.triggerMax {
|
||||||
|
s.triggerCount.Store(onlineKey, count+1)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.triggerCount.Delete(onlineKey)
|
||||||
|
} else {
|
||||||
|
s.triggerCount.Store(onlineKey, 0)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
alarmStatus = oam.ALARM_STATUS_ACTIVE
|
||||||
|
}
|
||||||
|
// 附加信息
|
||||||
|
addInfo := params.AddInfo
|
||||||
|
if addInfo != "" {
|
||||||
|
addInfo = fmt.Sprintf("%s, NE Connect Failed %s:%d", addInfo, neInfo.IP, neInfo.Port)
|
||||||
|
} else {
|
||||||
|
addInfo = fmt.Sprintf("NE Connect Failed %s:%d", neInfo.IP, neInfo.Port)
|
||||||
}
|
}
|
||||||
// 告警ID
|
// 告警ID
|
||||||
params.AlarmId = fmt.Sprintf("%d%d", constants.ALARM_STATE_CHECK, neInfo.CreateTime)
|
params.AlarmId = fmt.Sprintf("%d%d", constants.ALARM_STATE_CHECK, neInfo.CreateTime)
|
||||||
@@ -94,7 +119,7 @@ func (s *NeAlarmStateCheckProcessor) Execute(data any) (any, error) {
|
|||||||
AlarmStatus: alarmStatus, // 告警状态
|
AlarmStatus: alarmStatus, // 告警状态
|
||||||
SpecificProblem: params.SpecificProblem, // 告警问题原因
|
SpecificProblem: params.SpecificProblem, // 告警问题原因
|
||||||
SpecificProblemID: params.SpecificProblemID, // 告警问题原因ID
|
SpecificProblemID: params.SpecificProblemID, // 告警问题原因ID
|
||||||
AddInfo: params.AddInfo, // 告警辅助信息
|
AddInfo: addInfo, // 告警辅助信息
|
||||||
LocationInfo: "NE State: Heartbeat", // 告警定位信息
|
LocationInfo: "NE State: Heartbeat", // 告警定位信息
|
||||||
}
|
}
|
||||||
if err = oamService.NewAlarm.Resolve(alarm); err == nil {
|
if err = oamService.NewAlarm.Resolve(alarm); err == nil {
|
||||||
|
|||||||
@@ -25,17 +25,13 @@ import (
|
|||||||
wsService "be.ems/src/modules/ws/service"
|
wsService "be.ems/src/modules/ws/service"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
|
||||||
triggerMax int64 = 3 // 阈值:连续触发次数大于该值才会产生告警
|
|
||||||
triggerCount sync.Map // 阈值连续触发次数,存储每个事件的触发记录
|
|
||||||
triggerWindow time.Duration = 30 * time.Second // 事件触发的时间窗口
|
|
||||||
)
|
|
||||||
|
|
||||||
var NewProcessor = &NeAlarmStateCheckCMDProcessor{
|
var NewProcessor = &NeAlarmStateCheckCMDProcessor{
|
||||||
neInfoService: neService.NewNeInfo,
|
neInfoService: neService.NewNeInfo,
|
||||||
neStateService: neDataService.NewNEState,
|
neStateService: neDataService.NewNEState,
|
||||||
wsSendService: wsService.NewWSSend,
|
wsSendService: wsService.NewWSSend,
|
||||||
count: 0,
|
count: 0,
|
||||||
|
triggerMax: 3,
|
||||||
|
triggerCount: sync.Map{},
|
||||||
}
|
}
|
||||||
|
|
||||||
// NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查
|
// NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查
|
||||||
@@ -44,6 +40,8 @@ type NeAlarmStateCheckCMDProcessor struct {
|
|||||||
neStateService *neDataService.NEState // 网元状态信息服务
|
neStateService *neDataService.NEState // 网元状态信息服务
|
||||||
wsSendService *wsService.WSSend // ws发送服务
|
wsSendService *wsService.WSSend // ws发送服务
|
||||||
count int // 执行次数
|
count int // 执行次数
|
||||||
|
triggerMax int64 // 阈值:连续触发次数大于该值才会产生告警
|
||||||
|
triggerCount sync.Map // 阈值连续触发次数,存储每个事件的触发记录
|
||||||
}
|
}
|
||||||
|
|
||||||
// alarmParams 告警参数
|
// alarmParams 告警参数
|
||||||
@@ -124,31 +122,37 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
|
|||||||
}
|
}
|
||||||
var err error
|
var err error
|
||||||
if len(warnMsg) > 0 {
|
if len(warnMsg) > 0 {
|
||||||
currentTime := time.Now()
|
var count int64
|
||||||
validTimes := []time.Time{}
|
if v, ok := s.triggerCount.Load(neInfo.RmUID); ok {
|
||||||
if v, ok := triggerCount.Load(neInfo.RmUID); ok {
|
count = parse.Number(v)
|
||||||
times := v.([]time.Time)
|
s.triggerCount.Store(neInfo.RmUID, count+1)
|
||||||
// 清理过期的记录:10秒前的触发记录不再计入
|
|
||||||
for _, t := range times {
|
|
||||||
if currentTime.Sub(t) <= triggerWindow {
|
|
||||||
validTimes = append(validTimes, t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
validTimes = append(validTimes, currentTime)
|
|
||||||
triggerCount.Store(neInfo.RmUID, validTimes)
|
|
||||||
} else {
|
} else {
|
||||||
// 事件第一次触发,初始化记录
|
s.triggerCount.Store(neInfo.RmUID, 0)
|
||||||
validTimes = append(validTimes, currentTime)
|
|
||||||
triggerCount.Store(neInfo.RmUID, validTimes)
|
|
||||||
}
|
}
|
||||||
if int64(len(validTimes)) >= triggerMax {
|
if count >= s.triggerMax {
|
||||||
|
s.triggerCount.Delete("CLEAR:" + neInfo.RmUID)
|
||||||
|
s.triggerCount.Delete(neInfo.RmUID)
|
||||||
err = fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
|
err = fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 告警状态
|
// 告警状态
|
||||||
alarmStatus := oam.ALARM_STATUS_ACTIVE
|
alarmStatus := oam.ALARM_STATUS_ACTIVE
|
||||||
if err == nil { // 检查状态连续触发
|
if err == nil {
|
||||||
|
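// Count error-free checks for this NE under the "CLEAR:" key: the alarm is cleared only once that counter has reached triggerMax; earlier checks just increment it and skip.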
|
||||||
|
clearKey := "CLEAR:" + neInfo.RmUID
|
||||||
|
if v, ok := s.triggerCount.Load(clearKey); ok {
|
||||||
|
count := parse.Number(v)
|
||||||
|
if count < s.triggerMax {
|
||||||
|
s.triggerCount.Store(clearKey, count+1)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.triggerCount.Delete(clearKey)
|
||||||
|
s.triggerCount.Delete(neInfo.RmUID)
|
||||||
|
} else {
|
||||||
|
s.triggerCount.Store(clearKey, 0)
|
||||||
|
continue
|
||||||
|
}
|
||||||
alarmStatus = oam.ALARM_STATUS_CLEAR
|
alarmStatus = oam.ALARM_STATUS_CLEAR
|
||||||
}
|
}
|
||||||
// 附加信息
|
// 附加信息
|
||||||
@@ -181,7 +185,6 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
|
|||||||
if err = oamService.NewAlarm.Resolve(alarm); err == nil {
|
if err = oamService.NewAlarm.Resolve(alarm); err == nil {
|
||||||
result[neInfo.RmUID] = "cmd alarm"
|
result[neInfo.RmUID] = "cmd alarm"
|
||||||
}
|
}
|
||||||
triggerCount.Delete(neInfo.RmUID)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 返回结果,用于记录执行结果
|
// 返回结果,用于记录执行结果
|
||||||
@@ -189,7 +192,7 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// serverState 网元状态记录
|
// serverState 网元状态记录
|
||||||
func (s NeAlarmStateCheckCMDProcessor) serverState(state map[string]any) (float64, float64, float64) {
|
func (s *NeAlarmStateCheckCMDProcessor) serverState(state map[string]any) (float64, float64, float64) {
|
||||||
// 网元CPU使用率
|
// 网元CPU使用率
|
||||||
var nfCpuUsage float64 = 0
|
var nfCpuUsage float64 = 0
|
||||||
// CPU使用率
|
// CPU使用率
|
||||||
|
|||||||
Reference in New Issue
Block a user