From c325b89f6fb59da2ce69445e6210b73133b1abd9 Mon Sep 17 00:00:00 2001 From: TsMask <340112800@qq.com> Date: Thu, 16 Oct 2025 11:29:46 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BC=98=E5=8C=96=E5=91=8A=E8=AD=A6?= =?UTF-8?q?=E7=8A=B6=E6=80=81=E6=A3=80=E6=9F=A5=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=BF=9E=E7=BB=AD=E8=A7=A6=E5=8F=91=E6=AC=A1?= =?UTF-8?q?=E6=95=B0=E7=AE=A1=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ne_alarm_state_check.go | 33 ++++++++++-- .../ne_alarm_state_check_cmd.go | 53 ++++++++++--------- 2 files changed, 57 insertions(+), 29 deletions(-) diff --git a/src/modules/crontask/processor/ne_alarm_state_check/ne_alarm_state_check.go b/src/modules/crontask/processor/ne_alarm_state_check/ne_alarm_state_check.go index 843b16d2..c0499513 100644 --- a/src/modules/crontask/processor/ne_alarm_state_check/ne_alarm_state_check.go +++ b/src/modules/crontask/processor/ne_alarm_state_check/ne_alarm_state_check.go @@ -3,6 +3,7 @@ package ne_alarm_state_check import ( "encoding/json" "fmt" + "sync" "time" "github.com/tsmask/go-oam" @@ -20,12 +21,16 @@ import ( var NewProcessor = &NeAlarmStateCheckProcessor{ neInfoService: neService.NewNeInfo, count: 0, + triggerMax: 3, + triggerCount: sync.Map{}, } // NeAlarmStateCheckProcessor 网元告警状态检查 type NeAlarmStateCheckProcessor struct { neInfoService *neService.NeInfo // 网元信息服务 count int // 执行次数 + triggerMax int64 // 阈值:连续触发次数大于该值才会产生告警 + triggerCount sync.Map // 阈值连续触发次数,存储每个事件的触发记录 } // alarmParams 告警参数 @@ -75,9 +80,29 @@ func (s *NeAlarmStateCheckProcessor) Execute(data any) (any, error) { isOnline := parse.Boolean(neInfo.ServerState["online"]) // 告警状态 - alarmStatus := oam.ALARM_STATUS_ACTIVE - if isOnline { - alarmStatus = oam.ALARM_STATUS_CLEAR + alarmStatus := oam.ALARM_STATUS_CLEAR + if !isOnline { + // 重置连续触发次数, 超过阈值才会清除告警 + onlineKey := "ONLINE:" + neInfo.RmUID + if v, ok := s.triggerCount.Load(onlineKey); ok { + count := parse.Number(v) + if count < s.triggerMax { + s.triggerCount.Store(onlineKey, count+1) + continue + } + s.triggerCount.Delete(onlineKey) + } else { + s.triggerCount.Store(onlineKey, 0) + continue + } + alarmStatus = oam.ALARM_STATUS_ACTIVE + } + // 附加信息 + addInfo := params.AddInfo + if addInfo != "" { + addInfo = fmt.Sprintf("%s, NE Connect Failed %s:%d", addInfo, neInfo.IP, neInfo.Port) + } else { + addInfo = fmt.Sprintf("NE Connect Failed %s:%d", neInfo.IP, neInfo.Port) } // 告警ID params.AlarmId = fmt.Sprintf("%d%d", constants.ALARM_STATE_CHECK, neInfo.CreateTime) @@ -94,7 +119,7 @@ func (s *NeAlarmStateCheckProcessor) Execute(data any) (any, error) { AlarmStatus: alarmStatus, // 告警状态 SpecificProblem: params.SpecificProblem, // 告警问题原因 SpecificProblemID: params.SpecificProblemID, // 告警问题原因ID - AddInfo: params.AddInfo, // 告警辅助信息 + AddInfo: addInfo, // 告警辅助信息 LocationInfo: "NE State: Heartbeat", // 告警定位信息 } if err = oamService.NewAlarm.Resolve(alarm); err == nil { diff --git a/src/modules/crontask/processor/ne_alarm_state_check_cmd/ne_alarm_state_check_cmd.go b/src/modules/crontask/processor/ne_alarm_state_check_cmd/ne_alarm_state_check_cmd.go index c7dd970a..c1de2070 100644 --- a/src/modules/crontask/processor/ne_alarm_state_check_cmd/ne_alarm_state_check_cmd.go +++ b/src/modules/crontask/processor/ne_alarm_state_check_cmd/ne_alarm_state_check_cmd.go @@ -25,17 +25,13 @@ import ( wsService "be.ems/src/modules/ws/service" ) -var ( - triggerMax int64 = 3 // 阈值:连续触发次数大于该值才会产生告警 - triggerCount sync.Map // 阈值连续触发次数,存储每个事件的触发记录 - triggerWindow time.Duration = 30 * time.Second // 事件触发的时间窗口 -) - var NewProcessor = &NeAlarmStateCheckCMDProcessor{ neInfoService: neService.NewNeInfo, neStateService: neDataService.NewNEState, wsSendService: wsService.NewWSSend, count: 0, + triggerMax: 3, + triggerCount: sync.Map{}, } // NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查 @@ -44,6 +40,8 @@ type NeAlarmStateCheckCMDProcessor struct { neStateService *neDataService.NEState // 网元状态信息服务 wsSendService *wsService.WSSend // ws发送服务 count int // 执行次数 + triggerMax int64 // 阈值:连续触发次数大于该值才会产生告警 + triggerCount sync.Map // 阈值连续触发次数,存储每个事件的触发记录 } // alarmParams 告警参数 @@ -124,31 +122,37 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) { } var err error if len(warnMsg) > 0 { - currentTime := time.Now() - validTimes := []time.Time{} - if v, ok := triggerCount.Load(neInfo.RmUID); ok { - times := v.([]time.Time) - // 清理过期的记录:10秒前的触发记录不再计入 - for _, t := range times { - if currentTime.Sub(t) <= triggerWindow { - validTimes = append(validTimes, t) - } - } - validTimes = append(validTimes, currentTime) - triggerCount.Store(neInfo.RmUID, validTimes) + var count int64 + if v, ok := s.triggerCount.Load(neInfo.RmUID); ok { + count = parse.Number(v) + s.triggerCount.Store(neInfo.RmUID, count+1) } else { - // 事件第一次触发,初始化记录 - validTimes = append(validTimes, currentTime) - triggerCount.Store(neInfo.RmUID, validTimes) + s.triggerCount.Store(neInfo.RmUID, 0) } - if int64(len(validTimes)) >= triggerMax { + if count >= s.triggerMax { + s.triggerCount.Delete("CLEAR:" + neInfo.RmUID) + s.triggerCount.Delete(neInfo.RmUID) err = fmt.Errorf("greater than %s", strings.Join(warnMsg, ", ")) } } // 告警状态 alarmStatus := oam.ALARM_STATUS_ACTIVE - if err == nil { // 检查状态连续触发 + if err == nil { + // 重置连续触发次数, 超过阈值才会清除告警 + clearKey := "CLEAR:" + neInfo.RmUID + if v, ok := s.triggerCount.Load(clearKey); ok { + count := parse.Number(v) + if count < s.triggerMax { + s.triggerCount.Store(clearKey, count+1) + continue + } + s.triggerCount.Delete(clearKey) + s.triggerCount.Delete(neInfo.RmUID) + } else { + s.triggerCount.Store(clearKey, 0) + continue + } alarmStatus = oam.ALARM_STATUS_CLEAR } // 附加信息 @@ -181,7 +185,6 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) { if err = oamService.NewAlarm.Resolve(alarm); err == nil { result[neInfo.RmUID] = "cmd alarm" } - triggerCount.Delete(neInfo.RmUID) } // 返回结果,用于记录执行结果 @@ -189,7 +192,7 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) { } // serverState 网元状态记录 -func (s NeAlarmStateCheckCMDProcessor) serverState(state map[string]any) (float64, float64, float64) { +func (s *NeAlarmStateCheckCMDProcessor) serverState(state map[string]any) (float64, float64, float64) { // 网元CPU使用率 var nfCpuUsage float64 = 0 // CPU使用率