fix: 更新告警参数描述,修正CPU使用率计算逻辑,修复license检查天数错误
This commit is contained in:
@@ -15,6 +15,10 @@ import (
|
||||
"be.ems/lib/global"
|
||||
"be.ems/lib/log"
|
||||
"be.ems/lib/services"
|
||||
"be.ems/src/framework/constants"
|
||||
neDataModel "be.ems/src/modules/network_data/model"
|
||||
neDataService "be.ems/src/modules/network_data/service"
|
||||
traceService "be.ems/src/modules/trace/service"
|
||||
"xorm.io/xorm"
|
||||
|
||||
"github.com/go-resty/resty/v2"
|
||||
@@ -476,6 +480,23 @@ func PostAlarmFromNF(w http.ResponseWriter, r *http.Request) {
|
||||
log.Error("Failed to AlarmSMSForward:", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 网元重启后,清除活动告警
|
||||
if alarmData.AlarmCode == constants.ALARM_EVENT_REBOOT {
|
||||
alarmService := neDataService.NewAlarm
|
||||
rows := alarmService.Find(neDataModel.Alarm{
|
||||
NeType: alarmData.NeType,
|
||||
NeId: alarmData.NeId,
|
||||
AlarmStatus: "1",
|
||||
})
|
||||
for _, v := range rows {
|
||||
alarmService.AlarmClearByIds([]string{v.ID}, "system")
|
||||
}
|
||||
}
|
||||
// 网元重启后,有跟踪任务的需要重新补发启动任务
|
||||
if alarmData.AlarmCode == constants.ALARM_EVENT_REBOOT {
|
||||
traceService.NewTraceTask.RunUnstopped()
|
||||
}
|
||||
}
|
||||
|
||||
services.ResponseStatusOK200Null(w)
|
||||
@@ -788,6 +809,23 @@ func GetAlarmFromNF(w http.ResponseWriter, r *http.Request) {
|
||||
log.Error("Failed to AlarmSMSForward:", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 网元重启后,清除活动告警
|
||||
if alarmData.AlarmCode == constants.ALARM_EVENT_REBOOT {
|
||||
alarmService := neDataService.NewAlarm
|
||||
rows := alarmService.Find(neDataModel.Alarm{
|
||||
NeType: alarmData.NeType,
|
||||
NeId: alarmData.NeId,
|
||||
AlarmStatus: "1",
|
||||
})
|
||||
for _, v := range rows {
|
||||
alarmService.AlarmClearByIds([]string{v.ID}, "system")
|
||||
}
|
||||
}
|
||||
// 网元重启后,有跟踪任务的需要重新补发启动任务
|
||||
if alarmData.AlarmCode == constants.ALARM_EVENT_REBOOT {
|
||||
traceService.NewTraceTask.RunUnstopped()
|
||||
}
|
||||
}
|
||||
log.Warn("Failed to insert alarm data:", err)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package ne_alarm_state_check_cmd
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -26,6 +27,8 @@ var NewProcessor = &NeAlarmStateCheckCMDProcessor{
|
||||
alarmService: neDataService.NewAlarm,
|
||||
wsSendService: wsService.NewWSSend,
|
||||
count: 0,
|
||||
triggerMax: 4,
|
||||
triggerCount: 0,
|
||||
}
|
||||
|
||||
// NeAlarmStateCheckCMDProcessor 网元告警内存/CPU/磁盘检查
|
||||
@@ -36,7 +39,8 @@ type NeAlarmStateCheckCMDProcessor struct {
|
||||
alarmService *neDataService.Alarm // 告警信息服务
|
||||
wsSendService *wsService.WSSend // ws发送服务
|
||||
count int // 执行次数
|
||||
|
||||
triggerMax int // 阈值连续触发次数大于才会产生告警
|
||||
triggerCount int // 阈值连续触发次数
|
||||
}
|
||||
|
||||
// alarmParams 告警参数
|
||||
@@ -47,7 +51,7 @@ type alarmParams struct {
|
||||
SpecificProblem string `json:"specificProblem"` // Alarm Cause: CPU/Memory/Disk status received from target NE reaches the threshold
|
||||
SpecificProblemID string `json:"specificProblemId"` // AC10100
|
||||
AddInfo string `json:"addInfo"` // 告警补充信息
|
||||
CPUUseGt int64 `json:"cpuUseGt"` // CPU使用率大于, 范围0~100%
|
||||
CPUUseGt int64 `json:"cpuUseGt"` // CPU使用率大于, 范围0~100*CPU核心数
|
||||
MemUseGt int64 `json:"memUseGt"` // 内存使用率大于, 范围0~100%
|
||||
DiskUseGt int64 `json:"diskUseGt"` // 磁盘使用率大于, 范围0~100%
|
||||
|
||||
@@ -72,8 +76,9 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
|
||||
return nil, fmt.Errorf("json params err: %v", err)
|
||||
}
|
||||
// 检查使用率
|
||||
if params.CPUUseGt > 100 || params.CPUUseGt < 0 {
|
||||
return nil, fmt.Errorf("cpuUseGt must be between 0 and 100")
|
||||
numCPU := runtime.NumCPU()
|
||||
if params.CPUUseGt > int64(numCPU*100) || params.CPUUseGt < 0 {
|
||||
return nil, fmt.Errorf("cpuUseGt must be between 0 and 100 * NumCPU")
|
||||
}
|
||||
if params.MemUseGt > 100 || params.MemUseGt < 0 {
|
||||
return nil, fmt.Errorf("memUseGt must be between 0 and 100")
|
||||
@@ -137,6 +142,7 @@ func (s *NeAlarmStateCheckCMDProcessor) Execute(data any) (any, error) {
|
||||
// 进行新增
|
||||
newAlarm, err := s.alarmNew(neInfo, params)
|
||||
params.AddInfo = addInfo // 恢复附加信息
|
||||
s.triggerCount = 0 // 重置连续触发次数
|
||||
if err != nil {
|
||||
result[neTypeAndId] = err.Error()
|
||||
continue
|
||||
@@ -227,7 +233,10 @@ func (s NeAlarmStateCheckCMDProcessor) serverState(state map[string]any, cpuUseG
|
||||
warnMsg = append(warnMsg, fmt.Sprintf("disk usage %.2f%%", sysDiskUsage))
|
||||
}
|
||||
if len(warnMsg) > 0 {
|
||||
return fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
|
||||
s.triggerCount++
|
||||
if s.triggerCount > s.triggerMax {
|
||||
return fmt.Errorf("greater than %s", strings.Join(warnMsg, ", "))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -120,8 +120,15 @@ func (s *NeAlarmStateCheckLicenseProcessor) Execute(data any) (any, error) {
|
||||
}
|
||||
// 未记录
|
||||
if alarmStatus == "" {
|
||||
addInfo := params.AddInfo
|
||||
if params.AddInfo != "" {
|
||||
params.AddInfo = params.AddInfo + ", " + err.Error()
|
||||
} else {
|
||||
params.AddInfo = err.Error()
|
||||
}
|
||||
// 进行新增
|
||||
newAlarm, err := s.alarmNew(neInfo, params)
|
||||
params.AddInfo = addInfo // 恢复附加信息
|
||||
if err != nil {
|
||||
result[neTypeAndId] = err.Error()
|
||||
continue
|
||||
@@ -150,7 +157,7 @@ func (s NeAlarmStateCheckLicenseProcessor) serverState(state map[string]any, day
|
||||
}
|
||||
|
||||
// 计算距离天数
|
||||
daysLeft := int64(time.Since(expireTime).Hours() / 24)
|
||||
daysLeft := int64(time.Until(expireTime).Hours() / 24)
|
||||
if daysLeft < dayLt {
|
||||
return fmt.Errorf("license will expire in %d days", daysLeft)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user