dash: HTTP等服务监控的故障/恢复报警

This commit is contained in:
naiba
2021-04-17 23:36:37 +08:00
parent e916ca06d0
commit 96be2330a9
18 changed files with 343 additions and 204 deletions

View File

@@ -3,7 +3,6 @@ package controller
import (
"errors"
"fmt"
"log"
"net/http"
"time"
@@ -80,79 +79,8 @@ func (p *commonPage) checkViewPassword(c *gin.Context) {
c.Next()
}
// ServiceItem holds the aggregated up/down statistics of a single monitor,
// as assembled by the commonPage.service handler for the service status page.
//
// The three array fields cover a 30-day window: index 29 is the current day
// and lower indices are progressively earlier days.
type ServiceItem struct {
// Monitor is the monitor record these statistics belong to.
Monitor model.Monitor
// TotalUp counts the successful history samples within the 30-day window.
TotalUp uint64
// TotalDown counts the failed history samples within the 30-day window.
TotalDown uint64
// CurrentUp counts the successful samples among the most recent samples
// recorded today.
// NOTE(review): the handler's comment says "last 20 samples", but its loop
// bound (i >= len-1-20) admits up to 21 — confirm which is intended.
CurrentUp uint64
// CurrentDown counts the failed samples among those same most recent
// samples recorded today.
CurrentDown uint64
// Delay is the per-day running average of the delay of successful samples.
Delay *[30]float32
// Up is the per-day count of successful samples.
Up *[30]int
// Down is the per-day count of failed samples.
Down *[30]int
}
func (p *commonPage) service(c *gin.Context) {
var msm map[uint64]*ServiceItem
var cached bool
if _, has := c.Get(model.CtxKeyAuthorizedUser); !has {
data, has := dao.Cache.Get(model.CacheKeyServicePage)
if has {
log.Println("use cache")
msm = data.(map[uint64]*ServiceItem)
cached = true
}
}
if !cached {
msm = make(map[uint64]*ServiceItem)
var ms []model.Monitor
dao.DB.Find(&ms)
year, month, day := time.Now().Date()
today := time.Date(year, month, day, 0, 0, 0, 0, time.Local)
var mhs []model.MonitorHistory
dao.DB.Where("created_at >= ?", today.AddDate(0, 0, -29)).Find(&mhs)
for i := 0; i < len(ms); i++ {
msm[ms[i].ID] = &ServiceItem{
Monitor: ms[i],
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Up: &[30]int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Down: &[30]int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
}
}
// 整合数据
todayStatus := make(map[uint64][]bool)
for i := 0; i < len(mhs); i++ {
dayIndex := 29
if mhs[i].CreatedAt.Before(today) {
dayIndex = 28 - (int(today.Sub(mhs[i].CreatedAt).Hours()) / 24)
} else {
todayStatus[mhs[i].MonitorID] = append(todayStatus[mhs[i].MonitorID], mhs[i].Successful)
}
if mhs[i].Successful {
msm[mhs[i].MonitorID].TotalUp++
msm[mhs[i].MonitorID].Delay[dayIndex] = (msm[mhs[i].MonitorID].Delay[dayIndex]*float32(msm[mhs[i].MonitorID].Up[dayIndex]) + mhs[i].Delay) / float32(msm[mhs[i].MonitorID].Up[dayIndex]+1)
msm[mhs[i].MonitorID].Up[dayIndex]++
} else {
msm[mhs[i].MonitorID].TotalDown++
msm[mhs[i].MonitorID].Down[dayIndex]++
}
}
// 当日最后 20 个采样作为当前状态
for _, m := range msm {
for i := len(todayStatus[m.Monitor.ID]) - 1; i >= 0 && i >= (len(todayStatus[m.Monitor.ID])-1-20); i-- {
if todayStatus[m.Monitor.ID][i] {
m.CurrentUp++
} else {
m.CurrentDown++
}
}
}
// 未登录人员缓存十分钟
dao.Cache.Set(model.CacheKeyServicePage, msm, time.Minute*10)
}
msm := dao.ServiceSentinelShared.LoadStats()
c.HTML(http.StatusOK, "theme-"+dao.Conf.Site.Theme+"/service", mygin.CommonEnvironment(c, gin.H{
"Title": "服务状态",
"Services": msm,

View File

@@ -73,6 +73,7 @@ func (ma *memberAPI) delete(c *gin.Context) {
case "monitor":
err = dao.DB.Delete(&model.Monitor{}, "id = ?", id).Error
if err == nil {
dao.ServiceSentinelShared.OnMonitorDelete(id)
err = dao.DB.Delete(&model.MonitorHistory{}, "monitor_id = ?", id).Error
}
case "cron":
@@ -194,6 +195,7 @@ type monitorForm struct {
Name string
Target string
Type uint8
Notify string
}
func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
@@ -205,6 +207,7 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
m.Target = mf.Target
m.Type = mf.Type
m.ID = mf.ID
m.Notify = mf.Notify == "on"
}
if err == nil {
if m.ID == 0 {
@@ -219,6 +222,8 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
Message: fmt.Sprintf("请求错误:%s", err),
})
return
} else {
dao.ServiceSentinelShared.OnMonitorUpdate()
}
c.JSON(http.StatusOK, model.Response{
Code: http.StatusOK,

View File

@@ -39,11 +39,9 @@ func (mp *memberPage) server(c *gin.Context) {
}
// monitor renders the "dashboard/monitor" template for the service-monitor
// management page.
//
// The monitor list handed to the template is whatever
// dao.ServiceSentinelShared.Monitors() returns. The map literal previously
// also carried a second "Monitors" entry filled from a direct database query;
// a constant key may appear only once in a Go map literal, so only the
// sentinel-backed entry is kept and the now-unused query is dropped.
func (mp *memberPage) monitor(c *gin.Context) {
	c.HTML(http.StatusOK, "dashboard/monitor", mygin.CommonEnvironment(c, gin.H{
		"Title":    "服务监控",
		"Monitors": dao.ServiceSentinelShared.Monitors(),
	}))
}

View File

@@ -52,6 +52,7 @@ func initSystem() {
dao.DB.AutoMigrate(model.Server{}, model.User{},
model.Notification{}, model.AlertRule{}, model.Monitor{},
model.MonitorHistory{}, model.Cron{})
dao.NewServiceSentinel()
loadServers() //加载服务器列表
loadCrons() //加载计划任务

View File

@@ -7,7 +7,6 @@ import (
"google.golang.org/grpc"
"github.com/naiba/nezha/model"
pb "github.com/naiba/nezha/proto"
"github.com/naiba/nezha/service/dao"
rpcService "github.com/naiba/nezha/service/rpc"
@@ -28,9 +27,8 @@ func ServeRPC(port uint) {
func DispatchTask(duration time.Duration) {
var index uint64 = 0
for {
var tasks []model.Monitor
var hasAliveAgent bool
dao.DB.Find(&tasks)
tasks := dao.ServiceSentinelShared.Monitors()
dao.SortedServerLock.RLock()
startedAt := time.Now()
for i := 0; i < len(tasks); i++ {

View File

@@ -53,7 +53,6 @@ func sysinfo() {
for model, count := range cpuModelCount {
cpus = append(cpus, fmt.Sprintf("%s %d %s Core", model, count, cpuType))
}
log.Println(cpus)
os.Exit(0)
// 硬盘信息,不使用的原因是会重复统计 Linux、Mac
dparts, _ := disk.Partitions(false)