mirror of
https://github.com/Buriburizaem0n/nezha_domains.git
synced 2026-05-06 05:38:50 +00:00
feat(v2.0.0): tsdb (#1162)
* feat: tsdb * fix(ci): remove --parseGoList=false from swag init to fix dependency resolution * fix(ci): fix swag init directory and temporary remove s390x support due to cgo issues * fix(ci): fix swag init output directory to cmd/dashboard/docs * fix(ci): set GOTOOLCHAIN=auto for gosec * feat: add system storage maintenance for SQLite and TSDB * shit * feat: add s390x support and improve service monitoring * ci: upgrade goreleaser-cross image to v1.25 * ci: add libzstd-dev:s390x for cross-compilation * ci: build libzstd for s390x from source * ci: add libzstd_linux_s390x.go for gozstd linking * ci: use vendor mode for s390x gozstd build * ci: clone zstd source for s390x build * refactor(tsdb): rename MaxDiskUsageGB to MinFreeDiskSpaceGB and optimize queries - Rename config to accurately reflect VictoriaMetrics behavior: minimum free disk space threshold - Add QueryServiceHistoryByServerID for batch query optimization - Fix hasStatus to avoid false status counting when only delay data exists - Fix service aggregation boundary: use successCount*2 >= count - Fix serviceID parsing with strconv.ParseUint error handling - Add TagFiltersCacheSize for better query performance * feat(api): add server metrics endpoint and simplify service history response - Add /server/:id/metrics API for querying TSDB server metrics - Simplify getServiceHistory by removing redundant data conversion - Change AvgDelay type from float32 to float64 - Remove generated swagger docs (to be regenerated) - Update TSDB query, writer and tests * chore: 临时禁用不支持前端 * ci: cache zstd build for s390x to speed up CI * fix(tsdb): fix race conditions, data correctness and optimize performance - Fix TOCTOU race between IsClosed() and write/query by holding RLock - Fix delay=0 excluded from stats by using hasDelay flag instead of value > 0 - Fix fmt.Sscanf -> strconv.ParseUint for server_id parsing with error logging - Fix buffer unbounded growth by flushing inside lock when over maxSize - Split makeMetricRow into makeServerMetricRow/makeServiceMetricRow - Extract InitGlobalSettings() from Open() for VictoriaMetrics globals - Remove redundant instance/GetInstance/SetInstance singleton - Add error logging for silently skipped block decode errors - Optimize WriteBatch* to build all rows in single write call - Optimize downsample to use linear scan instead of map for sorted data - Optimize query slice reuse across block iterations * 服务添加DisplayIndex (#1166) * 服务添加DisplayIndex * 根据ai建议修改 --------- Co-authored-by: huYang <306061454@qq.com> * fix(tsdb): restore SQLite fallback and monthly status reload on restart - Restore ServiceHistory model and SQLite write fallback when TSDB is disabled - Reload monthlyStatus (30-day) and serviceStatusToday from TSDB/SQLite on startup - Add SQLite fallback query for /service/:id/history and /server/:id/service - Remove breaking GET /service/:id endpoint, keep /service/:id/history only - Add QueryServiceDailyStats to TSDB for per-day aggregation - Add tests for monthly status and today stats loading from both TSDB and SQLite - Migrate ServiceHistory table only when TSDB is disabled * ci: exclude false-positive gosec rules G117, G703, G704 * feat(api): expose tsdb_enabled in setting response * ci: restore G115 exclusion accidentally dropped in previous commit * fix: update version numbers for OfficialAdmin and Official templates * chore: upgrade frontend * chore: upgrade frontend --------- Co-authored-by: 胡说丷刂 <34758853+laosan-xx@users.noreply.github.com> Co-authored-by: huYang <306061454@qq.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import (
|
||||
"golang.org/x/exp/constraints"
|
||||
|
||||
"github.com/nezhahq/nezha/model"
|
||||
"github.com/nezhahq/nezha/pkg/tsdb"
|
||||
"github.com/nezhahq/nezha/pkg/utils"
|
||||
pb "github.com/nezhahq/nezha/proto"
|
||||
)
|
||||
@@ -39,7 +40,7 @@ type ReportData struct {
|
||||
type _TodayStatsOfService struct {
|
||||
Up uint64 // 今日在线计数
|
||||
Down uint64 // 今日离线计数
|
||||
Delay float32 // 今日平均延迟
|
||||
Delay float64 // 今日平均延迟
|
||||
}
|
||||
|
||||
type serviceResponseData = _TodayStatsOfService
|
||||
@@ -51,8 +52,9 @@ type serviceTaskStatus struct {
|
||||
}
|
||||
|
||||
type pingStore struct {
|
||||
count int
|
||||
ping float32
|
||||
count int
|
||||
ping float64
|
||||
successCount int
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -108,23 +110,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, Loc)
|
||||
|
||||
var mhs []model.ServiceHistory
|
||||
// 加载当日记录
|
||||
DB.Where("created_at >= ? AND server_id = 0", today).Find(&mhs)
|
||||
totalDelay := make(map[uint64]float32)
|
||||
totalDelayCount := make(map[uint64]float32)
|
||||
for _, mh := range mhs {
|
||||
totalDelay[mh.ServiceID] += mh.AvgDelay
|
||||
totalDelayCount[mh.ServiceID]++
|
||||
ss.serviceStatusToday[mh.ServiceID].Up += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
|
||||
ss.serviceStatusToday[mh.ServiceID].Down += mh.Down
|
||||
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
|
||||
}
|
||||
for id, delay := range totalDelay {
|
||||
ss.serviceStatusToday[id].Delay = delay / float32(totalDelayCount[id])
|
||||
}
|
||||
ss.loadTodayStats(today)
|
||||
|
||||
// 启动服务监控器
|
||||
go ss.worker()
|
||||
@@ -135,6 +121,12 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// 每周日凌晨 4:00 执行系统存储维护
|
||||
_, err = CronShared.AddFunc("0 0 4 * * 0", PerformMaintenance)
|
||||
if err != nil {
|
||||
log.Printf("NEZHA>> Warning: failed to schedule maintenance task: %v", err)
|
||||
}
|
||||
|
||||
return ss, nil
|
||||
}
|
||||
|
||||
@@ -171,6 +163,16 @@ func (ss *ServiceSentinel) Dispatch(r ReportData) {
|
||||
ss.serviceReportChannel <- r
|
||||
}
|
||||
|
||||
// sortServices 按 DisplayIndex 降序、ID 升序排列服务列表
|
||||
func sortServices(services []*model.Service) {
|
||||
slices.SortFunc(services, func(a, b *model.Service) int {
|
||||
if a.DisplayIndex != b.DisplayIndex {
|
||||
return cmp.Compare(b.DisplayIndex, a.DisplayIndex)
|
||||
}
|
||||
return cmp.Compare(a.ID, b.ID)
|
||||
})
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) UpdateServiceList() {
|
||||
ss.servicesLock.RLock()
|
||||
defer ss.servicesLock.RUnlock()
|
||||
@@ -179,9 +181,7 @@ func (ss *ServiceSentinel) UpdateServiceList() {
|
||||
defer ss.serviceListLock.Unlock()
|
||||
|
||||
ss.serviceList = utils.MapValuesToSlice(ss.services)
|
||||
slices.SortFunc(ss.serviceList, func(a, b *model.Service) int {
|
||||
return cmp.Compare(a.ID, b.ID)
|
||||
})
|
||||
sortServices(ss.serviceList)
|
||||
}
|
||||
|
||||
// loadServiceHistory 加载服务监控器的历史状态信息
|
||||
@@ -207,6 +207,7 @@ func (ss *ServiceSentinel) loadServiceHistory() error {
|
||||
ss.serviceStatusToday[service.ID] = &_TodayStatsOfService{}
|
||||
}
|
||||
ss.serviceList = services
|
||||
sortServices(ss.serviceList)
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, Loc)
|
||||
@@ -215,33 +216,111 @@ func (ss *ServiceSentinel) loadServiceHistory() error {
|
||||
ss.monthlyStatus[service.ID] = &serviceResponseItem{
|
||||
service: service,
|
||||
ServiceResponseItem: model.ServiceResponseItem{
|
||||
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Delay: &[30]float64{},
|
||||
Up: &[30]uint64{},
|
||||
Down: &[30]uint64{},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// 加载服务监控历史记录
|
||||
var mhs []model.ServiceHistory
|
||||
DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs)
|
||||
var delayCount = make(map[int]int)
|
||||
for _, mh := range mhs {
|
||||
dayIndex := 28 - (int(today.Sub(mh.CreatedAt).Hours()) / 24)
|
||||
if dayIndex < 0 {
|
||||
continue
|
||||
}
|
||||
ss.monthlyStatus[mh.ServiceID].Delay[dayIndex] = (ss.monthlyStatus[mh.ServiceID].Delay[dayIndex]*float32(delayCount[dayIndex]) + mh.AvgDelay) / float32(delayCount[dayIndex]+1)
|
||||
delayCount[dayIndex]++
|
||||
ss.monthlyStatus[mh.ServiceID].Up[dayIndex] += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].Down[dayIndex] += mh.Down
|
||||
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
|
||||
if TSDBEnabled() {
|
||||
ss.loadMonthlyStatusFromTSDB(services, today)
|
||||
} else {
|
||||
ss.loadMonthlyStatusFromDB(today)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) loadMonthlyStatusFromTSDB(services []*model.Service, today time.Time) {
|
||||
for _, service := range services {
|
||||
dailyStats, err := TSDBShared.QueryServiceDailyStats(service.ID, today, 30)
|
||||
if err != nil {
|
||||
log.Printf("NEZHA>> Failed to load TSDB history for service %d: %v", service.ID, err)
|
||||
continue
|
||||
}
|
||||
ms := ss.monthlyStatus[service.ID]
|
||||
for i := 0; i < 29; i++ {
|
||||
ms.Up[i] = dailyStats[i].Up
|
||||
ms.TotalUp += dailyStats[i].Up
|
||||
ms.Down[i] = dailyStats[i].Down
|
||||
ms.TotalDown += dailyStats[i].Down
|
||||
ms.Delay[i] = dailyStats[i].Delay
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) loadMonthlyStatusFromDB(today time.Time) {
|
||||
var mhs []model.ServiceHistory
|
||||
DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs)
|
||||
delayCount := make(map[uint64]map[int]int)
|
||||
for _, mh := range mhs {
|
||||
dayIndex := 28 - int(today.Sub(mh.CreatedAt).Hours())/24
|
||||
if dayIndex < 0 {
|
||||
continue
|
||||
}
|
||||
ms := ss.monthlyStatus[mh.ServiceID]
|
||||
if ms == nil {
|
||||
continue
|
||||
}
|
||||
if delayCount[mh.ServiceID] == nil {
|
||||
delayCount[mh.ServiceID] = make(map[int]int)
|
||||
}
|
||||
ms.Delay[dayIndex] = (ms.Delay[dayIndex]*float64(delayCount[mh.ServiceID][dayIndex]) + mh.AvgDelay) / float64(delayCount[mh.ServiceID][dayIndex]+1)
|
||||
delayCount[mh.ServiceID][dayIndex]++
|
||||
ms.Up[dayIndex] += mh.Up
|
||||
ms.TotalUp += mh.Up
|
||||
ms.Down[dayIndex] += mh.Down
|
||||
ms.TotalDown += mh.Down
|
||||
}
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) loadTodayStats(today time.Time) {
|
||||
if TSDBEnabled() {
|
||||
for serviceID, ms := range ss.monthlyStatus {
|
||||
result, err := TSDBShared.QueryServiceHistory(serviceID, tsdb.Period1Day)
|
||||
if err != nil {
|
||||
log.Printf("NEZHA>> Failed to load TSDB today stats for service %d: %v", serviceID, err)
|
||||
continue
|
||||
}
|
||||
var totalUp, totalDown uint64
|
||||
var totalDelay float64
|
||||
var delayCount int
|
||||
for _, serverStats := range result.Servers {
|
||||
totalUp += serverStats.Stats.TotalUp
|
||||
totalDown += serverStats.Stats.TotalDown
|
||||
if serverStats.Stats.AvgDelay > 0 {
|
||||
totalDelay += serverStats.Stats.AvgDelay
|
||||
delayCount++
|
||||
}
|
||||
}
|
||||
ss.serviceStatusToday[serviceID].Up = totalUp
|
||||
ss.serviceStatusToday[serviceID].Down = totalDown
|
||||
if delayCount > 0 {
|
||||
ss.serviceStatusToday[serviceID].Delay = totalDelay / float64(delayCount)
|
||||
}
|
||||
ms.TotalUp += totalUp
|
||||
ms.TotalDown += totalDown
|
||||
}
|
||||
} else {
|
||||
var mhs []model.ServiceHistory
|
||||
DB.Where("created_at >= ? AND server_id = 0", today).Find(&mhs)
|
||||
totalDelay := make(map[uint64]float64)
|
||||
totalDelayCount := make(map[uint64]int)
|
||||
for _, mh := range mhs {
|
||||
ss.serviceStatusToday[mh.ServiceID].Up += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
|
||||
ss.serviceStatusToday[mh.ServiceID].Down += mh.Down
|
||||
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
|
||||
totalDelay[mh.ServiceID] += mh.AvgDelay
|
||||
totalDelayCount[mh.ServiceID]++
|
||||
}
|
||||
for id, delay := range totalDelay {
|
||||
ss.serviceStatusToday[id].Delay = delay / float64(totalDelayCount[id])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) Update(m *model.Service) error {
|
||||
ss.serviceResponseDataStoreLock.Lock()
|
||||
defer ss.serviceResponseDataStoreLock.Unlock()
|
||||
@@ -266,9 +345,9 @@ func (ss *ServiceSentinel) Update(m *model.Service) error {
|
||||
ss.monthlyStatus[m.ID] = &serviceResponseItem{
|
||||
service: m,
|
||||
ServiceResponseItem: model.ServiceResponseItem{
|
||||
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Delay: &[30]float64{},
|
||||
Up: &[30]uint64{},
|
||||
Down: &[30]uint64{},
|
||||
},
|
||||
}
|
||||
if ss.serviceCurrentStatusData[m.ID] == nil {
|
||||
@@ -406,6 +485,7 @@ func (ss *ServiceSentinel) worker() {
|
||||
|
||||
mh := r.Data
|
||||
if mh.Type == model.TaskTypeTCPPing || mh.Type == model.TaskTypeICMPPing {
|
||||
// TCP/ICMP Ping 使用平均值计算后再写入
|
||||
serviceTcpMap, ok := ss.serviceResponsePing[mh.GetId()]
|
||||
if !ok {
|
||||
serviceTcpMap = make(map[uint64]*pingStore)
|
||||
@@ -416,28 +496,56 @@ func (ss *ServiceSentinel) worker() {
|
||||
ts = &pingStore{}
|
||||
}
|
||||
ts.count++
|
||||
ts.ping = (ts.ping*float32(ts.count-1) + mh.Delay) / float32(ts.count)
|
||||
ts.ping = (ts.ping*float64(ts.count-1) + float64(mh.Delay)) / float64(ts.count)
|
||||
if mh.Successful {
|
||||
ts.successCount++
|
||||
}
|
||||
if ts.count == Conf.AvgPingCount {
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: ts.ping,
|
||||
Data: mh.Data,
|
||||
ServerID: r.Reporter,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
if TSDBEnabled() {
|
||||
if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: mh.GetId(),
|
||||
ServerID: r.Reporter,
|
||||
Timestamp: time.Now(),
|
||||
Delay: ts.ping,
|
||||
Successful: ts.successCount*2 >= ts.count,
|
||||
}); err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err)
|
||||
}
|
||||
} else {
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: ts.ping,
|
||||
Data: mh.Data,
|
||||
ServerID: r.Reporter,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
}
|
||||
}
|
||||
ts.count = 0
|
||||
ts.ping = mh.Delay
|
||||
ts.ping = 0
|
||||
ts.successCount = 0
|
||||
}
|
||||
serviceTcpMap[r.Reporter] = ts
|
||||
} else {
|
||||
if TSDBEnabled() {
|
||||
if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: mh.GetId(),
|
||||
ServerID: r.Reporter,
|
||||
Timestamp: time.Now(),
|
||||
Delay: float64(mh.Delay),
|
||||
Successful: mh.Successful,
|
||||
}); err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ss.serviceResponseDataStoreLock.Lock()
|
||||
// 写入当天状态
|
||||
if mh.Successful {
|
||||
ss.serviceStatusToday[mh.GetId()].Delay = (ss.serviceStatusToday[mh.
|
||||
GetId()].Delay*float32(ss.serviceStatusToday[mh.GetId()].Up) +
|
||||
mh.Delay) / float32(ss.serviceStatusToday[mh.GetId()].Up+1)
|
||||
GetId()].Delay*float64(ss.serviceStatusToday[mh.GetId()].Up) +
|
||||
float64(mh.Delay)) / float64(ss.serviceStatusToday[mh.GetId()].Up+1)
|
||||
ss.serviceStatusToday[mh.GetId()].Up++
|
||||
} else {
|
||||
ss.serviceStatusToday[mh.GetId()].Down++
|
||||
@@ -463,7 +571,7 @@ func (ss *ServiceSentinel) worker() {
|
||||
rd := ss.serviceResponseDataStore[mh.GetId()]
|
||||
if cs.Successful {
|
||||
rd.Up++
|
||||
rd.Delay = (rd.Delay*float32(rd.Up-1) + cs.Delay) / float32(rd.Up)
|
||||
rd.Delay = (rd.Delay*float64(rd.Up-1) + float64(cs.Delay)) / float64(rd.Up)
|
||||
} else {
|
||||
rd.Down++
|
||||
}
|
||||
@@ -482,20 +590,20 @@ func (ss *ServiceSentinel) worker() {
|
||||
stateCode = GetStatusCode(upPercent)
|
||||
}
|
||||
|
||||
// 数据持久化
|
||||
if len(ss.serviceCurrentStatusData[mh.GetId()].result) == _CurrentStatusSize {
|
||||
ss.serviceCurrentStatusData[mh.GetId()].t = currentTime
|
||||
rd := ss.serviceResponseDataStore[mh.GetId()]
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: rd.Delay,
|
||||
Data: mh.Data,
|
||||
Up: rd.Up,
|
||||
Down: rd.Down,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
if !TSDBEnabled() {
|
||||
rd := ss.serviceResponseDataStore[mh.GetId()]
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: rd.Delay,
|
||||
Data: mh.Data,
|
||||
Up: rd.Up,
|
||||
Down: rd.Down,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
ss.serviceCurrentStatusData[mh.GetId()].result = ss.serviceCurrentStatusData[mh.GetId()].result[:0]
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user