mirror of
https://github.com/Buriburizaem0n/nezha_domains.git
synced 2026-05-06 13:48:52 +00:00
feat(v2.0.0): tsdb (#1162)
* feat: tsdb * fix(ci): remove --parseGoList=false from swag init to fix dependency resolution * fix(ci): fix swag init directory and temporary remove s390x support due to cgo issues * fix(ci): fix swag init output directory to cmd/dashboard/docs * fix(ci): set GOTOOLCHAIN=auto for gosec * feat: add system storage maintenance for SQLite and TSDB * shit * feat: add s390x support and improve service monitoring * ci: upgrade goreleaser-cross image to v1.25 * ci: add libzstd-dev:s390x for cross-compilation * ci: build libzstd for s390x from source * ci: add libzstd_linux_s390x.go for gozstd linking * ci: use vendor mode for s390x gozstd build * ci: clone zstd source for s390x build * refactor(tsdb): rename MaxDiskUsageGB to MinFreeDiskSpaceGB and optimize queries - Rename config to accurately reflect VictoriaMetrics behavior: minimum free disk space threshold - Add QueryServiceHistoryByServerID for batch query optimization - Fix hasStatus to avoid false status counting when only delay data exists - Fix service aggregation boundary: use successCount*2 >= count - Fix serviceID parsing with strconv.ParseUint error handling - Add TagFiltersCacheSize for better query performance * feat(api): add server metrics endpoint and simplify service history response - Add /server/:id/metrics API for querying TSDB server metrics - Simplify getServiceHistory by removing redundant data conversion - Change AvgDelay type from float32 to float64 - Remove generated swagger docs (to be regenerated) - Update TSDB query, writer and tests * chore: 临时禁用不支持前端 * ci: cache zstd build for s390x to speed up CI * fix(tsdb): fix race conditions, data correctness and optimize performance - Fix TOCTOU race between IsClosed() and write/query by holding RLock - Fix delay=0 excluded from stats by using hasDelay flag instead of value > 0 - Fix fmt.Sscanf -> strconv.ParseUint for server_id parsing with error logging - Fix buffer unbounded growth by flushing inside lock when over maxSize - Split makeMetricRow 
into makeServerMetricRow/makeServiceMetricRow - Extract InitGlobalSettings() from Open() for VictoriaMetrics globals - Remove redundant instance/GetInstance/SetInstance singleton - Add error logging for silently skipped block decode errors - Optimize WriteBatch* to build all rows in single write call - Optimize downsample to use linear scan instead of map for sorted data - Optimize query slice reuse across block iterations * 服务添加DisplayIndex (#1166) * 服务添加DisplayIndex * 根据ai建议修改 --------- Co-authored-by: huYang <306061454@qq.com> * fix(tsdb): restore SQLite fallback and monthly status reload on restart - Restore ServiceHistory model and SQLite write fallback when TSDB is disabled - Reload monthlyStatus (30-day) and serviceStatusToday from TSDB/SQLite on startup - Add SQLite fallback query for /service/:id/history and /server/:id/service - Remove breaking GET /service/:id endpoint, keep /service/:id/history only - Add QueryServiceDailyStats to TSDB for per-day aggregation - Add tests for monthly status and today stats loading from both TSDB and SQLite - Migrate ServiceHistory table only when TSDB is disabled * ci: exclude false-positive gosec rules G117, G703, G704 * feat(api): expose tsdb_enabled in setting response * ci: restore G115 exclusion accidentally dropped in previous commit * fix: update version numbers for OfficialAdmin and Official templates * chore: upgrade frontend * chore: upgrade frontend --------- Co-authored-by: 胡说丷刂 <34758853+laosan-xx@users.noreply.github.com> Co-authored-by: huYang <306061454@qq.com>
This commit is contained in:
@@ -12,6 +12,7 @@ import (
|
||||
"github.com/jinzhu/copier"
|
||||
geoipx "github.com/nezhahq/nezha/pkg/geoip"
|
||||
"github.com/nezhahq/nezha/pkg/grpcx"
|
||||
"github.com/nezhahq/nezha/pkg/tsdb"
|
||||
|
||||
"github.com/nezhahq/nezha/model"
|
||||
pb "github.com/nezhahq/nezha/proto"
|
||||
@@ -114,6 +115,44 @@ func (s *NezhaHandler) ReportSystemState(stream pb.NezhaService_ReportSystemStat
|
||||
server.LastActive = time.Now()
|
||||
server.State = &innerState
|
||||
|
||||
if singleton.TSDBEnabled() {
|
||||
maxTemp := 0.0
|
||||
for _, t := range innerState.Temperatures {
|
||||
if t.Temperature > maxTemp {
|
||||
maxTemp = t.Temperature
|
||||
}
|
||||
}
|
||||
maxGPU := 0.0
|
||||
for _, g := range innerState.GPU {
|
||||
if g > maxGPU {
|
||||
maxGPU = g
|
||||
}
|
||||
}
|
||||
if err := singleton.TSDBShared.WriteServerMetrics(&tsdb.ServerMetrics{
|
||||
ServerID: clientID,
|
||||
Timestamp: time.Now(),
|
||||
CPU: innerState.CPU,
|
||||
MemUsed: innerState.MemUsed,
|
||||
SwapUsed: innerState.SwapUsed,
|
||||
DiskUsed: innerState.DiskUsed,
|
||||
NetInSpeed: innerState.NetInSpeed,
|
||||
NetOutSpeed: innerState.NetOutSpeed,
|
||||
NetInTransfer: innerState.NetInTransfer,
|
||||
NetOutTransfer: innerState.NetOutTransfer,
|
||||
Load1: innerState.Load1,
|
||||
Load5: innerState.Load5,
|
||||
Load15: innerState.Load15,
|
||||
TCPConnCount: innerState.TcpConnCount,
|
||||
UDPConnCount: innerState.UdpConnCount,
|
||||
ProcessCount: innerState.ProcessCount,
|
||||
Temperature: maxTemp,
|
||||
Uptime: innerState.Uptime,
|
||||
GPU: maxGPU,
|
||||
}); err != nil {
|
||||
log.Printf("NEZHA>> Failed to write server metrics to TSDB: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 应对 dashboard / agent 重启的情况,如果从未记录过,先打点,等到小时时间点时入库
|
||||
if server.PrevTransferInSnapshot == 0 || server.PrevTransferOutSnapshot == 0 {
|
||||
server.PrevTransferInSnapshot = state.NetInTransfer
|
||||
|
||||
@@ -2,14 +2,14 @@
|
||||
name: "OfficialAdmin"
|
||||
repository: "https://github.com/nezhahq/admin-frontend"
|
||||
author: "nezhahq"
|
||||
version: "v1.14.7"
|
||||
version: "v2.0.3"
|
||||
is_admin: true
|
||||
is_official: true
|
||||
- path: "user-dist"
|
||||
name: "Official"
|
||||
repository: "https://github.com/hamster1963/nezha-dash-v1"
|
||||
author: "hamster1963"
|
||||
version: "v1.33.0"
|
||||
version: "v2.0.0"
|
||||
is_official: true
|
||||
- path: "nazhua-dist"
|
||||
name: "Nazhua"
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"golang.org/x/exp/constraints"
|
||||
|
||||
"github.com/nezhahq/nezha/model"
|
||||
"github.com/nezhahq/nezha/pkg/tsdb"
|
||||
"github.com/nezhahq/nezha/pkg/utils"
|
||||
pb "github.com/nezhahq/nezha/proto"
|
||||
)
|
||||
@@ -39,7 +40,7 @@ type ReportData struct {
|
||||
type _TodayStatsOfService struct {
|
||||
Up uint64 // 今日在线计数
|
||||
Down uint64 // 今日离线计数
|
||||
Delay float32 // 今日平均延迟
|
||||
Delay float64 // 今日平均延迟
|
||||
}
|
||||
|
||||
type serviceResponseData = _TodayStatsOfService
|
||||
@@ -51,8 +52,9 @@ type serviceTaskStatus struct {
|
||||
}
|
||||
|
||||
type pingStore struct {
|
||||
count int
|
||||
ping float32
|
||||
count int
|
||||
ping float64
|
||||
successCount int
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -108,23 +110,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, Loc)
|
||||
|
||||
var mhs []model.ServiceHistory
|
||||
// 加载当日记录
|
||||
DB.Where("created_at >= ? AND server_id = 0", today).Find(&mhs)
|
||||
totalDelay := make(map[uint64]float32)
|
||||
totalDelayCount := make(map[uint64]float32)
|
||||
for _, mh := range mhs {
|
||||
totalDelay[mh.ServiceID] += mh.AvgDelay
|
||||
totalDelayCount[mh.ServiceID]++
|
||||
ss.serviceStatusToday[mh.ServiceID].Up += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
|
||||
ss.serviceStatusToday[mh.ServiceID].Down += mh.Down
|
||||
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
|
||||
}
|
||||
for id, delay := range totalDelay {
|
||||
ss.serviceStatusToday[id].Delay = delay / float32(totalDelayCount[id])
|
||||
}
|
||||
ss.loadTodayStats(today)
|
||||
|
||||
// 启动服务监控器
|
||||
go ss.worker()
|
||||
@@ -135,6 +121,12 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// 每周日凌晨 4:00 执行系统存储维护
|
||||
_, err = CronShared.AddFunc("0 0 4 * * 0", PerformMaintenance)
|
||||
if err != nil {
|
||||
log.Printf("NEZHA>> Warning: failed to schedule maintenance task: %v", err)
|
||||
}
|
||||
|
||||
return ss, nil
|
||||
}
|
||||
|
||||
@@ -171,6 +163,16 @@ func (ss *ServiceSentinel) Dispatch(r ReportData) {
|
||||
ss.serviceReportChannel <- r
|
||||
}
|
||||
|
||||
// sortServices 按 DisplayIndex 降序、ID 升序排列服务列表
|
||||
func sortServices(services []*model.Service) {
|
||||
slices.SortFunc(services, func(a, b *model.Service) int {
|
||||
if a.DisplayIndex != b.DisplayIndex {
|
||||
return cmp.Compare(b.DisplayIndex, a.DisplayIndex)
|
||||
}
|
||||
return cmp.Compare(a.ID, b.ID)
|
||||
})
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) UpdateServiceList() {
|
||||
ss.servicesLock.RLock()
|
||||
defer ss.servicesLock.RUnlock()
|
||||
@@ -179,9 +181,7 @@ func (ss *ServiceSentinel) UpdateServiceList() {
|
||||
defer ss.serviceListLock.Unlock()
|
||||
|
||||
ss.serviceList = utils.MapValuesToSlice(ss.services)
|
||||
slices.SortFunc(ss.serviceList, func(a, b *model.Service) int {
|
||||
return cmp.Compare(a.ID, b.ID)
|
||||
})
|
||||
sortServices(ss.serviceList)
|
||||
}
|
||||
|
||||
// loadServiceHistory 加载服务监控器的历史状态信息
|
||||
@@ -207,6 +207,7 @@ func (ss *ServiceSentinel) loadServiceHistory() error {
|
||||
ss.serviceStatusToday[service.ID] = &_TodayStatsOfService{}
|
||||
}
|
||||
ss.serviceList = services
|
||||
sortServices(ss.serviceList)
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, Loc)
|
||||
@@ -215,33 +216,111 @@ func (ss *ServiceSentinel) loadServiceHistory() error {
|
||||
ss.monthlyStatus[service.ID] = &serviceResponseItem{
|
||||
service: service,
|
||||
ServiceResponseItem: model.ServiceResponseItem{
|
||||
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Delay: &[30]float64{},
|
||||
Up: &[30]uint64{},
|
||||
Down: &[30]uint64{},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// 加载服务监控历史记录
|
||||
var mhs []model.ServiceHistory
|
||||
DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs)
|
||||
var delayCount = make(map[int]int)
|
||||
for _, mh := range mhs {
|
||||
dayIndex := 28 - (int(today.Sub(mh.CreatedAt).Hours()) / 24)
|
||||
if dayIndex < 0 {
|
||||
continue
|
||||
}
|
||||
ss.monthlyStatus[mh.ServiceID].Delay[dayIndex] = (ss.monthlyStatus[mh.ServiceID].Delay[dayIndex]*float32(delayCount[dayIndex]) + mh.AvgDelay) / float32(delayCount[dayIndex]+1)
|
||||
delayCount[dayIndex]++
|
||||
ss.monthlyStatus[mh.ServiceID].Up[dayIndex] += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].Down[dayIndex] += mh.Down
|
||||
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
|
||||
if TSDBEnabled() {
|
||||
ss.loadMonthlyStatusFromTSDB(services, today)
|
||||
} else {
|
||||
ss.loadMonthlyStatusFromDB(today)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) loadMonthlyStatusFromTSDB(services []*model.Service, today time.Time) {
|
||||
for _, service := range services {
|
||||
dailyStats, err := TSDBShared.QueryServiceDailyStats(service.ID, today, 30)
|
||||
if err != nil {
|
||||
log.Printf("NEZHA>> Failed to load TSDB history for service %d: %v", service.ID, err)
|
||||
continue
|
||||
}
|
||||
ms := ss.monthlyStatus[service.ID]
|
||||
for i := 0; i < 29; i++ {
|
||||
ms.Up[i] = dailyStats[i].Up
|
||||
ms.TotalUp += dailyStats[i].Up
|
||||
ms.Down[i] = dailyStats[i].Down
|
||||
ms.TotalDown += dailyStats[i].Down
|
||||
ms.Delay[i] = dailyStats[i].Delay
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) loadMonthlyStatusFromDB(today time.Time) {
|
||||
var mhs []model.ServiceHistory
|
||||
DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs)
|
||||
delayCount := make(map[uint64]map[int]int)
|
||||
for _, mh := range mhs {
|
||||
dayIndex := 28 - int(today.Sub(mh.CreatedAt).Hours())/24
|
||||
if dayIndex < 0 {
|
||||
continue
|
||||
}
|
||||
ms := ss.monthlyStatus[mh.ServiceID]
|
||||
if ms == nil {
|
||||
continue
|
||||
}
|
||||
if delayCount[mh.ServiceID] == nil {
|
||||
delayCount[mh.ServiceID] = make(map[int]int)
|
||||
}
|
||||
ms.Delay[dayIndex] = (ms.Delay[dayIndex]*float64(delayCount[mh.ServiceID][dayIndex]) + mh.AvgDelay) / float64(delayCount[mh.ServiceID][dayIndex]+1)
|
||||
delayCount[mh.ServiceID][dayIndex]++
|
||||
ms.Up[dayIndex] += mh.Up
|
||||
ms.TotalUp += mh.Up
|
||||
ms.Down[dayIndex] += mh.Down
|
||||
ms.TotalDown += mh.Down
|
||||
}
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) loadTodayStats(today time.Time) {
|
||||
if TSDBEnabled() {
|
||||
for serviceID, ms := range ss.monthlyStatus {
|
||||
result, err := TSDBShared.QueryServiceHistory(serviceID, tsdb.Period1Day)
|
||||
if err != nil {
|
||||
log.Printf("NEZHA>> Failed to load TSDB today stats for service %d: %v", serviceID, err)
|
||||
continue
|
||||
}
|
||||
var totalUp, totalDown uint64
|
||||
var totalDelay float64
|
||||
var delayCount int
|
||||
for _, serverStats := range result.Servers {
|
||||
totalUp += serverStats.Stats.TotalUp
|
||||
totalDown += serverStats.Stats.TotalDown
|
||||
if serverStats.Stats.AvgDelay > 0 {
|
||||
totalDelay += serverStats.Stats.AvgDelay
|
||||
delayCount++
|
||||
}
|
||||
}
|
||||
ss.serviceStatusToday[serviceID].Up = totalUp
|
||||
ss.serviceStatusToday[serviceID].Down = totalDown
|
||||
if delayCount > 0 {
|
||||
ss.serviceStatusToday[serviceID].Delay = totalDelay / float64(delayCount)
|
||||
}
|
||||
ms.TotalUp += totalUp
|
||||
ms.TotalDown += totalDown
|
||||
}
|
||||
} else {
|
||||
var mhs []model.ServiceHistory
|
||||
DB.Where("created_at >= ? AND server_id = 0", today).Find(&mhs)
|
||||
totalDelay := make(map[uint64]float64)
|
||||
totalDelayCount := make(map[uint64]int)
|
||||
for _, mh := range mhs {
|
||||
ss.serviceStatusToday[mh.ServiceID].Up += mh.Up
|
||||
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
|
||||
ss.serviceStatusToday[mh.ServiceID].Down += mh.Down
|
||||
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
|
||||
totalDelay[mh.ServiceID] += mh.AvgDelay
|
||||
totalDelayCount[mh.ServiceID]++
|
||||
}
|
||||
for id, delay := range totalDelay {
|
||||
ss.serviceStatusToday[id].Delay = delay / float64(totalDelayCount[id])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ss *ServiceSentinel) Update(m *model.Service) error {
|
||||
ss.serviceResponseDataStoreLock.Lock()
|
||||
defer ss.serviceResponseDataStoreLock.Unlock()
|
||||
@@ -266,9 +345,9 @@ func (ss *ServiceSentinel) Update(m *model.Service) error {
|
||||
ss.monthlyStatus[m.ID] = &serviceResponseItem{
|
||||
service: m,
|
||||
ServiceResponseItem: model.ServiceResponseItem{
|
||||
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Delay: &[30]float64{},
|
||||
Up: &[30]uint64{},
|
||||
Down: &[30]uint64{},
|
||||
},
|
||||
}
|
||||
if ss.serviceCurrentStatusData[m.ID] == nil {
|
||||
@@ -406,6 +485,7 @@ func (ss *ServiceSentinel) worker() {
|
||||
|
||||
mh := r.Data
|
||||
if mh.Type == model.TaskTypeTCPPing || mh.Type == model.TaskTypeICMPPing {
|
||||
// TCP/ICMP Ping 使用平均值计算后再写入
|
||||
serviceTcpMap, ok := ss.serviceResponsePing[mh.GetId()]
|
||||
if !ok {
|
||||
serviceTcpMap = make(map[uint64]*pingStore)
|
||||
@@ -416,28 +496,56 @@ func (ss *ServiceSentinel) worker() {
|
||||
ts = &pingStore{}
|
||||
}
|
||||
ts.count++
|
||||
ts.ping = (ts.ping*float32(ts.count-1) + mh.Delay) / float32(ts.count)
|
||||
ts.ping = (ts.ping*float64(ts.count-1) + float64(mh.Delay)) / float64(ts.count)
|
||||
if mh.Successful {
|
||||
ts.successCount++
|
||||
}
|
||||
if ts.count == Conf.AvgPingCount {
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: ts.ping,
|
||||
Data: mh.Data,
|
||||
ServerID: r.Reporter,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
if TSDBEnabled() {
|
||||
if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: mh.GetId(),
|
||||
ServerID: r.Reporter,
|
||||
Timestamp: time.Now(),
|
||||
Delay: ts.ping,
|
||||
Successful: ts.successCount*2 >= ts.count,
|
||||
}); err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err)
|
||||
}
|
||||
} else {
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: ts.ping,
|
||||
Data: mh.Data,
|
||||
ServerID: r.Reporter,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
}
|
||||
}
|
||||
ts.count = 0
|
||||
ts.ping = mh.Delay
|
||||
ts.ping = 0
|
||||
ts.successCount = 0
|
||||
}
|
||||
serviceTcpMap[r.Reporter] = ts
|
||||
} else {
|
||||
if TSDBEnabled() {
|
||||
if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: mh.GetId(),
|
||||
ServerID: r.Reporter,
|
||||
Timestamp: time.Now(),
|
||||
Delay: float64(mh.Delay),
|
||||
Successful: mh.Successful,
|
||||
}); err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ss.serviceResponseDataStoreLock.Lock()
|
||||
// 写入当天状态
|
||||
if mh.Successful {
|
||||
ss.serviceStatusToday[mh.GetId()].Delay = (ss.serviceStatusToday[mh.
|
||||
GetId()].Delay*float32(ss.serviceStatusToday[mh.GetId()].Up) +
|
||||
mh.Delay) / float32(ss.serviceStatusToday[mh.GetId()].Up+1)
|
||||
GetId()].Delay*float64(ss.serviceStatusToday[mh.GetId()].Up) +
|
||||
float64(mh.Delay)) / float64(ss.serviceStatusToday[mh.GetId()].Up+1)
|
||||
ss.serviceStatusToday[mh.GetId()].Up++
|
||||
} else {
|
||||
ss.serviceStatusToday[mh.GetId()].Down++
|
||||
@@ -463,7 +571,7 @@ func (ss *ServiceSentinel) worker() {
|
||||
rd := ss.serviceResponseDataStore[mh.GetId()]
|
||||
if cs.Successful {
|
||||
rd.Up++
|
||||
rd.Delay = (rd.Delay*float32(rd.Up-1) + cs.Delay) / float32(rd.Up)
|
||||
rd.Delay = (rd.Delay*float64(rd.Up-1) + float64(cs.Delay)) / float64(rd.Up)
|
||||
} else {
|
||||
rd.Down++
|
||||
}
|
||||
@@ -482,20 +590,20 @@ func (ss *ServiceSentinel) worker() {
|
||||
stateCode = GetStatusCode(upPercent)
|
||||
}
|
||||
|
||||
// 数据持久化
|
||||
if len(ss.serviceCurrentStatusData[mh.GetId()].result) == _CurrentStatusSize {
|
||||
ss.serviceCurrentStatusData[mh.GetId()].t = currentTime
|
||||
rd := ss.serviceResponseDataStore[mh.GetId()]
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: rd.Delay,
|
||||
Data: mh.Data,
|
||||
Up: rd.Up,
|
||||
Down: rd.Down,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
if !TSDBEnabled() {
|
||||
rd := ss.serviceResponseDataStore[mh.GetId()]
|
||||
if err := DB.Create(&model.ServiceHistory{
|
||||
ServiceID: mh.GetId(),
|
||||
AvgDelay: rd.Delay,
|
||||
Data: mh.Data,
|
||||
Up: rd.Up,
|
||||
Down: rd.Down,
|
||||
}).Error; err != nil {
|
||||
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
ss.serviceCurrentStatusData[mh.GetId()].result = ss.serviceCurrentStatusData[mh.GetId()].result[:0]
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,336 @@
|
||||
package singleton
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"gorm.io/driver/sqlite"
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/nezhahq/nezha/model"
|
||||
"github.com/nezhahq/nezha/pkg/tsdb"
|
||||
)
|
||||
|
||||
func newTestSentinel(serviceIDs []uint64) *ServiceSentinel {
|
||||
ss := &ServiceSentinel{
|
||||
serviceStatusToday: make(map[uint64]*_TodayStatsOfService),
|
||||
monthlyStatus: make(map[uint64]*serviceResponseItem),
|
||||
}
|
||||
for _, id := range serviceIDs {
|
||||
ss.serviceStatusToday[id] = &_TodayStatsOfService{}
|
||||
ss.monthlyStatus[id] = &serviceResponseItem{
|
||||
service: &model.Service{Common: model.Common{ID: id}},
|
||||
ServiceResponseItem: model.ServiceResponseItem{
|
||||
Delay: &[30]float64{},
|
||||
Up: &[30]uint64{},
|
||||
Down: &[30]uint64{},
|
||||
},
|
||||
}
|
||||
}
|
||||
return ss
|
||||
}
|
||||
|
||||
func setupTestDB(t *testing.T) func() {
|
||||
t.Helper()
|
||||
var err error
|
||||
DB, err = gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, DB.AutoMigrate(model.ServiceHistory{}))
|
||||
return func() { DB = nil }
|
||||
}
|
||||
|
||||
func setupTestTSDB(t *testing.T) (*tsdb.TSDB, func()) {
|
||||
t.Helper()
|
||||
tempDir, err := os.MkdirTemp("", "tsdb_sentinel_test")
|
||||
require.NoError(t, err)
|
||||
config := &tsdb.Config{
|
||||
DataPath: filepath.Join(tempDir, "tsdb"),
|
||||
RetentionDays: 30,
|
||||
MinFreeDiskSpaceGB: 1,
|
||||
DedupInterval: time.Second,
|
||||
}
|
||||
db, err := tsdb.Open(config)
|
||||
require.NoError(t, err)
|
||||
TSDBShared = db
|
||||
return db, func() {
|
||||
db.Close()
|
||||
TSDBShared = nil
|
||||
os.RemoveAll(tempDir)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadMonthlyStatusFromDB(t *testing.T) {
|
||||
cleanup := setupTestDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
serviceID := uint64(1)
|
||||
ss := newTestSentinel([]uint64{serviceID})
|
||||
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 0,
|
||||
AvgDelay: 10.0,
|
||||
Up: 5,
|
||||
Down: 1,
|
||||
CreatedAt: today.Add(-25 * time.Hour),
|
||||
})
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 0,
|
||||
AvgDelay: 20.0,
|
||||
Up: 3,
|
||||
Down: 2,
|
||||
CreatedAt: today.Add(-25 * time.Hour),
|
||||
})
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 0,
|
||||
AvgDelay: 30.0,
|
||||
Up: 10,
|
||||
Down: 0,
|
||||
CreatedAt: today.Add(-49 * time.Hour),
|
||||
})
|
||||
|
||||
ss.loadMonthlyStatusFromDB(today)
|
||||
|
||||
ms := ss.monthlyStatus[serviceID]
|
||||
|
||||
// day -1: index 27, two records with AvgDelay 10 and 20
|
||||
assert.InDelta(t, 15.0, ms.Delay[27], 0.01)
|
||||
assert.Equal(t, uint64(8), ms.Up[27])
|
||||
assert.Equal(t, uint64(3), ms.Down[27])
|
||||
|
||||
// day -2: index 26
|
||||
assert.InDelta(t, 30.0, ms.Delay[26], 0.01)
|
||||
assert.Equal(t, uint64(10), ms.Up[26])
|
||||
assert.Equal(t, uint64(0), ms.Down[26])
|
||||
|
||||
// totals
|
||||
assert.Equal(t, uint64(18), ms.TotalUp)
|
||||
assert.Equal(t, uint64(3), ms.TotalDown)
|
||||
|
||||
// today (index 29) should be untouched
|
||||
assert.Equal(t, float64(0), ms.Delay[29])
|
||||
assert.Equal(t, uint64(0), ms.Up[29])
|
||||
}
|
||||
|
||||
func TestLoadMonthlyStatusFromDB_IgnoresToday(t *testing.T) {
|
||||
cleanup := setupTestDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
serviceID := uint64(1)
|
||||
ss := newTestSentinel([]uint64{serviceID})
|
||||
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 0,
|
||||
AvgDelay: 50.0,
|
||||
Up: 100,
|
||||
Down: 5,
|
||||
CreatedAt: today.Add(2 * time.Hour),
|
||||
})
|
||||
|
||||
ss.loadMonthlyStatusFromDB(today)
|
||||
|
||||
ms := ss.monthlyStatus[serviceID]
|
||||
assert.Equal(t, uint64(0), ms.TotalUp)
|
||||
assert.Equal(t, uint64(0), ms.TotalDown)
|
||||
}
|
||||
|
||||
func TestLoadMonthlyStatusFromDB_UnknownServiceIgnored(t *testing.T) {
|
||||
cleanup := setupTestDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
ss := newTestSentinel([]uint64{1})
|
||||
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: 999,
|
||||
ServerID: 0,
|
||||
AvgDelay: 10.0,
|
||||
Up: 5,
|
||||
Down: 1,
|
||||
CreatedAt: today.Add(-25 * time.Hour),
|
||||
})
|
||||
|
||||
ss.loadMonthlyStatusFromDB(today)
|
||||
|
||||
ms := ss.monthlyStatus[uint64(1)]
|
||||
assert.Equal(t, uint64(0), ms.TotalUp)
|
||||
}
|
||||
|
||||
func TestLoadTodayStatsFromDB(t *testing.T) {
|
||||
cleanup := setupTestDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
serviceID := uint64(1)
|
||||
ss := newTestSentinel([]uint64{serviceID})
|
||||
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 0,
|
||||
AvgDelay: 10.0,
|
||||
Up: 5,
|
||||
Down: 1,
|
||||
CreatedAt: today.Add(1 * time.Hour),
|
||||
})
|
||||
DB.Create(&model.ServiceHistory{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 0,
|
||||
AvgDelay: 30.0,
|
||||
Up: 3,
|
||||
Down: 2,
|
||||
CreatedAt: today.Add(2 * time.Hour),
|
||||
})
|
||||
|
||||
ss.loadTodayStats(today)
|
||||
|
||||
st := ss.serviceStatusToday[serviceID]
|
||||
assert.Equal(t, uint64(8), st.Up)
|
||||
assert.Equal(t, uint64(3), st.Down)
|
||||
assert.InDelta(t, 20.0, st.Delay, 0.01)
|
||||
|
||||
ms := ss.monthlyStatus[serviceID]
|
||||
assert.Equal(t, uint64(8), ms.TotalUp)
|
||||
assert.Equal(t, uint64(3), ms.TotalDown)
|
||||
}
|
||||
|
||||
func TestLoadMonthlyStatusFromTSDB(t *testing.T) {
|
||||
db, cleanup := setupTestTSDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
serviceID := uint64(1)
|
||||
services := []*model.Service{{Common: model.Common{ID: serviceID}}}
|
||||
ss := newTestSentinel([]uint64{serviceID})
|
||||
|
||||
yesterday := today.Add(-25 * time.Hour)
|
||||
for i := 0; i < 5; i++ {
|
||||
ts := yesterday.Add(time.Duration(i) * time.Minute)
|
||||
require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 1,
|
||||
Timestamp: ts,
|
||||
Delay: float64(10 + i),
|
||||
Successful: true,
|
||||
}))
|
||||
}
|
||||
for i := 0; i < 3; i++ {
|
||||
require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 1,
|
||||
Timestamp: yesterday.Add(time.Duration(i+10) * time.Minute),
|
||||
Delay: float64(20 + i),
|
||||
Successful: false,
|
||||
}))
|
||||
}
|
||||
|
||||
db.Flush()
|
||||
|
||||
ss.loadMonthlyStatusFromTSDB(services, today)
|
||||
|
||||
ms := ss.monthlyStatus[serviceID]
|
||||
// day -1: dayIndex 28
|
||||
assert.Equal(t, uint64(5), ms.Up[28])
|
||||
assert.Equal(t, uint64(3), ms.Down[28])
|
||||
assert.Equal(t, uint64(5), ms.TotalUp)
|
||||
assert.Equal(t, uint64(3), ms.TotalDown)
|
||||
assert.Greater(t, ms.Delay[28], float64(0))
|
||||
|
||||
// today (index 29) should be untouched
|
||||
assert.Equal(t, uint64(0), ms.Up[29])
|
||||
}
|
||||
|
||||
func TestLoadTodayStatsFromTSDB(t *testing.T) {
|
||||
db, cleanup := setupTestTSDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
serviceID := uint64(1)
|
||||
ss := newTestSentinel([]uint64{serviceID})
|
||||
|
||||
now := time.Now()
|
||||
for i := 0; i < 4; i++ {
|
||||
ts := now.Add(-time.Duration(i) * time.Minute)
|
||||
require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 1,
|
||||
Timestamp: ts,
|
||||
Delay: float64(10 + i),
|
||||
Successful: true,
|
||||
}))
|
||||
}
|
||||
for i := 0; i < 2; i++ {
|
||||
ts := now.Add(-time.Duration(i+10) * time.Minute)
|
||||
require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 1,
|
||||
Timestamp: ts,
|
||||
Delay: 0,
|
||||
Successful: false,
|
||||
}))
|
||||
}
|
||||
|
||||
db.Flush()
|
||||
|
||||
ss.loadTodayStats(today)
|
||||
|
||||
st := ss.serviceStatusToday[serviceID]
|
||||
assert.Greater(t, st.Up, uint64(0))
|
||||
assert.Greater(t, st.Down, uint64(0))
|
||||
|
||||
ms := ss.monthlyStatus[serviceID]
|
||||
assert.Equal(t, st.Up, ms.TotalUp)
|
||||
assert.Equal(t, st.Down, ms.TotalDown)
|
||||
}
|
||||
|
||||
func TestLoadMonthlyStatusFromTSDB_NoDoubleCountToday(t *testing.T) {
|
||||
db, cleanup := setupTestTSDB(t)
|
||||
defer cleanup()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
|
||||
|
||||
serviceID := uint64(1)
|
||||
services := []*model.Service{{Common: model.Common{ID: serviceID}}}
|
||||
ss := newTestSentinel([]uint64{serviceID})
|
||||
|
||||
now := time.Now()
|
||||
for i := 0; i < 5; i++ {
|
||||
require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
|
||||
ServiceID: serviceID,
|
||||
ServerID: 1,
|
||||
Timestamp: now.Add(-time.Duration(i) * time.Minute),
|
||||
Delay: 10.0,
|
||||
Successful: true,
|
||||
}))
|
||||
}
|
||||
db.Flush()
|
||||
|
||||
ss.loadMonthlyStatusFromTSDB(services, today)
|
||||
totalAfterMonthly := ss.monthlyStatus[serviceID].TotalUp
|
||||
|
||||
ss.loadTodayStats(today)
|
||||
totalAfterToday := ss.monthlyStatus[serviceID].TotalUp
|
||||
|
||||
assert.Equal(t, totalAfterMonthly+ss.serviceStatusToday[serviceID].Up, totalAfterToday)
|
||||
}
|
||||
@@ -87,12 +87,13 @@ func InitDBFromPath(path string) error {
|
||||
}
|
||||
err = DB.AutoMigrate(model.Server{}, model.User{}, model.ServerGroup{}, model.NotificationGroup{},
|
||||
model.Notification{}, model.AlertRule{}, model.Service{}, model.NotificationGroupNotification{},
|
||||
model.ServiceHistory{}, model.Cron{}, model.Transfer{}, model.ServerGroupServer{},
|
||||
model.Cron{}, model.Transfer{}, model.ServerGroupServer{},
|
||||
model.NAT{}, model.DDNSProfile{}, model.NotificationGroupNotification{},
|
||||
model.WAF{}, model.Oauth2Bind{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -130,14 +131,9 @@ func RecordTransferHourlyUsage(servers ...*model.Server) {
|
||||
log.Printf("NEZHA>> Saved traffic metrics to database. Affected %d row(s), Error: %v", len(txs), DB.Create(txs).Error)
|
||||
}
|
||||
|
||||
// CleanServiceHistory 清理无效或过时的 监控记录 和 流量记录
|
||||
func CleanServiceHistory() {
|
||||
// 清理已被删除的服务器的监控记录与流量记录
|
||||
DB.Unscoped().Delete(&model.ServiceHistory{}, "created_at < ? OR service_id NOT IN (SELECT `id` FROM services)", time.Now().AddDate(0, 0, -30))
|
||||
// 由于网络监控记录的数据较多,并且前端仅使用了 1 天的数据
|
||||
// 考虑到 sqlite 数据量问题,仅保留一天数据,
|
||||
// server_id = 0 的数据会用于/service页面的可用性展示
|
||||
DB.Unscoped().Delete(&model.ServiceHistory{}, "(created_at < ? AND server_id != 0) OR service_id NOT IN (SELECT `id` FROM services)", time.Now().AddDate(0, 0, -1))
|
||||
// CleanMonitorHistory 清理流量记录(TSDB 有自己的保留策略)
|
||||
func CleanMonitorHistory() {
|
||||
// 清理已被删除的服务器的流量记录
|
||||
DB.Unscoped().Delete(&model.Transfer{}, "server_id NOT IN (SELECT `id` FROM servers)")
|
||||
// 计算可清理流量记录的时长
|
||||
var allServerKeep time.Time
|
||||
@@ -179,6 +175,28 @@ func CleanServiceHistory() {
|
||||
}
|
||||
}
|
||||
|
||||
// PerformMaintenance 执行系统维护(SQLite VACUUM 和 TSDB 维护)
|
||||
func PerformMaintenance() {
|
||||
log.Println("NEZHA>> Starting system maintenance...")
|
||||
|
||||
// 1. SQLite 维护
|
||||
if DB != nil {
|
||||
log.Println("NEZHA>> SQLite: Starting VACUUM...")
|
||||
if err := DB.Exec("VACUUM").Error; err != nil {
|
||||
log.Printf("NEZHA>> SQLite: VACUUM failed: %v", err)
|
||||
} else {
|
||||
log.Println("NEZHA>> SQLite: VACUUM completed")
|
||||
}
|
||||
}
|
||||
|
||||
// 2. TSDB 维护
|
||||
if TSDBEnabled() {
|
||||
TSDBShared.Maintenance()
|
||||
}
|
||||
|
||||
log.Println("NEZHA>> System maintenance completed")
|
||||
}
|
||||
|
||||
// IPDesensitize 根据设置选择是否对IP进行打码处理 返回处理后的IP(关闭打码则返回原IP)
|
||||
func IPDesensitize(ip string) string {
|
||||
if Conf.EnablePlainIPInNotification {
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
package singleton
|
||||
|
||||
import (
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/nezhahq/nezha/model"
|
||||
"github.com/nezhahq/nezha/pkg/tsdb"
|
||||
)
|
||||
|
||||
// TSDBShared is the package-wide TSDB handle. It is assigned by InitTSDB and
// stays nil when TSDB is disabled; check TSDBEnabled before using it.
var TSDBShared *tsdb.TSDB
|
||||
|
||||
func InitTSDB() error {
|
||||
config := &tsdb.Config{
|
||||
RetentionDays: 30,
|
||||
MinFreeDiskSpaceGB: 1,
|
||||
MaxMemoryMB: 256,
|
||||
}
|
||||
|
||||
if Conf.TSDB.DataPath != "" {
|
||||
config.DataPath = Conf.TSDB.DataPath
|
||||
}
|
||||
if Conf.TSDB.RetentionDays > 0 {
|
||||
config.RetentionDays = Conf.TSDB.RetentionDays
|
||||
}
|
||||
if Conf.TSDB.MinFreeDiskSpaceGB > 0 {
|
||||
config.MinFreeDiskSpaceGB = Conf.TSDB.MinFreeDiskSpaceGB
|
||||
}
|
||||
if Conf.TSDB.MaxMemoryMB > 0 {
|
||||
config.MaxMemoryMB = Conf.TSDB.MaxMemoryMB
|
||||
}
|
||||
if Conf.TSDB.WriteBufferSize > 0 {
|
||||
config.WriteBufferSize = Conf.TSDB.WriteBufferSize
|
||||
}
|
||||
if Conf.TSDB.WriteBufferFlushInterval > 0 {
|
||||
config.WriteBufferFlushInterval = time.Duration(Conf.TSDB.WriteBufferFlushInterval) * time.Second
|
||||
}
|
||||
|
||||
if !config.Enabled() {
|
||||
log.Println("NEZHA>> TSDB is disabled (tsdb.data_path not configured)")
|
||||
if DB != nil {
|
||||
return DB.AutoMigrate(model.ServiceHistory{})
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
var err error
|
||||
TSDBShared, err = tsdb.Open(config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Println("NEZHA>> TSDB initialized successfully")
|
||||
|
||||
if DB != nil && DB.Migrator().HasTable("service_histories") {
|
||||
log.Println("NEZHA>> Dropping legacy service_histories table (TSDB is now enabled). Historical data will NOT be migrated.")
|
||||
if err := DB.Migrator().DropTable("service_histories"); err != nil {
|
||||
log.Printf("NEZHA>> Warning: failed to drop service_histories table: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func TSDBEnabled() bool {
|
||||
return TSDBShared != nil && !TSDBShared.IsClosed()
|
||||
}
|
||||
|
||||
func CloseTSDB() {
|
||||
if TSDBShared != nil {
|
||||
TSDBShared.Close()
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user