feat(v2.0.0): tsdb (#1162)

* feat: tsdb

* fix(ci): remove --parseGoList=false from swag init to fix dependency resolution

* fix(ci): fix swag init directory and temporary remove s390x support due to cgo issues

* fix(ci): fix swag init output directory to cmd/dashboard/docs

* fix(ci): set GOTOOLCHAIN=auto for gosec

* feat: add system storage maintenance for SQLite and TSDB

* chore: interim work-in-progress cleanup

* feat: add s390x support and improve service monitoring

* ci: upgrade goreleaser-cross image to v1.25

* ci: add libzstd-dev:s390x for cross-compilation

* ci: build libzstd for s390x from source

* ci: add libzstd_linux_s390x.go for gozstd linking

* ci: use vendor mode for s390x gozstd build

* ci: clone zstd source for s390x build

* refactor(tsdb): rename MaxDiskUsageGB to MinFreeDiskSpaceGB and optimize queries

- Rename config to accurately reflect VictoriaMetrics behavior: minimum free disk space threshold
- Add QueryServiceHistoryByServerID for batch query optimization
- Fix hasStatus to avoid false status counting when only delay data exists
- Fix service aggregation boundary: use successCount*2 >= count
- Fix serviceID parsing with strconv.ParseUint error handling
- Add TagFiltersCacheSize for better query performance

* feat(api): add server metrics endpoint and simplify service history response

- Add /server/:id/metrics API for querying TSDB server metrics
- Simplify getServiceHistory by removing redundant data conversion
- Change AvgDelay type from float32 to float64
- Remove generated swagger docs (to be regenerated)
- Update TSDB query, writer and tests

* chore: temporarily disable unsupported frontend

* ci: cache zstd build for s390x to speed up CI

* fix(tsdb): fix race conditions, data correctness and optimize performance

- Fix TOCTOU race between IsClosed() and write/query by holding RLock
- Fix delay=0 excluded from stats by using hasDelay flag instead of value > 0
- Fix fmt.Sscanf -> strconv.ParseUint for server_id parsing with error logging
- Fix buffer unbounded growth by flushing inside lock when over maxSize
- Split makeMetricRow into makeServerMetricRow/makeServiceMetricRow
- Extract InitGlobalSettings() from Open() for VictoriaMetrics globals
- Remove redundant instance/GetInstance/SetInstance singleton
- Add error logging for silently skipped block decode errors
- Optimize WriteBatch* to build all rows in single write call
- Optimize downsample to use linear scan instead of map for sorted data
- Optimize query slice reuse across block iterations

* feat(service): add DisplayIndex to services (#1166)

* feat(service): add DisplayIndex to services

* refactor: apply review suggestions

---------

Co-authored-by: huYang <306061454@qq.com>

* fix(tsdb): restore SQLite fallback and monthly status reload on restart

- Restore ServiceHistory model and SQLite write fallback when TSDB is disabled
- Reload monthlyStatus (30-day) and serviceStatusToday from TSDB/SQLite on startup
- Add SQLite fallback query for /service/:id/history and /server/:id/service
- Remove breaking GET /service/:id endpoint, keep /service/:id/history only
- Add QueryServiceDailyStats to TSDB for per-day aggregation
- Add tests for monthly status and today stats loading from both TSDB and SQLite
- Migrate ServiceHistory table only when TSDB is disabled

* ci: exclude false-positive gosec rules G117, G703, G704

* feat(api): expose tsdb_enabled in setting response

* ci: restore G115 exclusion accidentally dropped in previous commit

* fix: update version numbers for OfficialAdmin and Official templates

* chore: upgrade frontend

* chore: upgrade frontend

---------

Co-authored-by: 胡说丷刂 <34758853+laosan-xx@users.noreply.github.com>
Co-authored-by: huYang <306061454@qq.com>
This commit is contained in:
奶爸
2026-02-15 13:13:33 +08:00
committed by GitHub
parent 4c4758207d
commit e61772e858
28 changed files with 3054 additions and 221 deletions
+39
View File
@@ -12,6 +12,7 @@ import (
"github.com/jinzhu/copier"
geoipx "github.com/nezhahq/nezha/pkg/geoip"
"github.com/nezhahq/nezha/pkg/grpcx"
"github.com/nezhahq/nezha/pkg/tsdb"
"github.com/nezhahq/nezha/model"
pb "github.com/nezhahq/nezha/proto"
@@ -114,6 +115,44 @@ func (s *NezhaHandler) ReportSystemState(stream pb.NezhaService_ReportSystemStat
server.LastActive = time.Now()
server.State = &innerState
if singleton.TSDBEnabled() {
maxTemp := 0.0
for _, t := range innerState.Temperatures {
if t.Temperature > maxTemp {
maxTemp = t.Temperature
}
}
maxGPU := 0.0
for _, g := range innerState.GPU {
if g > maxGPU {
maxGPU = g
}
}
if err := singleton.TSDBShared.WriteServerMetrics(&tsdb.ServerMetrics{
ServerID: clientID,
Timestamp: time.Now(),
CPU: innerState.CPU,
MemUsed: innerState.MemUsed,
SwapUsed: innerState.SwapUsed,
DiskUsed: innerState.DiskUsed,
NetInSpeed: innerState.NetInSpeed,
NetOutSpeed: innerState.NetOutSpeed,
NetInTransfer: innerState.NetInTransfer,
NetOutTransfer: innerState.NetOutTransfer,
Load1: innerState.Load1,
Load5: innerState.Load5,
Load15: innerState.Load15,
TCPConnCount: innerState.TcpConnCount,
UDPConnCount: innerState.UdpConnCount,
ProcessCount: innerState.ProcessCount,
Temperature: maxTemp,
Uptime: innerState.Uptime,
GPU: maxGPU,
}); err != nil {
log.Printf("NEZHA>> Failed to write server metrics to TSDB: %v", err)
}
}
// 应对 dashboard / agent 重启的情况,如果从未记录过,先打点,等到小时时间点时入库
if server.PrevTransferInSnapshot == 0 || server.PrevTransferOutSnapshot == 0 {
server.PrevTransferInSnapshot = state.NetInTransfer
+2 -2
View File
@@ -2,14 +2,14 @@
name: "OfficialAdmin"
repository: "https://github.com/nezhahq/admin-frontend"
author: "nezhahq"
version: "v1.14.7"
version: "v2.0.3"
is_admin: true
is_official: true
- path: "user-dist"
name: "Official"
repository: "https://github.com/hamster1963/nezha-dash-v1"
author: "hamster1963"
version: "v1.33.0"
version: "v2.0.0"
is_official: true
- path: "nazhua-dist"
name: "Nazhua"
+175 -67
View File
@@ -16,6 +16,7 @@ import (
"golang.org/x/exp/constraints"
"github.com/nezhahq/nezha/model"
"github.com/nezhahq/nezha/pkg/tsdb"
"github.com/nezhahq/nezha/pkg/utils"
pb "github.com/nezhahq/nezha/proto"
)
@@ -39,7 +40,7 @@ type ReportData struct {
type _TodayStatsOfService struct {
Up uint64 // 今日在线计数
Down uint64 // 今日离线计数
Delay float32 // 今日平均延迟
Delay float64 // 今日平均延迟
}
type serviceResponseData = _TodayStatsOfService
@@ -51,8 +52,9 @@ type serviceTaskStatus struct {
}
type pingStore struct {
count int
ping float32
count int
ping float64
successCount int
}
/*
@@ -108,23 +110,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv
year, month, day := time.Now().Date()
today := time.Date(year, month, day, 0, 0, 0, 0, Loc)
var mhs []model.ServiceHistory
// 加载当日记录
DB.Where("created_at >= ? AND server_id = 0", today).Find(&mhs)
totalDelay := make(map[uint64]float32)
totalDelayCount := make(map[uint64]float32)
for _, mh := range mhs {
totalDelay[mh.ServiceID] += mh.AvgDelay
totalDelayCount[mh.ServiceID]++
ss.serviceStatusToday[mh.ServiceID].Up += mh.Up
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
ss.serviceStatusToday[mh.ServiceID].Down += mh.Down
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
}
for id, delay := range totalDelay {
ss.serviceStatusToday[id].Delay = delay / float32(totalDelayCount[id])
}
ss.loadTodayStats(today)
// 启动服务监控器
go ss.worker()
@@ -135,6 +121,12 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv
return nil, err
}
// 每周日凌晨 4:00 执行系统存储维护
_, err = CronShared.AddFunc("0 0 4 * * 0", PerformMaintenance)
if err != nil {
log.Printf("NEZHA>> Warning: failed to schedule maintenance task: %v", err)
}
return ss, nil
}
@@ -171,6 +163,16 @@ func (ss *ServiceSentinel) Dispatch(r ReportData) {
ss.serviceReportChannel <- r
}
// sortServices orders the service list by DisplayIndex in descending order,
// breaking ties by ID in ascending order.
func sortServices(services []*model.Service) {
	slices.SortFunc(services, func(x, y *model.Service) int {
		if c := cmp.Compare(y.DisplayIndex, x.DisplayIndex); c != 0 {
			return c
		}
		return cmp.Compare(x.ID, y.ID)
	})
}
func (ss *ServiceSentinel) UpdateServiceList() {
ss.servicesLock.RLock()
defer ss.servicesLock.RUnlock()
@@ -179,9 +181,7 @@ func (ss *ServiceSentinel) UpdateServiceList() {
defer ss.serviceListLock.Unlock()
ss.serviceList = utils.MapValuesToSlice(ss.services)
slices.SortFunc(ss.serviceList, func(a, b *model.Service) int {
return cmp.Compare(a.ID, b.ID)
})
sortServices(ss.serviceList)
}
// loadServiceHistory 加载服务监控器的历史状态信息
@@ -207,6 +207,7 @@ func (ss *ServiceSentinel) loadServiceHistory() error {
ss.serviceStatusToday[service.ID] = &_TodayStatsOfService{}
}
ss.serviceList = services
sortServices(ss.serviceList)
year, month, day := time.Now().Date()
today := time.Date(year, month, day, 0, 0, 0, 0, Loc)
@@ -215,33 +216,111 @@ func (ss *ServiceSentinel) loadServiceHistory() error {
ss.monthlyStatus[service.ID] = &serviceResponseItem{
service: service,
ServiceResponseItem: model.ServiceResponseItem{
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Delay: &[30]float64{},
Up: &[30]uint64{},
Down: &[30]uint64{},
},
}
}
// 加载服务监控历史记录
var mhs []model.ServiceHistory
DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs)
var delayCount = make(map[int]int)
for _, mh := range mhs {
dayIndex := 28 - (int(today.Sub(mh.CreatedAt).Hours()) / 24)
if dayIndex < 0 {
continue
}
ss.monthlyStatus[mh.ServiceID].Delay[dayIndex] = (ss.monthlyStatus[mh.ServiceID].Delay[dayIndex]*float32(delayCount[dayIndex]) + mh.AvgDelay) / float32(delayCount[dayIndex]+1)
delayCount[dayIndex]++
ss.monthlyStatus[mh.ServiceID].Up[dayIndex] += mh.Up
ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
ss.monthlyStatus[mh.ServiceID].Down[dayIndex] += mh.Down
ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
if TSDBEnabled() {
ss.loadMonthlyStatusFromTSDB(services, today)
} else {
ss.loadMonthlyStatusFromDB(today)
}
return nil
}
// loadMonthlyStatusFromTSDB populates each service's 30-day status arrays
// (Up/Down/Delay) and running totals from TSDB daily aggregates.
// Only indices 0..28 are written; index 29 (today) is left untouched and is
// filled separately by loadTodayStats, so today is never double-counted here.
func (ss *ServiceSentinel) loadMonthlyStatusFromTSDB(services []*model.Service, today time.Time) {
	for _, service := range services {
		dailyStats, err := TSDBShared.QueryServiceDailyStats(service.ID, today, 30)
		if err != nil {
			// A failed query skips only this service; the others still load.
			log.Printf("NEZHA>> Failed to load TSDB history for service %d: %v", service.ID, err)
			continue
		}
		ms := ss.monthlyStatus[service.ID]
		// assumes dailyStats has at least 29 entries aligned to the 30-slot
		// window with index 29 = today — TODO confirm against
		// QueryServiceDailyStats's contract.
		for i := 0; i < 29; i++ {
			ms.Up[i] = dailyStats[i].Up
			ms.TotalUp += dailyStats[i].Up
			ms.Down[i] = dailyStats[i].Down
			ms.TotalDown += dailyStats[i].Down
			ms.Delay[i] = dailyStats[i].Delay
		}
	}
}
// loadMonthlyStatusFromDB populates each service's 30-day status arrays and
// running totals from SQLite ServiceHistory rows (the fallback path when
// TSDB is disabled). Only aggregated rows (server_id = 0) from the 29 days
// before today are read; today's slot (index 29) is filled by loadTodayStats.
func (ss *ServiceSentinel) loadMonthlyStatusFromDB(today time.Time) {
	var mhs []model.ServiceHistory
	DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs)
	// delayCount tracks, per service and per day slot, how many rows have
	// contributed to the running delay average so far.
	delayCount := make(map[uint64]map[int]int)
	for _, mh := range mhs {
		// Map a record's age in whole days to array index 0..28
		// (28 = yesterday). Records on the window edge can map below 0.
		dayIndex := 28 - int(today.Sub(mh.CreatedAt).Hours())/24
		if dayIndex < 0 {
			continue
		}
		ms := ss.monthlyStatus[mh.ServiceID]
		if ms == nil {
			// Row belongs to a service no longer configured; ignore it.
			continue
		}
		if delayCount[mh.ServiceID] == nil {
			delayCount[mh.ServiceID] = make(map[int]int)
		}
		// Incremental mean: fold this row's AvgDelay into the slot's average.
		ms.Delay[dayIndex] = (ms.Delay[dayIndex]*float64(delayCount[mh.ServiceID][dayIndex]) + mh.AvgDelay) / float64(delayCount[mh.ServiceID][dayIndex]+1)
		delayCount[mh.ServiceID][dayIndex]++
		ms.Up[dayIndex] += mh.Up
		ms.TotalUp += mh.Up
		ms.Down[dayIndex] += mh.Down
		ms.TotalDown += mh.Down
	}
}
// loadTodayStats restores today's per-service up/down counts and average
// delay after a restart, from TSDB when enabled or from SQLite otherwise.
// Today's totals are also added onto the monthly TotalUp/TotalDown, which
// the monthly loaders deliberately left without today's share.
func (ss *ServiceSentinel) loadTodayStats(today time.Time) {
	if TSDBEnabled() {
		for serviceID, ms := range ss.monthlyStatus {
			result, err := TSDBShared.QueryServiceHistory(serviceID, tsdb.Period1Day)
			if err != nil {
				log.Printf("NEZHA>> Failed to load TSDB today stats for service %d: %v", serviceID, err)
				continue
			}
			var totalUp, totalDown uint64
			var totalDelay float64
			var delayCount int
			// Sum per-server stats; servers reporting a zero average delay
			// are excluded from the delay average.
			for _, serverStats := range result.Servers {
				totalUp += serverStats.Stats.TotalUp
				totalDown += serverStats.Stats.TotalDown
				if serverStats.Stats.AvgDelay > 0 {
					totalDelay += serverStats.Stats.AvgDelay
					delayCount++
				}
			}
			ss.serviceStatusToday[serviceID].Up = totalUp
			ss.serviceStatusToday[serviceID].Down = totalDown
			if delayCount > 0 {
				ss.serviceStatusToday[serviceID].Delay = totalDelay / float64(delayCount)
			}
			ms.TotalUp += totalUp
			ms.TotalDown += totalDown
		}
	} else {
		// SQLite fallback: aggregate today's server_id = 0 rows.
		var mhs []model.ServiceHistory
		DB.Where("created_at >= ? AND server_id = 0", today).Find(&mhs)
		totalDelay := make(map[uint64]float64)
		totalDelayCount := make(map[uint64]int)
		for _, mh := range mhs {
			ss.serviceStatusToday[mh.ServiceID].Up += mh.Up
			ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up
			ss.serviceStatusToday[mh.ServiceID].Down += mh.Down
			ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down
			totalDelay[mh.ServiceID] += mh.AvgDelay
			totalDelayCount[mh.ServiceID]++
		}
		// Average the per-row delays per service.
		for id, delay := range totalDelay {
			ss.serviceStatusToday[id].Delay = delay / float64(totalDelayCount[id])
		}
	}
}
func (ss *ServiceSentinel) Update(m *model.Service) error {
ss.serviceResponseDataStoreLock.Lock()
defer ss.serviceResponseDataStoreLock.Unlock()
@@ -266,9 +345,9 @@ func (ss *ServiceSentinel) Update(m *model.Service) error {
ss.monthlyStatus[m.ID] = &serviceResponseItem{
service: m,
ServiceResponseItem: model.ServiceResponseItem{
Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Delay: &[30]float64{},
Up: &[30]uint64{},
Down: &[30]uint64{},
},
}
if ss.serviceCurrentStatusData[m.ID] == nil {
@@ -406,6 +485,7 @@ func (ss *ServiceSentinel) worker() {
mh := r.Data
if mh.Type == model.TaskTypeTCPPing || mh.Type == model.TaskTypeICMPPing {
// TCP/ICMP Ping 使用平均值计算后再写入
serviceTcpMap, ok := ss.serviceResponsePing[mh.GetId()]
if !ok {
serviceTcpMap = make(map[uint64]*pingStore)
@@ -416,28 +496,56 @@ func (ss *ServiceSentinel) worker() {
ts = &pingStore{}
}
ts.count++
ts.ping = (ts.ping*float32(ts.count-1) + mh.Delay) / float32(ts.count)
ts.ping = (ts.ping*float64(ts.count-1) + float64(mh.Delay)) / float64(ts.count)
if mh.Successful {
ts.successCount++
}
if ts.count == Conf.AvgPingCount {
if err := DB.Create(&model.ServiceHistory{
ServiceID: mh.GetId(),
AvgDelay: ts.ping,
Data: mh.Data,
ServerID: r.Reporter,
}).Error; err != nil {
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
if TSDBEnabled() {
if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{
ServiceID: mh.GetId(),
ServerID: r.Reporter,
Timestamp: time.Now(),
Delay: ts.ping,
Successful: ts.successCount*2 >= ts.count,
}); err != nil {
log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err)
}
} else {
if err := DB.Create(&model.ServiceHistory{
ServiceID: mh.GetId(),
AvgDelay: ts.ping,
Data: mh.Data,
ServerID: r.Reporter,
}).Error; err != nil {
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
}
}
ts.count = 0
ts.ping = mh.Delay
ts.ping = 0
ts.successCount = 0
}
serviceTcpMap[r.Reporter] = ts
} else {
if TSDBEnabled() {
if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{
ServiceID: mh.GetId(),
ServerID: r.Reporter,
Timestamp: time.Now(),
Delay: float64(mh.Delay),
Successful: mh.Successful,
}); err != nil {
log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err)
}
}
}
ss.serviceResponseDataStoreLock.Lock()
// 写入当天状态
if mh.Successful {
ss.serviceStatusToday[mh.GetId()].Delay = (ss.serviceStatusToday[mh.
GetId()].Delay*float32(ss.serviceStatusToday[mh.GetId()].Up) +
mh.Delay) / float32(ss.serviceStatusToday[mh.GetId()].Up+1)
GetId()].Delay*float64(ss.serviceStatusToday[mh.GetId()].Up) +
float64(mh.Delay)) / float64(ss.serviceStatusToday[mh.GetId()].Up+1)
ss.serviceStatusToday[mh.GetId()].Up++
} else {
ss.serviceStatusToday[mh.GetId()].Down++
@@ -463,7 +571,7 @@ func (ss *ServiceSentinel) worker() {
rd := ss.serviceResponseDataStore[mh.GetId()]
if cs.Successful {
rd.Up++
rd.Delay = (rd.Delay*float32(rd.Up-1) + cs.Delay) / float32(rd.Up)
rd.Delay = (rd.Delay*float64(rd.Up-1) + float64(cs.Delay)) / float64(rd.Up)
} else {
rd.Down++
}
@@ -482,20 +590,20 @@ func (ss *ServiceSentinel) worker() {
stateCode = GetStatusCode(upPercent)
}
// 数据持久化
if len(ss.serviceCurrentStatusData[mh.GetId()].result) == _CurrentStatusSize {
ss.serviceCurrentStatusData[mh.GetId()].t = currentTime
rd := ss.serviceResponseDataStore[mh.GetId()]
if err := DB.Create(&model.ServiceHistory{
ServiceID: mh.GetId(),
AvgDelay: rd.Delay,
Data: mh.Data,
Up: rd.Up,
Down: rd.Down,
}).Error; err != nil {
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
if !TSDBEnabled() {
rd := ss.serviceResponseDataStore[mh.GetId()]
if err := DB.Create(&model.ServiceHistory{
ServiceID: mh.GetId(),
AvgDelay: rd.Delay,
Data: mh.Data,
Up: rd.Up,
Down: rd.Down,
}).Error; err != nil {
log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err)
}
}
ss.serviceCurrentStatusData[mh.GetId()].result = ss.serviceCurrentStatusData[mh.GetId()].result[:0]
}
+336
View File
@@ -0,0 +1,336 @@
package singleton
import (
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"gorm.io/driver/sqlite"
"gorm.io/gorm"
"github.com/nezhahq/nezha/model"
"github.com/nezhahq/nezha/pkg/tsdb"
)
// newTestSentinel builds a minimal ServiceSentinel with zeroed today/monthly
// stats for the given service IDs, suitable for unit tests.
func newTestSentinel(serviceIDs []uint64) *ServiceSentinel {
	sentinel := &ServiceSentinel{
		serviceStatusToday: make(map[uint64]*_TodayStatsOfService, len(serviceIDs)),
		monthlyStatus:      make(map[uint64]*serviceResponseItem, len(serviceIDs)),
	}
	for _, sid := range serviceIDs {
		sentinel.serviceStatusToday[sid] = &_TodayStatsOfService{}
		sentinel.monthlyStatus[sid] = &serviceResponseItem{
			service: &model.Service{Common: model.Common{ID: sid}},
			ServiceResponseItem: model.ServiceResponseItem{
				Delay: &[30]float64{},
				Up:    &[30]uint64{},
				Down:  &[30]uint64{},
			},
		}
	}
	return sentinel
}
// setupTestDB points the package-level DB at an in-memory SQLite database
// with the ServiceHistory table migrated, and returns a cleanup function
// that resets DB to nil.
func setupTestDB(t *testing.T) func() {
	t.Helper()
	db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
	require.NoError(t, err)
	DB = db
	require.NoError(t, DB.AutoMigrate(model.ServiceHistory{}))
	return func() { DB = nil }
}
// setupTestTSDB opens a TSDB instance under a temporary directory, installs
// it as the shared TSDBShared, and returns it together with a cleanup
// function that closes the instance and removes the temp directory.
func setupTestTSDB(t *testing.T) (*tsdb.TSDB, func()) {
	t.Helper()
	tempDir, err := os.MkdirTemp("", "tsdb_sentinel_test")
	require.NoError(t, err)
	db, err := tsdb.Open(&tsdb.Config{
		DataPath:           filepath.Join(tempDir, "tsdb"),
		RetentionDays:      30,
		MinFreeDiskSpaceGB: 1,
		DedupInterval:      time.Second,
	})
	require.NoError(t, err)
	TSDBShared = db
	cleanup := func() {
		db.Close()
		TSDBShared = nil
		os.RemoveAll(tempDir)
	}
	return db, cleanup
}
// TestLoadMonthlyStatusFromDB verifies the SQLite monthly loader: rows from
// the same past day are averaged into one slot, per-day up/down counts and
// the running totals accumulate, and today's slot stays untouched.
func TestLoadMonthlyStatusFromDB(t *testing.T) {
	cleanup := setupTestDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	serviceID := uint64(1)
	ss := newTestSentinel([]uint64{serviceID})
	// Two rows one day ago (should be averaged), one row two days ago.
	DB.Create(&model.ServiceHistory{
		ServiceID: serviceID,
		ServerID:  0,
		AvgDelay:  10.0,
		Up:        5,
		Down:      1,
		CreatedAt: today.Add(-25 * time.Hour),
	})
	DB.Create(&model.ServiceHistory{
		ServiceID: serviceID,
		ServerID:  0,
		AvgDelay:  20.0,
		Up:        3,
		Down:      2,
		CreatedAt: today.Add(-25 * time.Hour),
	})
	DB.Create(&model.ServiceHistory{
		ServiceID: serviceID,
		ServerID:  0,
		AvgDelay:  30.0,
		Up:        10,
		Down:      0,
		CreatedAt: today.Add(-49 * time.Hour),
	})
	ss.loadMonthlyStatusFromDB(today)
	ms := ss.monthlyStatus[serviceID]
	// day -1: index 27, two records with AvgDelay 10 and 20
	assert.InDelta(t, 15.0, ms.Delay[27], 0.01)
	assert.Equal(t, uint64(8), ms.Up[27])
	assert.Equal(t, uint64(3), ms.Down[27])
	// day -2: index 26
	assert.InDelta(t, 30.0, ms.Delay[26], 0.01)
	assert.Equal(t, uint64(10), ms.Up[26])
	assert.Equal(t, uint64(0), ms.Down[26])
	// totals
	assert.Equal(t, uint64(18), ms.TotalUp)
	assert.Equal(t, uint64(3), ms.TotalDown)
	// today (index 29) should be untouched
	assert.Equal(t, float64(0), ms.Delay[29])
	assert.Equal(t, uint64(0), ms.Up[29])
}
// TestLoadMonthlyStatusFromDB_IgnoresToday verifies that rows created after
// today's midnight are excluded by the monthly loader's created_at < today
// filter, leaving the totals at zero.
func TestLoadMonthlyStatusFromDB_IgnoresToday(t *testing.T) {
	cleanup := setupTestDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	serviceID := uint64(1)
	ss := newTestSentinel([]uint64{serviceID})
	// Row timestamped inside today — must not be counted.
	DB.Create(&model.ServiceHistory{
		ServiceID: serviceID,
		ServerID:  0,
		AvgDelay:  50.0,
		Up:        100,
		Down:      5,
		CreatedAt: today.Add(2 * time.Hour),
	})
	ss.loadMonthlyStatusFromDB(today)
	ms := ss.monthlyStatus[serviceID]
	assert.Equal(t, uint64(0), ms.TotalUp)
	assert.Equal(t, uint64(0), ms.TotalDown)
}
// TestLoadMonthlyStatusFromDB_UnknownServiceIgnored verifies that history
// rows for a service ID absent from monthlyStatus are skipped (the loader's
// nil-check) instead of panicking or polluting other services' totals.
func TestLoadMonthlyStatusFromDB_UnknownServiceIgnored(t *testing.T) {
	cleanup := setupTestDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	ss := newTestSentinel([]uint64{1})
	// Row for an unconfigured service (999) — must be ignored.
	DB.Create(&model.ServiceHistory{
		ServiceID: 999,
		ServerID:  0,
		AvgDelay:  10.0,
		Up:        5,
		Down:      1,
		CreatedAt: today.Add(-25 * time.Hour),
	})
	ss.loadMonthlyStatusFromDB(today)
	ms := ss.monthlyStatus[uint64(1)]
	assert.Equal(t, uint64(0), ms.TotalUp)
}
// TestLoadTodayStatsFromDB verifies the SQLite branch of loadTodayStats:
// today's rows are summed into serviceStatusToday, delays are averaged
// per row, and today's counts are also folded into the monthly totals.
func TestLoadTodayStatsFromDB(t *testing.T) {
	cleanup := setupTestDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	serviceID := uint64(1)
	ss := newTestSentinel([]uint64{serviceID})
	// Two rows inside today: delays 10 and 30 average to 20.
	DB.Create(&model.ServiceHistory{
		ServiceID: serviceID,
		ServerID:  0,
		AvgDelay:  10.0,
		Up:        5,
		Down:      1,
		CreatedAt: today.Add(1 * time.Hour),
	})
	DB.Create(&model.ServiceHistory{
		ServiceID: serviceID,
		ServerID:  0,
		AvgDelay:  30.0,
		Up:        3,
		Down:      2,
		CreatedAt: today.Add(2 * time.Hour),
	})
	ss.loadTodayStats(today)
	st := ss.serviceStatusToday[serviceID]
	assert.Equal(t, uint64(8), st.Up)
	assert.Equal(t, uint64(3), st.Down)
	assert.InDelta(t, 20.0, st.Delay, 0.01)
	ms := ss.monthlyStatus[serviceID]
	assert.Equal(t, uint64(8), ms.TotalUp)
	assert.Equal(t, uint64(3), ms.TotalDown)
}
// TestLoadMonthlyStatusFromTSDB verifies the TSDB monthly loader: yesterday's
// successful/failed samples land in day slot 28 with a positive delay, totals
// accumulate, and today's slot (29) stays untouched.
func TestLoadMonthlyStatusFromTSDB(t *testing.T) {
	db, cleanup := setupTestTSDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	serviceID := uint64(1)
	services := []*model.Service{{Common: model.Common{ID: serviceID}}}
	ss := newTestSentinel([]uint64{serviceID})
	yesterday := today.Add(-25 * time.Hour)
	// Five successful samples yesterday.
	for i := 0; i < 5; i++ {
		ts := yesterday.Add(time.Duration(i) * time.Minute)
		require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
			ServiceID:  serviceID,
			ServerID:   1,
			Timestamp:  ts,
			Delay:      float64(10 + i),
			Successful: true,
		}))
	}
	// Three failed samples yesterday.
	for i := 0; i < 3; i++ {
		require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
			ServiceID:  serviceID,
			ServerID:   1,
			Timestamp:  yesterday.Add(time.Duration(i+10) * time.Minute),
			Delay:      float64(20 + i),
			Successful: false,
		}))
	}
	db.Flush()
	ss.loadMonthlyStatusFromTSDB(services, today)
	ms := ss.monthlyStatus[serviceID]
	// day -1: dayIndex 28
	assert.Equal(t, uint64(5), ms.Up[28])
	assert.Equal(t, uint64(3), ms.Down[28])
	assert.Equal(t, uint64(5), ms.TotalUp)
	assert.Equal(t, uint64(3), ms.TotalDown)
	assert.Greater(t, ms.Delay[28], float64(0))
	// today (index 29) should be untouched
	assert.Equal(t, uint64(0), ms.Up[29])
}
// TestLoadTodayStatsFromTSDB verifies the TSDB branch of loadTodayStats:
// today's successful and failed samples are reflected in serviceStatusToday
// and mirrored into the monthly TotalUp/TotalDown.
func TestLoadTodayStatsFromTSDB(t *testing.T) {
	db, cleanup := setupTestTSDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	serviceID := uint64(1)
	ss := newTestSentinel([]uint64{serviceID})
	now := time.Now()
	// Four recent successful samples.
	for i := 0; i < 4; i++ {
		ts := now.Add(-time.Duration(i) * time.Minute)
		require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
			ServiceID:  serviceID,
			ServerID:   1,
			Timestamp:  ts,
			Delay:      float64(10 + i),
			Successful: true,
		}))
	}
	// Two failed samples slightly earlier today.
	for i := 0; i < 2; i++ {
		ts := now.Add(-time.Duration(i+10) * time.Minute)
		require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
			ServiceID:  serviceID,
			ServerID:   1,
			Timestamp:  ts,
			Delay:      0,
			Successful: false,
		}))
	}
	db.Flush()
	ss.loadTodayStats(today)
	st := ss.serviceStatusToday[serviceID]
	assert.Greater(t, st.Up, uint64(0))
	assert.Greater(t, st.Down, uint64(0))
	ms := ss.monthlyStatus[serviceID]
	assert.Equal(t, st.Up, ms.TotalUp)
	assert.Equal(t, st.Down, ms.TotalDown)
}
// TestLoadMonthlyStatusFromTSDB_NoDoubleCountToday verifies the contract
// between the two loaders: the monthly loader excludes today, so running
// loadTodayStats afterwards adds exactly today's Up count to TotalUp —
// never counting the same samples twice.
func TestLoadMonthlyStatusFromTSDB_NoDoubleCountToday(t *testing.T) {
	db, cleanup := setupTestTSDB(t)
	defer cleanup()
	year, month, day := time.Now().Date()
	today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	serviceID := uint64(1)
	services := []*model.Service{{Common: model.Common{ID: serviceID}}}
	ss := newTestSentinel([]uint64{serviceID})
	now := time.Now()
	// Five successful samples within today.
	for i := 0; i < 5; i++ {
		require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{
			ServiceID:  serviceID,
			ServerID:   1,
			Timestamp:  now.Add(-time.Duration(i) * time.Minute),
			Delay:      10.0,
			Successful: true,
		}))
	}
	db.Flush()
	ss.loadMonthlyStatusFromTSDB(services, today)
	totalAfterMonthly := ss.monthlyStatus[serviceID].TotalUp
	ss.loadTodayStats(today)
	totalAfterToday := ss.monthlyStatus[serviceID].TotalUp
	assert.Equal(t, totalAfterMonthly+ss.serviceStatusToday[serviceID].Up, totalAfterToday)
}
+27 -9
View File
@@ -87,12 +87,13 @@ func InitDBFromPath(path string) error {
}
err = DB.AutoMigrate(model.Server{}, model.User{}, model.ServerGroup{}, model.NotificationGroup{},
model.Notification{}, model.AlertRule{}, model.Service{}, model.NotificationGroupNotification{},
model.ServiceHistory{}, model.Cron{}, model.Transfer{}, model.ServerGroupServer{},
model.Cron{}, model.Transfer{}, model.ServerGroupServer{},
model.NAT{}, model.DDNSProfile{}, model.NotificationGroupNotification{},
model.WAF{}, model.Oauth2Bind{})
if err != nil {
return err
}
return nil
}
@@ -130,14 +131,9 @@ func RecordTransferHourlyUsage(servers ...*model.Server) {
log.Printf("NEZHA>> Saved traffic metrics to database. Affected %d row(s), Error: %v", len(txs), DB.Create(txs).Error)
}
// CleanServiceHistory 清理无效或过时的 监控记录 和 流量记录
func CleanServiceHistory() {
// 清理已被删除的服务器的监控记录与流量记录
DB.Unscoped().Delete(&model.ServiceHistory{}, "created_at < ? OR service_id NOT IN (SELECT `id` FROM services)", time.Now().AddDate(0, 0, -30))
// 由于网络监控记录的数据较多,并且前端仅使用了 1 天的数据
// 考虑到 sqlite 数据量问题,仅保留一天数据,
// server_id = 0 的数据会用于/service页面的可用性展示
DB.Unscoped().Delete(&model.ServiceHistory{}, "(created_at < ? AND server_id != 0) OR service_id NOT IN (SELECT `id` FROM services)", time.Now().AddDate(0, 0, -1))
// CleanMonitorHistory 清理流量记录TSDB 有自己的保留策略)
func CleanMonitorHistory() {
// 清理已被删除的服务器的流量记录
DB.Unscoped().Delete(&model.Transfer{}, "server_id NOT IN (SELECT `id` FROM servers)")
// 计算可清理流量记录的时长
var allServerKeep time.Time
@@ -179,6 +175,28 @@ func CleanServiceHistory() {
}
}
// PerformMaintenance runs periodic storage upkeep: a VACUUM on the SQLite
// database (when one is open) followed by TSDB maintenance (when enabled).
func PerformMaintenance() {
	log.Println("NEZHA>> Starting system maintenance...")

	// SQLite: reclaim free pages via VACUUM.
	if DB != nil {
		log.Println("NEZHA>> SQLite: Starting VACUUM...")
		switch err := DB.Exec("VACUUM").Error; err {
		case nil:
			log.Println("NEZHA>> SQLite: VACUUM completed")
		default:
			log.Printf("NEZHA>> SQLite: VACUUM failed: %v", err)
		}
	}

	// TSDB handles its own upkeep when enabled.
	if TSDBEnabled() {
		TSDBShared.Maintenance()
	}

	log.Println("NEZHA>> System maintenance completed")
}
// IPDesensitize 根据设置选择是否对IP进行打码处理 返回处理后的IP(关闭打码则返回原IP)
func IPDesensitize(ip string) string {
if Conf.EnablePlainIPInNotification {
+73
View File
@@ -0,0 +1,73 @@
package singleton
import (
"log"
"time"
"github.com/nezhahq/nezha/model"
"github.com/nezhahq/nezha/pkg/tsdb"
)
var TSDBShared *tsdb.TSDB
// InitTSDB builds the TSDB configuration from Conf.TSDB (falling back to
// built-in defaults for unset values) and opens the shared instance.
// When tsdb.data_path is not configured, TSDB stays disabled and the legacy
// SQLite ServiceHistory table is migrated as the storage fallback.
// When TSDB is enabled, a pre-existing service_histories table is dropped;
// its historical rows are NOT migrated.
func InitTSDB() error {
	cfg := &tsdb.Config{
		RetentionDays:      30,
		MinFreeDiskSpaceGB: 1,
		MaxMemoryMB:        256,
	}
	// Apply only the user-provided overrides; zero values keep the defaults.
	if v := Conf.TSDB.DataPath; v != "" {
		cfg.DataPath = v
	}
	if v := Conf.TSDB.RetentionDays; v > 0 {
		cfg.RetentionDays = v
	}
	if v := Conf.TSDB.MinFreeDiskSpaceGB; v > 0 {
		cfg.MinFreeDiskSpaceGB = v
	}
	if v := Conf.TSDB.MaxMemoryMB; v > 0 {
		cfg.MaxMemoryMB = v
	}
	if v := Conf.TSDB.WriteBufferSize; v > 0 {
		cfg.WriteBufferSize = v
	}
	if v := Conf.TSDB.WriteBufferFlushInterval; v > 0 {
		cfg.WriteBufferFlushInterval = time.Duration(v) * time.Second
	}

	if !cfg.Enabled() {
		log.Println("NEZHA>> TSDB is disabled (tsdb.data_path not configured)")
		// Fall back to SQLite storage for service history.
		if DB != nil {
			return DB.AutoMigrate(model.ServiceHistory{})
		}
		return nil
	}

	db, err := tsdb.Open(cfg)
	if err != nil {
		return err
	}
	TSDBShared = db
	log.Println("NEZHA>> TSDB initialized successfully")

	if DB != nil && DB.Migrator().HasTable("service_histories") {
		log.Println("NEZHA>> Dropping legacy service_histories table (TSDB is now enabled). Historical data will NOT be migrated.")
		if err := DB.Migrator().DropTable("service_histories"); err != nil {
			log.Printf("NEZHA>> Warning: failed to drop service_histories table: %v", err)
		}
	}
	return nil
}
// TSDBEnabled reports whether the shared TSDB instance is open and usable.
func TSDBEnabled() bool {
	if TSDBShared == nil {
		return false
	}
	return !TSDBShared.IsClosed()
}
// CloseTSDB shuts down the shared TSDB instance if one was opened.
func CloseTSDB() {
	if TSDBShared == nil {
		return
	}
	TSDBShared.Close()
}