diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 78551c6..1ecd3b9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,17 +15,18 @@ jobs: goos: [linux, windows] goarch: [amd64] include: - - goos: linux - goarch: s390x - goos: linux goarch: arm64 + - goos: linux + goarch: s390x name: Build artifacts runs-on: ubuntu-latest container: - image: goreleaser/goreleaser-cross:v1.24 + image: goreleaser/goreleaser-cross:v1.25 steps: - - run: | + - name: Install dependencies + run: | apt update && apt install unzip curl -y wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq chmod +x /usr/bin/yq @@ -53,7 +54,45 @@ jobs: - name: generate swagger docs run: | go install github.com/swaggo/swag/cmd/swag@latest - swag init --pd -d . -g ./cmd/dashboard/main.go -o ./cmd/dashboard/docs --parseGoList=false + swag init --pd -d cmd/dashboard -g main.go -o cmd/dashboard/docs + go mod tidy + + - name: Cache zstd for s390x + if: matrix.goarch == 's390x' + id: cache-zstd + uses: actions/cache@v4 + with: + path: /tmp/zstd-s390x + key: zstd-s390x-v1.5.7 + + - name: Build zstd for s390x + if: matrix.goarch == 's390x' + run: | + # gozstd doesn't ship prebuilt libzstd for s390x, build it manually + go mod vendor + + if [ -f /tmp/zstd-s390x/libzstd.a ]; then + echo "Using cached libzstd.a" + else + # Clone zstd source and build for s390x + ZSTD_VERSION=v1.5.7 + git clone --depth 1 --branch ${ZSTD_VERSION} https://github.com/facebook/zstd /tmp/zstd + cd /tmp/zstd/lib + CC=s390x-linux-gnu-gcc \ + CXX=s390x-linux-gnu-g++ \ + AR=s390x-linux-gnu-ar \ + ZSTD_LEGACY_SUPPORT=0 \ + make clean libzstd.a + mkdir -p /tmp/zstd-s390x + cp libzstd.a /tmp/zstd-s390x/libzstd.a + fi + + # Copy to vendor directory + GOZSTD_VENDOR=${GITHUB_WORKSPACE}/vendor/github.com/valyala/gozstd + cp /tmp/zstd-s390x/libzstd.a ${GOZSTD_VENDOR}/libzstd_linux_s390x.a + + # Create the Go file to link the library + printf 
'//go:build linux && s390x\n\npackage gozstd\n\n/*\n#cgo LDFLAGS: ${SRCDIR}/libzstd_linux_s390x.a\n*/\nimport "C"\n' > ${GOZSTD_VENDOR}/libzstd_linux_s390x.go - name: Build with tag if: contains(github.ref, 'refs/tags/') diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2b5a6f2..a668c60 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,7 +34,7 @@ jobs: go install github.com/swaggo/swag/cmd/swag@latest touch ./cmd/dashboard/user-dist/a touch ./cmd/dashboard/admin-dist/a - swag init --pd -d . -g ./cmd/dashboard/main.go -o ./cmd/dashboard/docs --parseGoList=false + swag init --pd -d cmd/dashboard -g main.go -o cmd/dashboard/docs - name: Unit test run: | @@ -46,5 +46,7 @@ jobs: - name: Run Gosec Security Scanner if: runner.os == 'Linux' uses: securego/gosec@master + env: + GOTOOLCHAIN: auto with: - args: --exclude=G104,G402,G115,G203 ./... + args: --exclude=G104,G115,G117,G203,G402,G703,G704 ./... diff --git a/.goreleaser.yml b/.goreleaser.yml index 55e342e..5562b94 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -52,6 +52,7 @@ builds: flags: - -trimpath - -buildvcs=false + - -mod=vendor tags: - go_json goos: diff --git a/cmd/dashboard/controller/controller.go b/cmd/dashboard/controller/controller.go index 7683a89..4e08988 100644 --- a/cmd/dashboard/controller/controller.go +++ b/cmd/dashboard/controller/controller.go @@ -73,8 +73,10 @@ func routers(r *gin.Engine, frontendDist fs.FS) { optionalAuth.GET("/server-group", commonHandler(listServerGroup)) optionalAuth.GET("/service", commonHandler(showService)) - optionalAuth.GET("/service/:id", commonHandler(listServiceHistory)) optionalAuth.GET("/service/server", commonHandler(listServerWithServices)) + optionalAuth.GET("/service/:id/history", commonHandler(getServiceHistory)) + optionalAuth.GET("/server/:id/service", commonHandler(listServerServices)) + optionalAuth.GET("/server/:id/metrics", commonHandler(getServerMetrics)) auth := api.Group("", authMw) @@ 
-150,6 +152,7 @@ func routers(r *gin.Engine, frontendDist fs.FS) { auth.POST("/online-user/batch-block", adminHandler(batchBlockOnlineUser)) auth.PATCH("/setting", adminHandler(updateConfig)) + auth.POST("/maintenance", adminHandler(runMaintenance)) r.NoRoute(fallbackToFrontend(frontendDist)) } diff --git a/cmd/dashboard/controller/server.go b/cmd/dashboard/controller/server.go index dec44bc..880db1f 100644 --- a/cmd/dashboard/controller/server.go +++ b/cmd/dashboard/controller/server.go @@ -12,6 +12,7 @@ import ( "gorm.io/gorm" "github.com/nezhahq/nezha/model" + "github.com/nezhahq/nezha/pkg/tsdb" pb "github.com/nezhahq/nezha/proto" "github.com/nezhahq/nezha/service/singleton" ) @@ -366,3 +367,89 @@ func batchMoveServer(c *gin.Context) (any, error) { return nil, nil } + +var serverMetricMap = map[string]tsdb.MetricType{ + "cpu": tsdb.MetricServerCPU, + "memory": tsdb.MetricServerMemory, + "swap": tsdb.MetricServerSwap, + "disk": tsdb.MetricServerDisk, + "net_in_speed": tsdb.MetricServerNetInSpeed, + "net_out_speed": tsdb.MetricServerNetOutSpeed, + "net_in_transfer": tsdb.MetricServerNetInTransfer, + "net_out_transfer": tsdb.MetricServerNetOutTransfer, + "load1": tsdb.MetricServerLoad1, + "load5": tsdb.MetricServerLoad5, + "load15": tsdb.MetricServerLoad15, + "tcp_conn": tsdb.MetricServerTCPConn, + "udp_conn": tsdb.MetricServerUDPConn, + "process_count": tsdb.MetricServerProcessCount, + "temperature": tsdb.MetricServerTemperature, + "uptime": tsdb.MetricServerUptime, + "gpu": tsdb.MetricServerGPU, +} + +// Get server metrics history +// @Summary Get server metrics history +// @Security BearerAuth +// @Schemes +// @Description Get server metrics history for a specific server +// @Tags common +// @param id path uint true "Server ID" +// @param metric query string true "Metric name: cpu, memory, swap, disk, net_in_speed, net_out_speed, net_in_transfer, net_out_transfer, load1, load5, load15, tcp_conn, udp_conn, process_count, temperature, uptime, gpu" +// @param 
period query string false "Time period: 1d, 7d, 30d (default: 1d)" +// @Produce json +// @Success 200 {object} model.CommonResponse[model.ServerMetricsResponse] +// @Router /server/{id}/metrics [get] +func getServerMetrics(c *gin.Context) (*model.ServerMetricsResponse, error) { + idStr := c.Param("id") + serverID, err := strconv.ParseUint(idStr, 10, 64) + if err != nil { + return nil, err + } + + server, ok := singleton.ServerShared.Get(serverID) + if !ok { + return nil, singleton.Localizer.ErrorT("server not found") + } + + _, isMember := c.Get(model.CtxKeyAuthorizedUser) + if server.HideForGuest && !isMember { + return nil, singleton.Localizer.ErrorT("unauthorized") + } + + metricName := c.Query("metric") + metricType, ok := serverMetricMap[metricName] + if !ok { + return nil, singleton.Localizer.ErrorT("invalid metric name") + } + + periodStr := c.DefaultQuery("period", "1d") + period, err := tsdb.ParseQueryPeriod(periodStr) + if err != nil { + return nil, err + } + + if !isMember && period != tsdb.Period1Day { + return nil, singleton.Localizer.ErrorT("unauthorized: only 1d data available for guests") + } + + response := &model.ServerMetricsResponse{ + ServerID: serverID, + ServerName: server.Name, + Metric: metricName, + DataPoints: make([]model.ServerMetricsDataPoint, 0), + } + + if !singleton.TSDBEnabled() { + return response, nil + } + + points, err := singleton.TSDBShared.QueryServerMetrics(serverID, metricType, period) + if err != nil { + return nil, err + } + + response.DataPoints = points + + return response, nil +} diff --git a/cmd/dashboard/controller/service.go b/cmd/dashboard/controller/service.go index a15e2ca..3f97a73 100644 --- a/cmd/dashboard/controller/service.go +++ b/cmd/dashboard/controller/service.go @@ -9,11 +9,11 @@ import ( "github.com/gin-gonic/gin" "github.com/jinzhu/copier" + "gorm.io/gorm" "github.com/nezhahq/nezha/model" - "github.com/nezhahq/nezha/pkg/utils" + "github.com/nezhahq/nezha/pkg/tsdb" 
"github.com/nezhahq/nezha/service/singleton" - "gorm.io/gorm" ) // Show service @@ -55,7 +55,7 @@ func showService(c *gin.Context) (*model.ServiceResponse, error) { // @Param id query uint false "Resource ID" // @Produce json // @Success 200 {object} model.CommonResponse[[]model.Service] -// @Router /service [get] +// @Router /service/list [get] func listService(c *gin.Context) ([]*model.Service, error) { var ss []*model.Service ssl := singleton.ServiceSentinelShared.GetSortedList() @@ -66,96 +66,321 @@ func listService(c *gin.Context) ([]*model.Service, error) { return ss, nil } -// List service histories by server id +// Get service history +// @Summary Get service history by service ID +// @Security BearerAuth +// @Schemes +// @Description Get service monitoring history for a specific service +// @Tags common +// @param id path uint true "Service ID" +// @param period query string false "Time period: 1d, 7d, 30d (default: 1d)" +// @Produce json +// @Success 200 {object} model.CommonResponse[model.ServiceHistoryResponse] +// @Router /service/{id}/history [get] +func getServiceHistory(c *gin.Context) (*model.ServiceHistoryResponse, error) { + idStr := c.Param("id") + serviceID, err := strconv.ParseUint(idStr, 10, 64) + if err != nil { + return nil, err + } + + // 检查服务是否存在 + service, ok := singleton.ServiceSentinelShared.Get(serviceID) + if !ok || service == nil { + return nil, singleton.Localizer.ErrorT("service not found") + } + + // 解析时间范围 + periodStr := c.DefaultQuery("period", "1d") + period, err := tsdb.ParseQueryPeriod(periodStr) + if err != nil { + return nil, err + } + + // 权限检查:未登录用户只能查看 1d 数据 + _, isMember := c.Get(model.CtxKeyAuthorizedUser) + if !isMember && period != tsdb.Period1Day { + return nil, singleton.Localizer.ErrorT("unauthorized: only 1d data available for guests") + } + + response := &model.ServiceHistoryResponse{ + ServiceID: serviceID, + ServiceName: service.Name, + Servers: make([]model.ServerServiceStats, 0), + } + + if 
!singleton.TSDBEnabled() { + return queryServiceHistoryFromDB(serviceID, period, response) + } + + result, err := singleton.TSDBShared.QueryServiceHistory(serviceID, period) + if err != nil { + return nil, err + } + + serverMap := singleton.ServerShared.GetList() + + for i := range result.Servers { + if server, ok := serverMap[result.Servers[i].ServerID]; ok { + result.Servers[i].ServerName = server.Name + } + } + response.Servers = result.Servers + + return response, nil +} + +func queryServiceHistoryFromDB(serviceID uint64, period tsdb.QueryPeriod, response *model.ServiceHistoryResponse) (*model.ServiceHistoryResponse, error) { + since := time.Now().Add(-period.Duration()) + + var histories []model.ServiceHistory + if err := singleton.DB.Where("service_id = ? AND server_id != 0 AND created_at >= ?", serviceID, since). + Order("server_id, created_at").Find(&histories).Error; err != nil { + return nil, err + } + + serverMap := singleton.ServerShared.GetList() + grouped := make(map[uint64][]model.ServiceHistory) + for _, h := range histories { + grouped[h.ServerID] = append(grouped[h.ServerID], h) + } + + for serverID, records := range grouped { + stats := model.ServerServiceStats{ + ServerID: serverID, + } + if server, ok := serverMap[serverID]; ok { + stats.ServerName = server.Name + } + + var totalDelay float64 + var totalUp, totalDown uint64 + dps := make([]model.DataPoint, 0, len(records)) + for _, r := range records { + status := uint8(1) + if r.Down > 0 && r.Up == 0 { + status = 0 + } + dps = append(dps, model.DataPoint{ + Timestamp: r.CreatedAt.Unix() * 1000, + Delay: r.AvgDelay, + Status: status, + }) + totalDelay += r.AvgDelay + totalUp += r.Up + totalDown += r.Down + } + + var avgDelay float64 + if len(records) > 0 { + avgDelay = totalDelay / float64(len(records)) + } + var upPercent float32 + if totalUp+totalDown > 0 { + upPercent = float32(totalUp) / float32(totalUp+totalDown) * 100 + } + stats.Stats = model.ServiceHistorySummary{ + AvgDelay: avgDelay, 
+ UpPercent: upPercent, + TotalUp: totalUp, + TotalDown: totalDown, + DataPoints: dps, + } + response.Servers = append(response.Servers, stats) + } + + return response, nil +} + +// List server services // @Summary List service histories by server id // @Security BearerAuth // @Schemes -// @Description List service histories by server id +// @Description List service histories for a specific server // @Tags common // @param id path uint true "Server ID" +// @param period query string false "Time period: 1d, 7d, 30d (default: 1d)" // @Produce json // @Success 200 {object} model.CommonResponse[[]model.ServiceInfos] -// @Router /service/{id} [get] -func listServiceHistory(c *gin.Context) ([]*model.ServiceInfos, error) { +// @Router /server/{id}/service [get] +func listServerServices(c *gin.Context) ([]*model.ServiceInfos, error) { idStr := c.Param("id") - id, err := strconv.ParseUint(idStr, 10, 64) + serverID, err := strconv.ParseUint(idStr, 10, 64) if err != nil { return nil, err } m := singleton.ServerShared.GetList() - server, ok := m[id] + server, ok := m[serverID] if !ok || server == nil { return nil, singleton.Localizer.ErrorT("server not found") } _, isMember := c.Get(model.CtxKeyAuthorizedUser) - authorized := isMember // TODO || isViewPasswordVerfied + authorized := isMember if server.HideForGuest && !authorized { return nil, singleton.Localizer.ErrorT("unauthorized") } - var serviceHistories []*model.ServiceHistory - if err := singleton.DB.Model(&model.ServiceHistory{}).Select("service_id, created_at, server_id, avg_delay"). - Where("server_id = ?", id).Where("created_at >= ?", time.Now().Add(-24*time.Hour)).Order("service_id, created_at"). 
- Scan(&serviceHistories).Error; err != nil { + // 解析时间范围 + periodStr := c.DefaultQuery("period", "1d") + period, err := tsdb.ParseQueryPeriod(periodStr) + if err != nil { return nil, err } - var sortedServiceIDs []uint64 - resultMap := make(map[uint64]*model.ServiceInfos) - for _, history := range serviceHistories { - infos, ok := resultMap[history.ServiceID] - service, _ := singleton.ServiceSentinelShared.Get(history.ServiceID) - if !ok { - infos = &model.ServiceInfos{ - ServiceID: history.ServiceID, - ServerID: history.ServerID, - ServiceName: service.Name, - ServerName: m[history.ServerID].Name, + // 权限检查:未登录用户只能查看 1d 数据 + if !isMember && period != tsdb.Period1Day { + return nil, singleton.Localizer.ErrorT("unauthorized: only 1d data available for guests") + } + + services := singleton.ServiceSentinelShared.GetSortedList() + + var result []*model.ServiceInfos + + if !singleton.TSDBEnabled() { + return queryServerServicesFromDB(serverID, server.Name, period, services) + } + + historyResults, err := singleton.TSDBShared.QueryServiceHistoryByServerID(serverID, period) + if err != nil { + return nil, err + } + + for _, service := range services { + if service.Cover == model.ServiceCoverAll { + if service.SkipServers[serverID] { + continue + } + } else { + if !service.SkipServers[serverID] { + continue } - resultMap[history.ServiceID] = infos - sortedServiceIDs = append(sortedServiceIDs, history.ServiceID) } - infos.CreatedAt = append(infos.CreatedAt, history.CreatedAt.Truncate(time.Minute).Unix()*1000) - infos.AvgDelay = append(infos.AvgDelay, history.AvgDelay) + + historyResult, ok := historyResults[service.ID] + if !ok || len(historyResult.Servers) == 0 { + continue + } + + serverStats := historyResult.Servers[0] + + infos := &model.ServiceInfos{ + ServiceID: service.ID, + ServerID: serverID, + ServiceName: service.Name, + ServerName: server.Name, + DisplayIndex: service.DisplayIndex, + CreatedAt: make([]int64, len(serverStats.Stats.DataPoints)), + AvgDelay: 
make([]float64, len(serverStats.Stats.DataPoints)), + } + + for i, dp := range serverStats.Stats.DataPoints { + infos.CreatedAt[i] = dp.Timestamp + infos.AvgDelay[i] = dp.Delay + } + + result = append(result, infos) } - ret := make([]*model.ServiceInfos, 0, len(sortedServiceIDs)) - for _, id := range sortedServiceIDs { - ret = append(ret, resultMap[id]) + return result, nil +} + +func queryServerServicesFromDB(serverID uint64, serverName string, period tsdb.QueryPeriod, services []*model.Service) ([]*model.ServiceInfos, error) { + since := time.Now().Add(-period.Duration()) + + var histories []model.ServiceHistory + if err := singleton.DB.Where("server_id = ? AND created_at >= ?", serverID, since). + Order("service_id, created_at").Find(&histories).Error; err != nil { + return nil, err } - return ret, nil + grouped := make(map[uint64][]model.ServiceHistory) + for _, h := range histories { + grouped[h.ServiceID] = append(grouped[h.ServiceID], h) + } + + var result []*model.ServiceInfos + for _, service := range services { + if service.Cover == model.ServiceCoverAll { + if service.SkipServers[serverID] { + continue + } + } else { + if !service.SkipServers[serverID] { + continue + } + } + + records, ok := grouped[service.ID] + if !ok { + continue + } + + infos := &model.ServiceInfos{ + ServiceID: service.ID, + ServerID: serverID, + ServiceName: service.Name, + ServerName: serverName, + DisplayIndex: service.DisplayIndex, + CreatedAt: make([]int64, 0, len(records)), + AvgDelay: make([]float64, 0, len(records)), + } + + for _, r := range records { + infos.CreatedAt = append(infos.CreatedAt, r.CreatedAt.Truncate(time.Minute).Unix()*1000) + infos.AvgDelay = append(infos.AvgDelay, r.AvgDelay) + } + + result = append(result, infos) + } + + return result, nil } // List server with service // @Summary List server with service // @Security BearerAuth // @Schemes -// @Description List server with service +// @Description List servers that have service monitoring data // @Tags 
common // @Produce json // @Success 200 {object} model.CommonResponse[[]uint64] // @Router /service/server [get] func listServerWithServices(c *gin.Context) ([]uint64, error) { - var serverIdsWithService []uint64 - if err := singleton.DB.Model(&model.ServiceHistory{}). - Select("distinct(server_id)"). - Where("server_id != 0"). - Find(&serverIdsWithService).Error; err != nil { - return nil, newGormError("%v", err) + // 从内存中获取有服务监控配置的服务器列表 + services := singleton.ServiceSentinelShared.GetList() + serverMap := singleton.ServerShared.GetList() + + serverIDSet := make(map[uint64]bool) + + for _, service := range services { + if service.Cover == model.ServiceCoverAll { + // 除了跳过的服务器,其他都包含 + for serverID := range serverMap { + if !service.SkipServers[serverID] { + serverIDSet[serverID] = true + } + } + } else { + // 只包含指定的服务器 + for serverID, enabled := range service.SkipServers { + if enabled { + serverIDSet[serverID] = true + } + } + } } _, isMember := c.Get(model.CtxKeyAuthorizedUser) - authorized := isMember // TODO || isViewPasswordVerfied + authorized := isMember var ret []uint64 - for _, id := range serverIdsWithService { - server, ok := singleton.ServerShared.Get(id) + for id := range serverIDSet { + server, ok := serverMap[id] if !ok || server == nil { - return nil, singleton.Localizer.ErrorT("server not found") + continue } if !server.HideForGuest || authorized { ret = append(ret, id) @@ -191,6 +416,7 @@ func createService(c *gin.Context) (uint64, error) { m.Type = mf.Type m.SkipServers = mf.SkipServers m.Cover = mf.Cover + m.DisplayIndex = mf.DisplayIndex m.Notify = mf.Notify m.NotificationGroupID = mf.NotificationGroupID m.Duration = mf.Duration @@ -210,21 +436,6 @@ func createService(c *gin.Context) (uint64, error) { return 0, newGormError("%v", err) } - var skipServers []uint64 - for k := range m.SkipServers { - skipServers = append(skipServers, k) - } - - var err error - if m.Cover == 0 { - err = singleton.DB.Unscoped().Delete(&model.ServiceHistory{}, 
"service_id = ? and server_id in (?)", m.ID, skipServers).Error - } else { - err = singleton.DB.Unscoped().Delete(&model.ServiceHistory{}, "service_id = ? and server_id not in (?)", m.ID, skipServers).Error - } - if err != nil { - return 0, err - } - if err := singleton.ServiceSentinelShared.Update(&m); err != nil { return 0, err } @@ -269,6 +480,7 @@ func updateService(c *gin.Context) (any, error) { m.Type = mf.Type m.SkipServers = mf.SkipServers m.Cover = mf.Cover + m.DisplayIndex = mf.DisplayIndex m.Notify = mf.Notify m.NotificationGroupID = mf.NotificationGroupID m.Duration = mf.Duration @@ -288,17 +500,6 @@ func updateService(c *gin.Context) (any, error) { return nil, newGormError("%v", err) } - skipServers := utils.MapKeysToSlice(mf.SkipServers) - - if m.Cover == model.ServiceCoverAll { - err = singleton.DB.Unscoped().Delete(&model.ServiceHistory{}, "service_id = ? and server_id in (?)", m.ID, skipServers).Error - } else { - err = singleton.DB.Unscoped().Delete(&model.ServiceHistory{}, "service_id = ? and server_id not in (?) 
and server_id > 0", m.ID, skipServers).Error - } - if err != nil { - return nil, err - } - if err := singleton.ServiceSentinelShared.Update(&m); err != nil { return nil, err } @@ -329,10 +530,7 @@ func batchDeleteService(c *gin.Context) (any, error) { } err := singleton.DB.Transaction(func(tx *gorm.DB) error { - if err := tx.Unscoped().Delete(&model.Service{}, "id in (?)", ids).Error; err != nil { - return err - } - return tx.Unscoped().Delete(&model.ServiceHistory{}, "service_id in (?)", ids).Error + return tx.Unscoped().Delete(&model.Service{}, "id in (?)", ids).Error }) if err != nil { return nil, err diff --git a/cmd/dashboard/controller/setting.go b/cmd/dashboard/controller/setting.go index 427b901..4fd8906 100644 --- a/cmd/dashboard/controller/setting.go +++ b/cmd/dashboard/controller/setting.go @@ -39,6 +39,7 @@ func listConfig(c *gin.Context) (*model.SettingResponse, error) { }, Version: singleton.Version, FrontendTemplates: singleton.FrontendTemplates, + TSDBEnabled: singleton.TSDBEnabled(), } if !authorized || !isAdmin { @@ -54,6 +55,7 @@ func listConfig(c *gin.Context) (*model.SettingResponse, error) { ConfigDashboard: configDashboard, Oauth2Providers: config.Oauth2Providers, }, + TSDBEnabled: singleton.TSDBEnabled(), } } @@ -113,3 +115,17 @@ func updateConfig(c *gin.Context) (any, error) { singleton.OnUpdateLang(singleton.Conf.Language) return nil, nil } + +// Perform maintenance +// @Summary Perform maintenance +// @Security BearerAuth +// @Schemes +// @Description Perform system maintenance (SQLite VACUUM and TSDB maintenance) +// @Tags admin required +// @Produce json +// @Success 200 {object} model.CommonResponse[any] +// @Router /maintenance [post] +func runMaintenance(c *gin.Context) (any, error) { + singleton.PerformMaintenance() + return nil, nil +} diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go index 0ae23de..63752a2 100644 --- a/cmd/dashboard/main.go +++ b/cmd/dashboard/main.go @@ -11,6 +11,7 @@ import ( "net" "net/http" "os" + 
"runtime/debug" "strings" "time" _ "time/tzdata" @@ -65,8 +66,8 @@ func initSystem(bus chan<- *model.Service) error { return err } - // 每天的3:30 对 监控记录 和 流量记录 进行清理 - if _, err := singleton.CronShared.AddFunc("0 30 3 * * *", singleton.CleanServiceHistory); err != nil { + // 每天的3:30 对流量记录进行清理 + if _, err := singleton.CronShared.AddFunc("0 30 3 * * *", singleton.CleanMonitorHistory); err != nil { return err } @@ -109,12 +110,19 @@ func main() { os.Exit(0) } - serviceSentinelDispatchBus := make(chan *model.Service) // 用于传递服务监控任务信息的channel - // 初始化 dao 包 + serviceSentinelDispatchBus := make(chan *model.Service) if err := utils.FirstError(singleton.InitFrontendTemplates, func() error { return singleton.InitConfigFromPath(dashboardCliParam.ConfigFile) }, singleton.InitTimezoneAndCache, + func() error { + if singleton.Conf.Memory.GoMemLimitMB > 0 { + debug.SetMemoryLimit(singleton.Conf.Memory.GoMemLimitMB * 1024 * 1024) + log.Printf("NEZHA>> Go memory limit set to %d MB", singleton.Conf.Memory.GoMemLimitMB) + } + return nil + }, func() error { return singleton.InitDBFromPath(dashboardCliParam.DatabaseLocation) }, + singleton.InitTSDB, func() error { return initSystem(serviceSentinelDispatchBus) }); err != nil { log.Fatal(err) } @@ -124,7 +132,7 @@ func main() { log.Fatal(err) } - singleton.CleanServiceHistory() + singleton.CleanMonitorHistory() rpc.DispatchKeepalive() go rpc.DispatchTask(serviceSentinelDispatchBus) go singleton.AlertSentinelStart() @@ -172,6 +180,7 @@ func main() { }, func(c context.Context) error { log.Println("NEZHA>> Graceful::START") singleton.RecordTransferHourlyUsage() + singleton.CloseTSDB() log.Println("NEZHA>> Graceful::END") var err error if muxServerHTTPS != nil { diff --git a/go.mod b/go.mod index a1278bc..d604d2a 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,9 @@ module github.com/nezhahq/nezha -go 1.25 +go 1.25.6 require ( + github.com/VictoriaMetrics/VictoriaMetrics v1.134.0 github.com/appleboy/gin-jwt/v2 v2.10.3 
github.com/dustinkirkland/golang-petname v0.0.0-20240428194347-eebcea082ee0 github.com/gin-contrib/pprof v1.5.3 @@ -15,7 +16,7 @@ require ( github.com/knadh/koanf/maps v0.1.2 github.com/knadh/koanf/providers/env v1.1.0 github.com/knadh/koanf/providers/file v1.2.0 - github.com/knadh/koanf/v2 v2.2.0 + github.com/knadh/koanf/v2 v2.3.0 github.com/leonelquinteros/gotext v1.7.1 github.com/libdns/cloudflare v0.2.1 github.com/libdns/he v1.1.1 @@ -26,17 +27,18 @@ require ( github.com/oschwald/maxminddb-golang v1.13.1 github.com/patrickmn/go-cache v2.1.0+incompatible github.com/robfig/cron/v3 v3.0.1 - github.com/stretchr/testify v1.10.0 + github.com/stretchr/testify v1.11.1 github.com/swaggo/files v1.0.1 github.com/swaggo/gin-swagger v1.6.0 + github.com/swaggo/swag v1.16.4 github.com/tidwall/gjson v1.18.0 golang.org/x/crypto v0.45.0 - golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 + golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 golang.org/x/net v0.47.0 - golang.org/x/oauth2 v0.29.0 + golang.org/x/oauth2 v0.32.0 golang.org/x/sync v0.18.0 - google.golang.org/grpc v1.72.0 - google.golang.org/protobuf v1.36.6 + google.golang.org/grpc v1.76.0 + google.golang.org/protobuf v1.36.10 gorm.io/driver/sqlite v1.5.7 gorm.io/gorm v1.26.0 sigs.k8s.io/yaml v1.4.0 @@ -44,8 +46,13 @@ require ( require ( github.com/KyleBanks/depth v1.2.1 // indirect + github.com/VictoriaMetrics/easyproto v1.1.3 // indirect + github.com/VictoriaMetrics/fastcache v1.13.2 // indirect + github.com/VictoriaMetrics/metrics v1.40.2 // indirect + github.com/VictoriaMetrics/metricsql v0.84.8 // indirect github.com/bytedance/sonic v1.13.2 // indirect github.com/bytedance/sonic/loader v0.2.4 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect @@ -59,10 +66,12 @@ require ( github.com/go-playground/universal-translator v0.18.1 // 
indirect github.com/go-playground/validator/v10 v10.26.0 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect + github.com/golang/snappy v1.0.0 // indirect github.com/jinzhu/inflection v1.0.0 // indirect github.com/jinzhu/now v1.1.5 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/cpuid/v2 v2.2.10 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect @@ -71,23 +80,28 @@ require ( github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/swaggo/swag v1.16.4 // indirect + github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/tidwall/sjson v1.2.5 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect + github.com/valyala/bytebufferpool v1.0.0 // indirect + github.com/valyala/fastrand v1.1.0 // indirect + github.com/valyala/gozstd v1.24.0 // indirect + github.com/valyala/histogram v1.2.0 // indirect + github.com/valyala/quicktemplate v1.8.0 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect golang.org/x/arch v0.16.0 // indirect golang.org/x/mod v0.29.0 // indirect golang.org/x/sys v0.38.0 // indirect golang.org/x/text v0.31.0 // indirect - golang.org/x/time v0.11.0 // indirect + golang.org/x/time v0.14.0 // indirect golang.org/x/tools v0.38.0 // indirect - 
google.golang.org/genproto/googleapis/rpc v0.0.0-20250428153025-10db94c68c34 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251007200510-49b9836ed3ff // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 407e96c..22350d6 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,17 @@ github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= +github.com/VictoriaMetrics/VictoriaMetrics v1.134.0 h1:0FgGM0rVRcTzd9qtO1gHlgLC/kBA1gsi+iwSYXAa/rQ= +github.com/VictoriaMetrics/VictoriaMetrics v1.134.0/go.mod h1:vUnt83zBB65TkVq7zSjSuJFUZFwnmV+hdF2KmkUbY0U= +github.com/VictoriaMetrics/easyproto v1.1.3 h1:gRSA3ZQs7n4+5I+SniDWD59jde1jVq4JmgQ9HUUyvk4= +github.com/VictoriaMetrics/easyproto v1.1.3/go.mod h1:QlGlzaJnDfFd8Lk6Ci/fuLxfTo3/GThPs2KH23mv710= +github.com/VictoriaMetrics/fastcache v1.13.2 h1:2XTB49aLSuCex7e9P5rqrfQcMkzGjh5Vq3GMFa8YpCA= +github.com/VictoriaMetrics/fastcache v1.13.2/go.mod h1:hHXhl4DA2fTL2HTZDJFXWgW0LNjo6B+4aj2Wmng3TjU= +github.com/VictoriaMetrics/metrics v1.40.2 h1:OVSjKcQEx6JAwGeu8/KQm9Su5qJ72TMEW4xYn5vw3Ac= +github.com/VictoriaMetrics/metrics v1.40.2/go.mod h1:XE4uudAAIRaJE614Tl5HMrtoEU6+GDZO4QTnNSsZRuA= +github.com/VictoriaMetrics/metricsql v0.84.8 h1:5JXrvPJiYkYNqJVT7+hMZmpAwRHd3txBdlVIw4rJ1VM= +github.com/VictoriaMetrics/metricsql v0.84.8/go.mod h1:d4EisFO6ONP/HIGDYTAtwrejJBBeKGQYiRl095bS4QQ= +github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8= +github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM= github.com/appleboy/gin-jwt/v2 v2.10.3 h1:KNcPC+XPRNpuoBh+j+rgs5bQxN+SwG/0tHbIqpRoBGc= github.com/appleboy/gin-jwt/v2 v2.10.3/go.mod h1:LDUaQ8mF2W6LyXIbd5wqlV2SFebuyYs4RDwqMNgpsp8= github.com/appleboy/gofight/v2 v2.1.2 h1:VOy3jow4vIK8BRQJoC/I9muxyYlJ2yb9ht2hZoS3rf4= @@ -9,6 +21,8 @@ 
github.com/bytedance/sonic v1.13.2/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY= github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= @@ -30,8 +44,8 @@ github.com/gin-contrib/sse v1.1.0 h1:n0w2GMuUpWDVp7qSpvze6fAu9iRxJY4Hmj6AmBOU05w github.com/gin-contrib/sse v1.1.0/go.mod h1:hxRZ5gVpWMT7Z0B0gSNYqqsSCNIJMjzvm6fqCz9vjwM= github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= @@ -58,9 +72,11 @@ github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXe github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= github.com/golang/protobuf v1.5.4 
h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= +github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -78,6 +94,8 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= @@ -87,8 +105,8 @@ github.com/knadh/koanf/providers/env v1.1.0 h1:U2VXPY0f+CsNDkvdsG8GcsnK4ah85WwWy github.com/knadh/koanf/providers/env v1.1.0/go.mod h1:QhHHHZ87h9JxJAn2czdEl6pdkNnDh/JS1Vtsyt65hTY= 
github.com/knadh/koanf/providers/file v1.2.0 h1:hrUJ6Y9YOA49aNu/RSYzOTFlqzXSCpmYIDXI7OJU6+U= github.com/knadh/koanf/providers/file v1.2.0/go.mod h1:bp1PM5f83Q+TOUu10J/0ApLBd9uIzg+n9UgthfY+nRA= -github.com/knadh/koanf/v2 v2.2.0 h1:FZFwd9bUjpb8DyCWARUBy5ovuhDs1lI87dOEn2K8UVU= -github.com/knadh/koanf/v2 v2.2.0/go.mod h1:PSFru3ufQgTsI7IF+95rf9s8XA1+aHxKuO/W+dPoHEY= +github.com/knadh/koanf/v2 v2.3.0 h1:Qg076dDRFHvqnKG97ZEsi9TAg2/nFTa9hCdcSa1lvlM= +github.com/knadh/koanf/v2 v2.3.0/go.mod h1:gRb40VRAbd4iJMYYD5IxZ6hfuopFcXBpc9bbQpZwo28= github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -119,8 +137,9 @@ github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/nezhahq/libdns-tencentcloud v0.0.0-20250501081622-bd293105845a h1:wCB9wDZi2JlTfMtE09s5VjSaQpk4EXegvja4wEzx2vk= github.com/nezhahq/libdns-tencentcloud v0.0.0-20250501081622-bd293105845a/go.mod h1:CUbNGv2k24auuhwa7MMVXl45fniBMm2eVi57FlWLcIs= github.com/ory/graceful v0.1.3 h1:FaeXcHZh168WzS+bqruqWEw/HgXWLdNv2nJ+fbhxbhc= @@ -138,8 
+157,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -149,8 +168,8 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= github.com/swaggo/files v1.0.1/go.mod h1:0qXmMNH6sXNf+73t65aKeB+ApmgxdnkQzVTAj2uaMUg= github.com/swaggo/gin-swagger v1.6.0 h1:y8sxvQ3E20/RCyrXeFfg60r6H0Z+SwpTjMYsMm+zy8M= @@ -171,29 
+190,39 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fastrand v1.1.0 h1:f+5HkLW4rsgzdNoleUOB69hyT9IlD2ZQh9GyDMfb5G8= +github.com/valyala/fastrand v1.1.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ= +github.com/valyala/gozstd v1.24.0 h1:M/9L3h7bVwbj2gZwrmuoaxzwVrmBUvos2jG9cZtuhlc= +github.com/valyala/gozstd v1.24.0/go.mod h1:y5Ew47GLlP37EkTB+B4s7r6A5rdaeB7ftbl9zoYiIPQ= +github.com/valyala/histogram v1.2.0 h1:wyYGAZZt3CpwUiIb9AU/Zbllg1llXyrtApRS815OLoQ= +github.com/valyala/histogram v1.2.0/go.mod h1:Hb4kBwb4UxsaNbbbh+RRz8ZR6pdodR57tzWUS3BUzXY= +github.com/valyala/quicktemplate v1.8.0 h1:zU0tjbIqTRgKQzFY1L42zq0qR3eh4WoQQdIdqCysW5k= +github.com/valyala/quicktemplate v1.8.0/go.mod h1:qIqW8/igXt8fdrUln5kOSb+KWMaJ4Y8QUsfd1k6L2jM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= -go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI= -go.opentelemetry.io/otel/metric v1.34.0 
h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= -go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= -go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= golang.org/x/arch v0.16.0 h1:foMtLTdyOmIniqWCHjY6+JxuC54XP1fDwx4N0ASyW+U= golang.org/x/arch v0.16.0/go.mod h1:JmwW7aLIoRUKgaTzhkiEFxvcEiQGyOg9BMonBJUS7EE= golang.org/x/crypto 
v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= -golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 h1:R84qjqJb5nVJMxqWYb3np9L5ZsaDtB+a39EqjV0JSUM= -golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0/go.mod h1:S9Xr4PYopiDyqSyp5NjCrhFrqg6A5zA2E/iPHPhqnS8= +golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 h1:TQwNpfvNkxAVlItJf6Cr5JTsVZoC/Sj7K3OZv2Pc14A= +golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9/go.mod h1:TwQYMMnGpvZyc+JpB/UAuTNIsVJifOlSkrZkhcvpVUk= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= @@ -203,8 +232,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= -golang.org/x/oauth2 v0.29.0 h1:WdYw2tdTK1S8olAzWHdgeqfy+Mtm9XNhv/xJsY65d98= -golang.org/x/oauth2 v0.29.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= +golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= @@ -227,20 
+256,22 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250428153025-10db94c68c34 h1:h6p3mQqrmT1XkHVTfzLdNz1u7IhINeZkz67/xTbOuWs= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250428153025-10db94c68c34/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.72.0 h1:S7UkcVa60b5AAQTaO6ZKamFp1zMZSU0fGDK2WZLbBnM= -google.golang.org/grpc v1.72.0/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= 
// TSDBConf holds the "tsdb" section of the dashboard configuration.
// NOTE(review): the field names and koanf keys mirror tsdb.Config in pkg/tsdb/config.go;
// the notes below are taken from that type — confirm they also hold for values read through this struct.
type TSDBConf struct {
	// DataPath is the storage directory (tsdb.Config treats an empty path as "TSDB disabled").
	DataPath string `koanf:"data_path" json:"data_path,omitempty"`
	// RetentionDays is the number of days data is kept.
	RetentionDays uint16 `koanf:"retention_days" json:"retention_days,omitempty"`
	// MinFreeDiskSpaceGB is the minimum free disk space, in GB.
	MinFreeDiskSpaceGB float64 `koanf:"min_free_disk_space_gb" json:"min_free_disk_space_gb,omitempty"`
	// MaxMemoryMB is the maximum memory usage, in MB.
	MaxMemoryMB int64 `koanf:"max_memory_mb" json:"max_memory_mb,omitempty"`
	// WriteBufferSize is the write buffer size.
	WriteBufferSize int `koanf:"write_buffer_size" json:"write_buffer_size,omitempty"`
	// WriteBufferFlushInterval is the write-buffer flush interval as a plain integer.
	// NOTE(review): the unit is not visible here (tsdb.Config uses a time.Duration) — presumably seconds, confirm.
	WriteBufferFlushInterval int `koanf:"write_buffer_flush_interval" json:"write_buffer_flush_interval,omitempty"`
}
// MemoryConf holds the "memory" section of the dashboard configuration.
type MemoryConf struct {
	// GoMemLimitMB is the Go runtime memory limit in MB; 0 means no limit.
	GoMemLimitMB int64 `koanf:"go_mem_limit_mb" json:"go_mem_limit_mb,omitempty"`
}
// ServiceInfos carries service monitoring information in the shape of the legacy API
// (note the "monitor_*" JSON keys kept for compatibility).
type ServiceInfos struct {
	ServiceID    uint64    `json:"monitor_id"`
	ServerID     uint64    `json:"server_id"`
	ServiceName  string    `json:"monitor_name"`
	ServerName   string    `json:"server_name"`
	DisplayIndex int       `json:"display_index"` // display order; larger values are listed first
	CreatedAt    []int64   `json:"created_at"`
	AvgDelay     []float64 `json:"avg_delay"`
}

// DataPoint is a single point of a service history series.
type DataPoint struct {
	Timestamp int64   `json:"ts"`
	Delay     float64 `json:"delay"`
	Status    uint8   `json:"status"` // 1 = success, 0 = failure
}

// ServiceHistorySummary is the statistical summary of a service's history:
// average delay, up percentage, total up/down counts and the optional data points.
type ServiceHistorySummary struct {
	AvgDelay   float64     `json:"avg_delay"`
	UpPercent  float32     `json:"up_percent"`
	TotalUp    uint64      `json:"total_up"`
	TotalDown  uint64      `json:"total_down"`
	DataPoints []DataPoint `json:"data_points,omitempty"`
}

// ServerServiceStats holds the statistics one server recorded for one service.
type ServerServiceStats struct {
	ServerID   uint64                `json:"server_id"`
	ServerName string                `json:"server_name,omitempty"`
	Stats      ServiceHistorySummary `json:"stats"`
}
// Defaults applied by DefaultConfig, and by Validate for unset or invalid
// fields. They are declared once so the two functions cannot drift apart.
const (
	defaultRetentionDays            = 30
	defaultMinFreeDiskSpaceGB       = 1
	defaultMaxMemoryMB              = 256
	defaultDedupInterval            = 30 * time.Second
	defaultWriteBufferSize          = 512
	defaultWriteBufferFlushInterval = 5 * time.Second
)

// Config holds the TSDB options.
type Config struct {
	// DataPath is the data storage path; TSDB is not enabled when it is empty.
	DataPath string `koanf:"data_path" json:"data_path,omitempty"`
	// RetentionDays is the number of days data is retained; defaults to 30.
	RetentionDays uint16 `koanf:"retention_days" json:"retention_days,omitempty"`
	// MinFreeDiskSpaceGB is the minimum free disk space in GB; defaults to 1.
	// When free space drops below this value, TSDB stops accepting new data
	// to keep the disk from filling up.
	MinFreeDiskSpaceGB float64 `koanf:"min_free_disk_space_gb" json:"min_free_disk_space_gb,omitempty"`
	// MaxMemoryMB is the maximum memory usage in MB; defaults to 256.
	// It is used to limit the VictoriaMetrics caches.
	MaxMemoryMB int64 `koanf:"max_memory_mb" json:"max_memory_mb,omitempty"`
	// DedupInterval is the deduplication interval; defaults to 30 seconds.
	DedupInterval time.Duration `koanf:"dedup_interval" json:"dedup_interval,omitempty"`
	// WriteBufferSize is the write buffer size; defaults to 512.
	// Buffered entries are written in a batch once this count is reached.
	WriteBufferSize int `koanf:"write_buffer_size" json:"write_buffer_size,omitempty"`
	// WriteBufferFlushInterval is the write buffer flush interval; defaults to 5 seconds.
	WriteBufferFlushInterval time.Duration `koanf:"write_buffer_flush_interval" json:"write_buffer_flush_interval,omitempty"`
}

// DefaultConfig returns a Config filled with the default values. DataPath is
// left empty, so TSDB stays disabled until a path is configured explicitly.
func DefaultConfig() *Config {
	return &Config{
		DataPath:                 "",
		RetentionDays:            defaultRetentionDays,
		MinFreeDiskSpaceGB:       defaultMinFreeDiskSpaceGB,
		MaxMemoryMB:              defaultMaxMemoryMB,
		DedupInterval:            defaultDedupInterval,
		WriteBufferSize:          defaultWriteBufferSize,
		WriteBufferFlushInterval: defaultWriteBufferFlushInterval,
	}
}

// Validate replaces every unset or invalid field (zero, negative, or NaN)
// with its default value. DataPath is never touched.
func (c *Config) Validate() {
	if c.RetentionDays == 0 {
		c.RetentionDays = defaultRetentionDays
	}
	// Written as !(x > 0) rather than x <= 0 so that NaN, which fails every
	// ordered comparison, is replaced by the default as well.
	if !(c.MinFreeDiskSpaceGB > 0) {
		c.MinFreeDiskSpaceGB = defaultMinFreeDiskSpaceGB
	}
	if c.MaxMemoryMB <= 0 {
		c.MaxMemoryMB = defaultMaxMemoryMB
	}
	if c.DedupInterval <= 0 {
		c.DedupInterval = defaultDedupInterval
	}
	if c.WriteBufferSize <= 0 {
		c.WriteBufferSize = defaultWriteBufferSize
	}
	if c.WriteBufferFlushInterval <= 0 {
		c.WriteBufferFlushInterval = defaultWriteBufferFlushInterval
	}
}

// Enabled reports whether TSDB is enabled, i.e. whether DataPath is non-empty.
func (c *Config) Enabled() bool {
	return c.DataPath != ""
}

// MinFreeDiskSpaceBytes returns MinFreeDiskSpaceGB converted to bytes.
// Values too large for int64 (including +Inf) are clamped to the largest
// int64, because converting an out-of-range float to an integer is
// implementation-defined in Go.
func (c *Config) MinFreeDiskSpaceBytes() int64 {
	// 2^63 is the smallest float64 value that no longer fits in an int64.
	const int64Limit = 1 << 63

	size := c.MinFreeDiskSpaceGB * 1024 * 1024 * 1024
	if size >= int64Limit {
		return int64Limit - 1
	}
	return int64(size)
}
// QueryPeriod names a supported look-back window for history queries.
type QueryPeriod string

// Supported look-back windows.
const (
	Period1Day   QueryPeriod = "1d"
	Period7Days  QueryPeriod = "7d"
	Period30Days QueryPeriod = "30d"
)

// ParseQueryPeriod converts the textual form of a period into a QueryPeriod.
// The empty string selects Period1Day; anything other than "1d", "7d" or
// "30d" yields an error.
func ParseQueryPeriod(s string) (QueryPeriod, error) {
	if s == "" {
		return Period1Day, nil
	}
	switch candidate := QueryPeriod(s); candidate {
	case Period1Day, Period7Days, Period30Days:
		return candidate, nil
	}
	return "", fmt.Errorf("invalid period: %s, expected 1d, 7d, or 30d", s)
}

// Duration reports how far back in time the period reaches. Any value other
// than Period7Days and Period30Days is treated as one day.
func (p QueryPeriod) Duration() time.Duration {
	const day = 24 * time.Hour
	if p == Period7Days {
		return 7 * day
	}
	if p == Period30Days {
		return 30 * day
	}
	return day
}

// DownsampleInterval reports the bucket width used when downsampling a query
// over this period:
//
//	1d:  one point per 5 minutes  (288 points)
//	7d:  one point per 30 minutes (336 points)
//	30d: one point per 2 hours    (360 points)
//
// Any other value uses the 1d width.
func (p QueryPeriod) DownsampleInterval() time.Duration {
	if p == Period7Days {
		return 30 * time.Minute
	}
	if p == Period30Days {
		return 2 * time.Hour
	}
	return 5 * time.Minute
}
// rawDataPoint is one sample as collected by the query functions in this file.
// For service history, the delay sample and the status sample that share a
// timestamp are merged into a single rawDataPoint.
type rawDataPoint struct {
	timestamp int64   // sample timestamp as returned by storage (query ranges here are built with UnixMilli)
	value     float64 // delay value for service history; the plain metric value for server metrics
	status    float64 // status sample; values >= 0.5 are counted as "up"
	hasDelay  bool    // set when value was filled from a delay sample
	hasStatus bool    // set when status was filled from a status sample
}
period.DownsampleInterval()) + result.Servers = append(result.Servers, ServerServiceStats{ + ServerID: serverID, + Stats: stats, + }) + } + + sort.Slice(result.Servers, func(i, j int) bool { + return result.Servers[i].ServerID < result.Servers[j].ServerID + }) + + return result, nil +} + +type DailyServiceStats struct { + Up uint64 + Down uint64 + Delay float64 +} + +func (db *TSDB) QueryServiceDailyStats(serviceID uint64, today time.Time, days int) ([]DailyServiceStats, error) { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return nil, fmt.Errorf("TSDB is closed") + } + + stats := make([]DailyServiceStats, days) + serviceIDStr := strconv.FormatUint(serviceID, 10) + + start := today.AddDate(0, 0, -(days - 1)) + tr := storage.TimeRange{ + MinTimestamp: start.UnixMilli(), + MaxTimestamp: today.UnixMilli(), + } + + statusData, err := db.queryMetricByServiceID(MetricServiceStatus, serviceIDStr, tr) + if err != nil { + return nil, err + } + delayData, err := db.queryMetricByServiceID(MetricServiceDelay, serviceIDStr, tr) + if err != nil { + return nil, err + } + + for _, points := range statusData { + for _, p := range points { + ts := time.UnixMilli(p.timestamp) + dayIndex := (days - 1) - int(today.Sub(ts).Hours())/24 + if dayIndex < 0 || dayIndex >= days { + continue + } + if p.value >= 0.5 { + stats[dayIndex].Up++ + } else { + stats[dayIndex].Down++ + } + } + } + + delayCount := make([]int, days) + for _, points := range delayData { + for _, p := range points { + ts := time.UnixMilli(p.timestamp) + dayIndex := (days - 1) - int(today.Sub(ts).Hours())/24 + if dayIndex < 0 || dayIndex >= days { + continue + } + stats[dayIndex].Delay = (stats[dayIndex].Delay*float64(delayCount[dayIndex]) + p.value) / float64(delayCount[dayIndex]+1) + delayCount[dayIndex]++ + } + } + + return stats, nil +} + +type metricPoint struct { + timestamp int64 + value float64 +} + +func (db *TSDB) queryMetricByServiceID(metric MetricType, serviceID string, tr storage.TimeRange) 
(map[uint64][]metricPoint, error) { + tfs := storage.NewTagFilters() + if err := tfs.Add(nil, []byte(metric), false, false); err != nil { + return nil, err + } + if err := tfs.Add([]byte("service_id"), []byte(serviceID), false, false); err != nil { + return nil, err + } + + deadline := uint64(time.Now().Add(30 * time.Second).Unix()) + + var search storage.Search + search.Init(nil, db.storage, []*storage.TagFilters{tfs}, tr, 100000, deadline) + defer search.MustClose() + + result := make(map[uint64][]metricPoint) + var timestamps []int64 + var values []float64 + + for search.NextMetricBlock() { + mbr := search.MetricBlockRef + var block storage.Block + mbr.BlockRef.MustReadBlock(&block) + + mn := storage.GetMetricName() + if err := mn.Unmarshal(mbr.MetricName); err != nil { + log.Printf("NEZHA>> TSDB: failed to unmarshal metric name: %v", err) + storage.PutMetricName(mn) + continue + } + + serverIDBytes := mn.GetTagValue("server_id") + if len(serverIDBytes) == 0 { + storage.PutMetricName(mn) + continue + } + + serverID, err := strconv.ParseUint(string(serverIDBytes), 10, 64) + if err != nil { + log.Printf("NEZHA>> TSDB: failed to parse server_id %q: %v", string(serverIDBytes), err) + storage.PutMetricName(mn) + continue + } + storage.PutMetricName(mn) + + if err := block.UnmarshalData(); err != nil { + log.Printf("NEZHA>> TSDB: failed to unmarshal block data: %v", err) + continue + } + + timestamps = timestamps[:0] + values = values[:0] + timestamps, values = block.AppendRowsWithTimeRangeFilter(timestamps, values, tr) + + for i := range timestamps { + result[serverID] = append(result[serverID], metricPoint{ + timestamp: timestamps[i], + value: values[i], + }) + } + } + + if err := search.Error(); err != nil { + return nil, err + } + + return result, nil +} + +func calculateStats(points []rawDataPoint, downsampleInterval time.Duration) ServiceHistorySummary { + if len(points) == 0 { + return ServiceHistorySummary{} + } + + sort.Slice(points, func(i, j int) bool { + 
return points[i].timestamp < points[j].timestamp + }) + + var totalDelay float64 + var delayCount int + var totalUp, totalDown uint64 + + for _, p := range points { + if p.hasDelay { + totalDelay += p.value + delayCount++ + } + if p.hasStatus { + if p.status >= 0.5 { + totalUp++ + } else { + totalDown++ + } + } + } + + summary := ServiceHistorySummary{ + TotalUp: totalUp, + TotalDown: totalDown, + } + + if delayCount > 0 { + summary.AvgDelay = totalDelay / float64(delayCount) + } + + if totalUp+totalDown > 0 { + summary.UpPercent = float32(totalUp) / float32(totalUp+totalDown) * 100 + } + + summary.DataPoints = downsample(points, downsampleInterval) + + return summary +} + +func downsample(points []rawDataPoint, interval time.Duration) []DataPoint { + if len(points) == 0 { + return nil + } + + intervalMs := interval.Milliseconds() + result := make([]DataPoint, 0) + + // points 已排序,线性扫描分桶 + bucketStart := (points[0].timestamp / intervalMs) * intervalMs + var totalDelay float64 + var delayCount, upCount, statusCount int + + flushBucket := func() { + var avgDelay float64 + if delayCount > 0 { + avgDelay = totalDelay / float64(delayCount) + } + var status uint8 + if statusCount > 0 && upCount > statusCount/2 { + status = 1 + } + result = append(result, DataPoint{ + Timestamp: bucketStart, + Delay: avgDelay, + Status: status, + }) + } + + for _, p := range points { + key := (p.timestamp / intervalMs) * intervalMs + if key != bucketStart { + flushBucket() + bucketStart = key + totalDelay = 0 + delayCount = 0 + upCount = 0 + statusCount = 0 + } + if p.hasDelay { + totalDelay += p.value + delayCount++ + } + if p.hasStatus { + statusCount++ + if p.status >= 0.5 { + upCount++ + } + } + } + flushBucket() + + return result +} + +func downsampleMetrics(points []rawDataPoint, interval time.Duration, useLastValue bool) []MetricDataPoint { + if len(points) == 0 { + return nil + } + + sort.Slice(points, func(i, j int) bool { + return points[i].timestamp < points[j].timestamp + }) + 
+ intervalMs := interval.Milliseconds() + result := make([]MetricDataPoint, 0) + + bucketStart := (points[0].timestamp / intervalMs) * intervalMs + var total float64 + var count int + var last rawDataPoint + + flushBucket := func() { + var value float64 + if useLastValue { + value = last.value + } else if count > 0 { + value = total / float64(count) + } + result = append(result, MetricDataPoint{ + Timestamp: bucketStart, + Value: value, + }) + } + + for _, p := range points { + key := (p.timestamp / intervalMs) * intervalMs + if key != bucketStart { + flushBucket() + bucketStart = key + total = 0 + count = 0 + } + total += p.value + count++ + last = p + } + flushBucket() + + return result +} + +// isCumulativeMetric 判断指标是否为累积型(单调递增) +func isCumulativeMetric(metric MetricType) bool { + switch metric { + case MetricServerNetInTransfer, MetricServerNetOutTransfer, MetricServerUptime: + return true + default: + return false + } +} + +func (db *TSDB) QueryServerMetrics(serverID uint64, metric MetricType, period QueryPeriod) ([]MetricDataPoint, error) { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return nil, fmt.Errorf("TSDB is closed") + } + + now := time.Now() + tr := storage.TimeRange{ + MinTimestamp: now.Add(-period.Duration()).UnixMilli(), + MaxTimestamp: now.UnixMilli(), + } + + serverIDStr := strconv.FormatUint(serverID, 10) + + tfs := storage.NewTagFilters() + if err := tfs.Add(nil, []byte(metric), false, false); err != nil { + return nil, err + } + if err := tfs.Add([]byte("server_id"), []byte(serverIDStr), false, false); err != nil { + return nil, err + } + + deadline := uint64(time.Now().Add(30 * time.Second).Unix()) + + var search storage.Search + search.Init(nil, db.storage, []*storage.TagFilters{tfs}, tr, 100000, deadline) + defer search.MustClose() + + var points []rawDataPoint + var timestamps []int64 + var values []float64 + + for search.NextMetricBlock() { + mbr := search.MetricBlockRef + var block storage.Block + 
mbr.BlockRef.MustReadBlock(&block) + + if err := block.UnmarshalData(); err != nil { + log.Printf("NEZHA>> TSDB: failed to unmarshal block data: %v", err) + continue + } + + timestamps = timestamps[:0] + values = values[:0] + timestamps, values = block.AppendRowsWithTimeRangeFilter(timestamps, values, tr) + + for i := range timestamps { + points = append(points, rawDataPoint{ + timestamp: timestamps[i], + value: values[i], + }) + } + } + + if err := search.Error(); err != nil { + return nil, err + } + + return downsampleMetrics(points, period.DownsampleInterval(), isCumulativeMetric(metric)), nil +} + +func (db *TSDB) QueryServiceHistoryByServerID(serverID uint64, period QueryPeriod) (map[uint64]*ServiceHistoryResult, error) { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return nil, fmt.Errorf("TSDB is closed") + } + + now := time.Now() + tr := storage.TimeRange{ + MinTimestamp: now.Add(-period.Duration()).UnixMilli(), + MaxTimestamp: now.UnixMilli(), + } + + serverIDStr := strconv.FormatUint(serverID, 10) + + delayData, err := db.queryMetricByServerID(MetricServiceDelay, serverIDStr, tr) + if err != nil { + return nil, fmt.Errorf("failed to query delay data: %w", err) + } + + statusData, err := db.queryMetricByServerID(MetricServiceStatus, serverIDStr, tr) + if err != nil { + return nil, fmt.Errorf("failed to query status data: %w", err) + } + + serviceDataMap := make(map[uint64]map[int64]*rawDataPoint) + + for serviceID, points := range delayData { + if serviceDataMap[serviceID] == nil { + serviceDataMap[serviceID] = make(map[int64]*rawDataPoint) + } + for _, p := range points { + serviceDataMap[serviceID][p.timestamp] = &rawDataPoint{ + timestamp: p.timestamp, + value: p.value, + hasDelay: true, + } + } + } + + for serviceID, points := range statusData { + if serviceDataMap[serviceID] == nil { + serviceDataMap[serviceID] = make(map[int64]*rawDataPoint) + } + for _, p := range points { + if existing, ok := serviceDataMap[serviceID][p.timestamp]; ok 
{ + existing.status = p.value + existing.hasStatus = true + } else { + serviceDataMap[serviceID][p.timestamp] = &rawDataPoint{ + timestamp: p.timestamp, + status: p.value, + hasStatus: true, + } + } + } + } + + results := make(map[uint64]*ServiceHistoryResult) + + for serviceID, pointsMap := range serviceDataMap { + points := make([]rawDataPoint, 0, len(pointsMap)) + for _, p := range pointsMap { + points = append(points, *p) + } + stats := calculateStats(points, period.DownsampleInterval()) + results[serviceID] = &ServiceHistoryResult{ + ServiceID: serviceID, + Servers: []ServerServiceStats{{ + ServerID: serverID, + Stats: stats, + }}, + } + } + + return results, nil +} + +func (db *TSDB) queryMetricByServerID(metric MetricType, serverID string, tr storage.TimeRange) (map[uint64][]metricPoint, error) { + tfs := storage.NewTagFilters() + if err := tfs.Add(nil, []byte(metric), false, false); err != nil { + return nil, err + } + if err := tfs.Add([]byte("server_id"), []byte(serverID), false, false); err != nil { + return nil, err + } + + deadline := uint64(time.Now().Add(30 * time.Second).Unix()) + + var search storage.Search + search.Init(nil, db.storage, []*storage.TagFilters{tfs}, tr, 100000, deadline) + defer search.MustClose() + + result := make(map[uint64][]metricPoint) + var timestamps []int64 + var values []float64 + + for search.NextMetricBlock() { + mbr := search.MetricBlockRef + var block storage.Block + mbr.BlockRef.MustReadBlock(&block) + + mn := storage.GetMetricName() + if err := mn.Unmarshal(mbr.MetricName); err != nil { + log.Printf("NEZHA>> TSDB: failed to unmarshal metric name: %v", err) + storage.PutMetricName(mn) + continue + } + + serviceIDBytes := mn.GetTagValue("service_id") + if len(serviceIDBytes) == 0 { + storage.PutMetricName(mn) + continue + } + + serviceID, err := strconv.ParseUint(string(serviceIDBytes), 10, 64) + if err != nil { + log.Printf("NEZHA>> TSDB: failed to parse service_id %q: %v", string(serviceIDBytes), err) + 
storage.PutMetricName(mn) + continue + } + storage.PutMetricName(mn) + + if err := block.UnmarshalData(); err != nil { + log.Printf("NEZHA>> TSDB: failed to unmarshal block data: %v", err) + continue + } + + timestamps = timestamps[:0] + values = values[:0] + timestamps, values = block.AppendRowsWithTimeRangeFilter(timestamps, values, tr) + + for i := range timestamps { + result[serviceID] = append(result[serviceID], metricPoint{ + timestamp: timestamps[i], + value: values[i], + }) + } + } + + if err := search.Error(); err != nil { + return nil, err + } + + return result, nil +} diff --git a/pkg/tsdb/tsdb.go b/pkg/tsdb/tsdb.go new file mode 100644 index 0000000..d3627b6 --- /dev/null +++ b/pkg/tsdb/tsdb.go @@ -0,0 +1,117 @@ +package tsdb + +import ( + "fmt" + "log" + "path/filepath" + "sync" + "time" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" +) + +// TSDB 封装 VictoriaMetrics 存储 +type TSDB struct { + storage *storage.Storage + config *Config + mu sync.RWMutex + closed bool + + writer *bufferedWriter +} + +// InitGlobalSettings 初始化 VictoriaMetrics 包级别的全局设置。 +// 这些设置是进程级别的,应在 Open() 之前调用且只调用一次。 +func InitGlobalSettings(config *Config) { + memBytes := int(config.MaxMemoryMB * 1024 * 1024) + storage.SetTSIDCacheSize(memBytes * 35 / 100) + storage.SetMetricNameCacheSize(memBytes * 10 / 100) + storage.SetTagFiltersCacheSize(memBytes * 5 / 100) + storage.SetMetadataStorageSize(memBytes * 1 / 100) + + storage.SetDedupInterval(config.DedupInterval) + storage.SetFreeDiskSpaceLimit(config.MinFreeDiskSpaceBytes()) + storage.SetDataFlushInterval(5 * time.Second) +} + +// Open 打开或创建 TSDB 存储 +func Open(config *Config) (*TSDB, error) { + if config == nil { + config = DefaultConfig() + } + + config.Validate() + + dataPath := config.DataPath + if !filepath.IsAbs(dataPath) { + absPath, err := filepath.Abs(dataPath) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path: %w", err) + } + dataPath = absPath + } + + InitGlobalSettings(config) + + opts := 
storage.OpenOptions{ + Retention: time.Duration(config.RetentionDays) * 24 * time.Hour, + } + + stor := storage.MustOpenStorage(dataPath, opts) + + db := &TSDB{ + storage: stor, + config: config, + } + + db.writer = newBufferedWriter(db, config.WriteBufferSize, config.WriteBufferFlushInterval) + + log.Printf("NEZHA>> TSDB opened at %s, retention: %d days, min free disk: %.1f GB, max memory: %d MB", + dataPath, config.RetentionDays, config.MinFreeDiskSpaceGB, config.MaxMemoryMB) + + return db, nil +} + +// Close 关闭 TSDB 存储 +func (db *TSDB) Close() error { + db.mu.Lock() + defer db.mu.Unlock() + + if db.closed { + return nil + } + + if db.writer != nil { + db.writer.stop() + } + + db.storage.MustClose() + db.closed = true + log.Println("NEZHA>> TSDB closed") + return nil +} + +// Storage 返回底层存储对象(用于高级查询) +func (db *TSDB) Storage() *storage.Storage { + return db.storage +} + +// Config 返回配置 +func (db *TSDB) Config() *Config { + return db.config +} + +// IsClosed 检查是否已关闭 +func (db *TSDB) IsClosed() bool { + db.mu.RLock() + defer db.mu.RUnlock() + return db.closed +} + +// Flush 强制刷盘(主要用于测试) +func (db *TSDB) Flush() { + if db.writer != nil { + db.writer.flush() + } + db.storage.DebugFlush() +} diff --git a/pkg/tsdb/tsdb_test.go b/pkg/tsdb/tsdb_test.go new file mode 100644 index 0000000..fe2ab4d --- /dev/null +++ b/pkg/tsdb/tsdb_test.go @@ -0,0 +1,622 @@ +package tsdb + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestConfig_Defaults(t *testing.T) { + config := DefaultConfig() + + assert.Equal(t, "", config.DataPath) // 默认为空,不启用 TSDB + assert.Equal(t, uint16(30), config.RetentionDays) + assert.Equal(t, float64(1), config.MinFreeDiskSpaceGB) + assert.Equal(t, 30*time.Second, config.DedupInterval) + assert.False(t, config.Enabled()) +} + +func TestConfig_Enabled(t *testing.T) { + config := &Config{DataPath: ""} + assert.False(t, config.Enabled()) + + config.DataPath = 
"data/tsdb" + assert.True(t, config.Enabled()) +} + +func TestConfig_MinFreeDiskSpaceBytes(t *testing.T) { + config := &Config{MinFreeDiskSpaceGB: 5} + expected := int64(5 * 1024 * 1024 * 1024) + assert.Equal(t, expected, config.MinFreeDiskSpaceBytes()) +} + +func TestTSDB_OpenClose(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + require.NotNil(t, db) + + assert.False(t, db.IsClosed()) + assert.NotNil(t, db.Storage()) + assert.Equal(t, config, db.Config()) + + err = db.Close() + require.NoError(t, err) + assert.True(t, db.IsClosed()) + + // 重复关闭应该安全 + err = db.Close() + require.NoError(t, err) +} + +func TestTSDB_WriteServerMetrics(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + metrics := &ServerMetrics{ + ServerID: 1, + Timestamp: time.Now(), + CPU: 50.5, + MemUsed: 1024 * 1024 * 1024, + SwapUsed: 512 * 1024 * 1024, + DiskUsed: 10 * 1024 * 1024 * 1024, + NetInSpeed: 1000000, + NetOutSpeed: 500000, + NetInTransfer: 1000000000, + NetOutTransfer: 500000000, + Load1: 1.5, + Load5: 1.2, + Load15: 1.0, + TCPConnCount: 100, + UDPConnCount: 50, + ProcessCount: 200, + Temperature: 65.5, + Uptime: 86400, + GPU: 30.0, + } + + err = db.WriteServerMetrics(metrics) + require.NoError(t, err) + + // 强制刷盘 + db.Flush() +} + +func TestTSDB_WriteServiceMetrics(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: 
filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + metrics := &ServiceMetrics{ + ServiceID: 1, + ServerID: 1, + Timestamp: time.Now(), + Delay: 45.5, + Successful: true, + } + + err = db.WriteServiceMetrics(metrics) + require.NoError(t, err) + + // 测试失败状态 + metrics2 := &ServiceMetrics{ + ServiceID: 1, + ServerID: 2, + Timestamp: time.Now(), + Delay: 0, + Successful: false, + } + + err = db.WriteServiceMetrics(metrics2) + require.NoError(t, err) + + // 强制刷盘 + db.Flush() +} + +func TestTSDB_WriteBatchMetrics(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + // 批量写入服务器指标 + serverMetrics := []*ServerMetrics{ + {ServerID: 1, Timestamp: time.Now(), CPU: 10.0}, + {ServerID: 2, Timestamp: time.Now(), CPU: 20.0}, + {ServerID: 3, Timestamp: time.Now(), CPU: 30.0}, + } + + err = db.WriteBatchServerMetrics(serverMetrics) + require.NoError(t, err) + + // 批量写入服务指标 + serviceMetrics := []*ServiceMetrics{ + {ServiceID: 1, ServerID: 1, Timestamp: time.Now(), Delay: 10.0, Successful: true}, + {ServiceID: 1, ServerID: 2, Timestamp: time.Now(), Delay: 20.0, Successful: true}, + {ServiceID: 2, ServerID: 1, Timestamp: time.Now(), Delay: 15.0, Successful: false}, + } + + err = db.WriteBatchServiceMetrics(serviceMetrics) + require.NoError(t, err) + + db.Flush() +} + +func TestTSDB_WriteToClosedDB(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, 
err := Open(config) + require.NoError(t, err) + + db.Close() + + // 写入已关闭的数据库应该返回错误 + err = db.WriteServerMetrics(&ServerMetrics{ServerID: 1, Timestamp: time.Now()}) + assert.Error(t, err) + + err = db.WriteServiceMetrics(&ServiceMetrics{ServiceID: 1, ServerID: 1, Timestamp: time.Now()}) + assert.Error(t, err) +} + +func TestQueryPeriod_Parse(t *testing.T) { + tests := []struct { + input string + expected QueryPeriod + hasError bool + }{ + {"1d", Period1Day, false}, + {"7d", Period7Days, false}, + {"30d", Period30Days, false}, + {"", Period1Day, false}, + {"invalid", "", true}, + {"1w", "", true}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + period, err := ParseQueryPeriod(tt.input) + if tt.hasError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, period) + } + }) + } +} + +func TestQueryPeriod_Duration(t *testing.T) { + assert.Equal(t, 24*time.Hour, Period1Day.Duration()) + assert.Equal(t, 7*24*time.Hour, Period7Days.Duration()) + assert.Equal(t, 30*24*time.Hour, Period30Days.Duration()) +} + +func TestQueryPeriod_DownsampleInterval(t *testing.T) { + assert.Equal(t, 5*time.Minute, Period1Day.DownsampleInterval()) + assert.Equal(t, 30*time.Minute, Period7Days.DownsampleInterval()) + assert.Equal(t, 2*time.Hour, Period30Days.DownsampleInterval()) +} + +func TestTSDB_QueryServiceHistory(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + // 写入测试数据 + now := time.Now() + serviceID := uint64(100) + serverID1 := uint64(1) + serverID2 := uint64(2) + + // 写入多条服务监控数据 + for i := 0; i < 10; i++ { + ts := now.Add(-time.Duration(i) * time.Minute) + + // 服务器1的数据:成功 + err := db.WriteServiceMetrics(&ServiceMetrics{ + 
ServiceID: serviceID, + ServerID: serverID1, + Timestamp: ts, + Delay: float64(10 + i), + Successful: true, + }) + require.NoError(t, err) + + // 服务器2的数据:部分失败 + err = db.WriteServiceMetrics(&ServiceMetrics{ + ServiceID: serviceID, + ServerID: serverID2, + Timestamp: ts, + Delay: float64(20 + i), + Successful: i%2 == 0, // 偶数成功,奇数失败 + }) + require.NoError(t, err) + } + + // 强制刷盘确保数据可见 + db.Flush() + + // 查询服务历史 + result, err := db.QueryServiceHistory(serviceID, Period1Day) + require.NoError(t, err) + require.NotNil(t, result) + + assert.Equal(t, serviceID, result.ServiceID) + require.Len(t, result.Servers, 2, "expected 2 servers") + + // 验证服务器统计 + for _, server := range result.Servers { + if server.ServerID == serverID1 { + // 服务器1全部成功 + assert.Equal(t, uint64(10), server.Stats.TotalUp) + assert.Equal(t, uint64(0), server.Stats.TotalDown) + assert.Equal(t, float32(100), server.Stats.UpPercent) + } else if server.ServerID == serverID2 { + // 服务器2一半成功 + assert.Equal(t, uint64(5), server.Stats.TotalUp) + assert.Equal(t, uint64(5), server.Stats.TotalDown) + assert.Equal(t, float32(50), server.Stats.UpPercent) + } + } +} + +func TestTSDB_QueryServerMetrics(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + // 写入测试数据 + now := time.Now() + serverID := uint64(1) + + for i := 0; i < 10; i++ { + ts := now.Add(-time.Duration(i) * time.Minute) + err := db.WriteServerMetrics(&ServerMetrics{ + ServerID: serverID, + Timestamp: ts, + CPU: float64(10 + i*5), + }) + require.NoError(t, err) + } + + // 强制刷盘确保数据可见 + db.Flush() + + // 查询服务器指标 + result, err := db.QueryServerMetrics(serverID, MetricServerCPU, Period1Day) + require.NoError(t, err) + require.NotEmpty(t, result, "expected data points") +} 
+ +func TestTSDB_QueryEmptyResult(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + // 查询不存在的服务历史 + result, err := db.QueryServiceHistory(9999, Period1Day) + require.NoError(t, err) + require.NotNil(t, result) + assert.Empty(t, result.Servers) + + // 查询不存在的服务器指标 + serverResult, err := db.QueryServerMetrics(9999, MetricServerCPU, Period1Day) + require.NoError(t, err) + assert.Empty(t, serverResult) +} + +func TestTSDB_QueryClosedDB(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + db.Close() + + // 查询已关闭的数据库应该返回错误 + _, err = db.QueryServiceHistory(1, Period1Day) + assert.Error(t, err) + + _, err = db.QueryServerMetrics(1, MetricServerCPU, Period1Day) + assert.Error(t, err) +} + +func TestDownsample(t *testing.T) { + points := []rawDataPoint{ + {timestamp: 0, value: 10, status: 1, hasDelay: true, hasStatus: true}, + {timestamp: 1000, value: 20, status: 1, hasDelay: true, hasStatus: true}, + {timestamp: 2000, value: 30, status: 0, hasDelay: true, hasStatus: true}, + {timestamp: 3000, value: 40, status: 1, hasDelay: true, hasStatus: true}, + {timestamp: 4000, value: 50, status: 1, hasDelay: true, hasStatus: true}, + } + + result := downsample(points, 2*time.Second) + + assert.Len(t, result, 3) + + for i := 1; i < len(result); i++ { + assert.Greater(t, result[i].Timestamp, result[i-1].Timestamp) + } +} + +func TestCalculateStats(t *testing.T) { + points := []rawDataPoint{ + {timestamp: 1000, value: 10, status: 1, 
hasDelay: true, hasStatus: true}, + {timestamp: 2000, value: 20, status: 1, hasDelay: true, hasStatus: true}, + {timestamp: 3000, value: 30, status: 0, hasDelay: true, hasStatus: true}, + {timestamp: 4000, value: 40, status: 1, hasDelay: true, hasStatus: true}, + } + + stats := calculateStats(points, 5*time.Minute) + + assert.Equal(t, uint64(3), stats.TotalUp) + assert.Equal(t, uint64(1), stats.TotalDown) + assert.Equal(t, float32(75), stats.UpPercent) + assert.Equal(t, float64(25), stats.AvgDelay) +} + +func TestCalculateStats_ZeroDelay(t *testing.T) { + points := []rawDataPoint{ + {timestamp: 1000, value: 0, status: 1, hasDelay: true, hasStatus: true}, + {timestamp: 2000, value: 10, status: 1, hasDelay: true, hasStatus: true}, + } + + stats := calculateStats(points, 5*time.Minute) + + assert.Equal(t, float64(5), stats.AvgDelay) + assert.Equal(t, uint64(2), stats.TotalUp) +} + +func TestCalculateStatsEmpty(t *testing.T) { + points := []rawDataPoint{} + stats := calculateStats(points, 5*time.Minute) + + assert.Equal(t, uint64(0), stats.TotalUp) + assert.Equal(t, uint64(0), stats.TotalDown) + assert.Equal(t, float32(0), stats.UpPercent) + assert.Equal(t, float64(0), stats.AvgDelay) + assert.Nil(t, stats.DataPoints) +} + +func TestTSDB_QueryServerMetrics_Float64Precision(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + now := time.Now() + serverID := uint64(1) + largeMemValue := uint64(17_179_869_184) // 16GB + + err = db.WriteServerMetrics(&ServerMetrics{ + ServerID: serverID, + Timestamp: now, + MemUsed: largeMemValue, + }) + require.NoError(t, err) + + db.Flush() + + result, err := db.QueryServerMetrics(serverID, MetricServerMemory, Period1Day) + require.NoError(t, err) + 
require.NotEmpty(t, result) + + // float64 可以精确表示该值,float32 会丢失精度 + assert.Equal(t, float64(largeMemValue), result[0].Value) +} + +func TestTSDB_QueryServiceHistoryByServerID(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + now := time.Now() + serverID := uint64(1) + serviceID1 := uint64(100) + serviceID2 := uint64(200) + + // 写入两个服务在同一服务器上的数据 + for i := 0; i < 5; i++ { + ts := now.Add(-time.Duration(i) * time.Minute) + + err := db.WriteServiceMetrics(&ServiceMetrics{ + ServiceID: serviceID1, + ServerID: serverID, + Timestamp: ts, + Delay: float64(10 + i), + Successful: true, + }) + require.NoError(t, err) + + err = db.WriteServiceMetrics(&ServiceMetrics{ + ServiceID: serviceID2, + ServerID: serverID, + Timestamp: ts, + Delay: float64(20 + i), + Successful: i%2 == 0, + }) + require.NoError(t, err) + } + + db.Flush() + + results, err := db.QueryServiceHistoryByServerID(serverID, Period1Day) + require.NoError(t, err) + require.Len(t, results, 2, "expected 2 services") + + // 验证 service1:全部成功 + s1, ok := results[serviceID1] + require.True(t, ok) + assert.Equal(t, serviceID1, s1.ServiceID) + require.Len(t, s1.Servers, 1) + assert.Equal(t, serverID, s1.Servers[0].ServerID) + assert.Equal(t, uint64(5), s1.Servers[0].Stats.TotalUp) + assert.Equal(t, uint64(0), s1.Servers[0].Stats.TotalDown) + + // 验证 service2:部分成功 + s2, ok := results[serviceID2] + require.True(t, ok) + assert.Equal(t, serviceID2, s2.ServiceID) + assert.Equal(t, uint64(3), s2.Servers[0].Stats.TotalUp) + assert.Equal(t, uint64(2), s2.Servers[0].Stats.TotalDown) +} + +func TestTSDB_QueryServiceHistoryByServerID_Empty(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer 
os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + defer db.Close() + + results, err := db.QueryServiceHistoryByServerID(9999, Period1Day) + require.NoError(t, err) + assert.Empty(t, results) +} + +func TestTSDB_QueryServiceHistoryByServerID_ClosedDB(t *testing.T) { + tempDir, err := os.MkdirTemp("", "tsdb_test") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + config := &Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 1, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + + db, err := Open(config) + require.NoError(t, err) + db.Close() + + _, err = db.QueryServiceHistoryByServerID(1, Period1Day) + assert.Error(t, err) +} diff --git a/pkg/tsdb/writer.go b/pkg/tsdb/writer.go new file mode 100644 index 0000000..7896c09 --- /dev/null +++ b/pkg/tsdb/writer.go @@ -0,0 +1,301 @@ +package tsdb + +import ( + "fmt" + "strconv" + "sync" + "time" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" +) + +type bufferedWriter struct { + db *TSDB + buffer []storage.MetricRow + mu sync.Mutex + maxSize int + flushTicker *time.Ticker + stopCh chan struct{} + wg sync.WaitGroup +} + +func newBufferedWriter(db *TSDB, maxSize int, flushInterval time.Duration) *bufferedWriter { + w := &bufferedWriter{ + db: db, + buffer: make([]storage.MetricRow, 0, maxSize), + maxSize: maxSize, + flushTicker: time.NewTicker(flushInterval), + stopCh: make(chan struct{}), + } + w.wg.Add(1) + go w.flushLoop() + return w +} + +func (w *bufferedWriter) flushLoop() { + defer w.wg.Done() + for { + select { + case <-w.flushTicker.C: + w.flush() + case <-w.stopCh: + w.flush() + return + } + } +} + +func (w *bufferedWriter) write(rows []storage.MetricRow) { + w.mu.Lock() + w.buffer = append(w.buffer, rows...) 
+ if len(w.buffer) >= w.maxSize { + rows := w.buffer + w.buffer = make([]storage.MetricRow, 0, w.maxSize) + w.mu.Unlock() + w.db.storage.AddRows(rows, 64) + return + } + w.mu.Unlock() +} + +func (w *bufferedWriter) flush() { + w.mu.Lock() + if len(w.buffer) == 0 { + w.mu.Unlock() + return + } + rows := w.buffer + w.buffer = make([]storage.MetricRow, 0, w.maxSize) + w.mu.Unlock() + + w.db.storage.AddRows(rows, 64) +} + +func (w *bufferedWriter) stop() { + w.flushTicker.Stop() + close(w.stopCh) + w.wg.Wait() +} + +// MetricType 指标类型 +type MetricType string + +const ( + // 服务器指标 + MetricServerCPU MetricType = "nezha_server_cpu" + MetricServerMemory MetricType = "nezha_server_memory" + MetricServerSwap MetricType = "nezha_server_swap" + MetricServerDisk MetricType = "nezha_server_disk" + MetricServerNetInSpeed MetricType = "nezha_server_net_in_speed" + MetricServerNetOutSpeed MetricType = "nezha_server_net_out_speed" + MetricServerNetInTransfer MetricType = "nezha_server_net_in_transfer" + MetricServerNetOutTransfer MetricType = "nezha_server_net_out_transfer" + MetricServerLoad1 MetricType = "nezha_server_load1" + MetricServerLoad5 MetricType = "nezha_server_load5" + MetricServerLoad15 MetricType = "nezha_server_load15" + MetricServerTCPConn MetricType = "nezha_server_tcp_conn" + MetricServerUDPConn MetricType = "nezha_server_udp_conn" + MetricServerProcessCount MetricType = "nezha_server_process_count" + MetricServerTemperature MetricType = "nezha_server_temperature" + MetricServerUptime MetricType = "nezha_server_uptime" + MetricServerGPU MetricType = "nezha_server_gpu" + + // 服务监控指标 + MetricServiceDelay MetricType = "nezha_service_delay" + MetricServiceStatus MetricType = "nezha_service_status" +) + +// ServerMetrics 服务器指标数据 +type ServerMetrics struct { + ServerID uint64 + Timestamp time.Time + CPU float64 + MemUsed uint64 + SwapUsed uint64 + DiskUsed uint64 + NetInSpeed uint64 + NetOutSpeed uint64 + NetInTransfer uint64 + NetOutTransfer uint64 + Load1 float64 + 
Load5 float64 + Load15 float64 + TCPConnCount uint64 + UDPConnCount uint64 + ProcessCount uint64 + Temperature float64 + Uptime uint64 + GPU float64 +} + +// ServiceMetrics 服务监控指标数据 +type ServiceMetrics struct { + ServiceID uint64 + ServerID uint64 + Timestamp time.Time + Delay float64 + Successful bool +} + +func (db *TSDB) WriteServerMetrics(m *ServerMetrics) error { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return fmt.Errorf("TSDB is closed") + } + + ts := m.Timestamp.UnixMilli() + serverIDStr := strconv.FormatUint(m.ServerID, 10) + + rows := []storage.MetricRow{ + makeServerMetricRow(MetricServerCPU, serverIDStr, ts, m.CPU), + makeServerMetricRow(MetricServerMemory, serverIDStr, ts, float64(m.MemUsed)), + makeServerMetricRow(MetricServerSwap, serverIDStr, ts, float64(m.SwapUsed)), + makeServerMetricRow(MetricServerDisk, serverIDStr, ts, float64(m.DiskUsed)), + makeServerMetricRow(MetricServerNetInSpeed, serverIDStr, ts, float64(m.NetInSpeed)), + makeServerMetricRow(MetricServerNetOutSpeed, serverIDStr, ts, float64(m.NetOutSpeed)), + makeServerMetricRow(MetricServerNetInTransfer, serverIDStr, ts, float64(m.NetInTransfer)), + makeServerMetricRow(MetricServerNetOutTransfer, serverIDStr, ts, float64(m.NetOutTransfer)), + makeServerMetricRow(MetricServerLoad1, serverIDStr, ts, m.Load1), + makeServerMetricRow(MetricServerLoad5, serverIDStr, ts, m.Load5), + makeServerMetricRow(MetricServerLoad15, serverIDStr, ts, m.Load15), + makeServerMetricRow(MetricServerTCPConn, serverIDStr, ts, float64(m.TCPConnCount)), + makeServerMetricRow(MetricServerUDPConn, serverIDStr, ts, float64(m.UDPConnCount)), + makeServerMetricRow(MetricServerProcessCount, serverIDStr, ts, float64(m.ProcessCount)), + makeServerMetricRow(MetricServerTemperature, serverIDStr, ts, m.Temperature), + makeServerMetricRow(MetricServerUptime, serverIDStr, ts, float64(m.Uptime)), + makeServerMetricRow(MetricServerGPU, serverIDStr, ts, m.GPU), + } + + if db.writer != nil { + 
db.writer.write(rows) + } else { + db.storage.AddRows(rows, 64) + } + return nil +} + +func (db *TSDB) WriteServiceMetrics(m *ServiceMetrics) error { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return fmt.Errorf("TSDB is closed") + } + + ts := m.Timestamp.UnixMilli() + serviceIDStr := strconv.FormatUint(m.ServiceID, 10) + serverIDStr := strconv.FormatUint(m.ServerID, 10) + + var status float64 + if m.Successful { + status = 1 + } + + rows := []storage.MetricRow{ + makeServiceMetricRow(MetricServiceDelay, serviceIDStr, serverIDStr, ts, m.Delay), + makeServiceMetricRow(MetricServiceStatus, serviceIDStr, serverIDStr, ts, status), + } + + if db.writer != nil { + db.writer.write(rows) + } else { + db.storage.AddRows(rows, 64) + } + return nil +} + +func makeServerMetricRow(metric MetricType, serverID string, timestamp int64, value float64) storage.MetricRow { + labels := []prompb.Label{ + {Name: "__name__", Value: string(metric)}, + {Name: "server_id", Value: serverID}, + } + return storage.MetricRow{ + MetricNameRaw: storage.MarshalMetricNameRaw(nil, labels), + Timestamp: timestamp, + Value: value, + } +} + +func makeServiceMetricRow(metric MetricType, serviceID, serverID string, timestamp int64, value float64) storage.MetricRow { + labels := []prompb.Label{ + {Name: "__name__", Value: string(metric)}, + {Name: "service_id", Value: serviceID}, + {Name: "server_id", Value: serverID}, + } + return storage.MetricRow{ + MetricNameRaw: storage.MarshalMetricNameRaw(nil, labels), + Timestamp: timestamp, + Value: value, + } +} + +func (db *TSDB) WriteBatchServerMetrics(metrics []*ServerMetrics) error { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return fmt.Errorf("TSDB is closed") + } + + rows := make([]storage.MetricRow, 0, len(metrics)*17) + for _, m := range metrics { + ts := m.Timestamp.UnixMilli() + serverIDStr := strconv.FormatUint(m.ServerID, 10) + rows = append(rows, + makeServerMetricRow(MetricServerCPU, serverIDStr, ts, m.CPU), + 
makeServerMetricRow(MetricServerMemory, serverIDStr, ts, float64(m.MemUsed)), + makeServerMetricRow(MetricServerSwap, serverIDStr, ts, float64(m.SwapUsed)), + makeServerMetricRow(MetricServerDisk, serverIDStr, ts, float64(m.DiskUsed)), + makeServerMetricRow(MetricServerNetInSpeed, serverIDStr, ts, float64(m.NetInSpeed)), + makeServerMetricRow(MetricServerNetOutSpeed, serverIDStr, ts, float64(m.NetOutSpeed)), + makeServerMetricRow(MetricServerNetInTransfer, serverIDStr, ts, float64(m.NetInTransfer)), + makeServerMetricRow(MetricServerNetOutTransfer, serverIDStr, ts, float64(m.NetOutTransfer)), + makeServerMetricRow(MetricServerLoad1, serverIDStr, ts, m.Load1), + makeServerMetricRow(MetricServerLoad5, serverIDStr, ts, m.Load5), + makeServerMetricRow(MetricServerLoad15, serverIDStr, ts, m.Load15), + makeServerMetricRow(MetricServerTCPConn, serverIDStr, ts, float64(m.TCPConnCount)), + makeServerMetricRow(MetricServerUDPConn, serverIDStr, ts, float64(m.UDPConnCount)), + makeServerMetricRow(MetricServerProcessCount, serverIDStr, ts, float64(m.ProcessCount)), + makeServerMetricRow(MetricServerTemperature, serverIDStr, ts, m.Temperature), + makeServerMetricRow(MetricServerUptime, serverIDStr, ts, float64(m.Uptime)), + makeServerMetricRow(MetricServerGPU, serverIDStr, ts, m.GPU), + ) + } + + if db.writer != nil { + db.writer.write(rows) + } else { + db.storage.AddRows(rows, 64) + } + return nil +} + +func (db *TSDB) WriteBatchServiceMetrics(metrics []*ServiceMetrics) error { + db.mu.RLock() + defer db.mu.RUnlock() + if db.closed { + return fmt.Errorf("TSDB is closed") + } + + rows := make([]storage.MetricRow, 0, len(metrics)*2) + for _, m := range metrics { + ts := m.Timestamp.UnixMilli() + serviceIDStr := strconv.FormatUint(m.ServiceID, 10) + serverIDStr := strconv.FormatUint(m.ServerID, 10) + var status float64 + if m.Successful { + status = 1 + } + rows = append(rows, + makeServiceMetricRow(MetricServiceDelay, serviceIDStr, serverIDStr, ts, m.Delay), + 
makeServiceMetricRow(MetricServiceStatus, serviceIDStr, serverIDStr, ts, status), + ) + } + + if db.writer != nil { + db.writer.write(rows) + } else { + db.storage.AddRows(rows, 64) + } + return nil +} diff --git a/service/rpc/nezha.go b/service/rpc/nezha.go index 69c1c26..6373e50 100644 --- a/service/rpc/nezha.go +++ b/service/rpc/nezha.go @@ -12,6 +12,7 @@ import ( "github.com/jinzhu/copier" geoipx "github.com/nezhahq/nezha/pkg/geoip" "github.com/nezhahq/nezha/pkg/grpcx" + "github.com/nezhahq/nezha/pkg/tsdb" "github.com/nezhahq/nezha/model" pb "github.com/nezhahq/nezha/proto" @@ -114,6 +115,44 @@ func (s *NezhaHandler) ReportSystemState(stream pb.NezhaService_ReportSystemStat server.LastActive = time.Now() server.State = &innerState + if singleton.TSDBEnabled() { + maxTemp := 0.0 + for _, t := range innerState.Temperatures { + if t.Temperature > maxTemp { + maxTemp = t.Temperature + } + } + maxGPU := 0.0 + for _, g := range innerState.GPU { + if g > maxGPU { + maxGPU = g + } + } + if err := singleton.TSDBShared.WriteServerMetrics(&tsdb.ServerMetrics{ + ServerID: clientID, + Timestamp: time.Now(), + CPU: innerState.CPU, + MemUsed: innerState.MemUsed, + SwapUsed: innerState.SwapUsed, + DiskUsed: innerState.DiskUsed, + NetInSpeed: innerState.NetInSpeed, + NetOutSpeed: innerState.NetOutSpeed, + NetInTransfer: innerState.NetInTransfer, + NetOutTransfer: innerState.NetOutTransfer, + Load1: innerState.Load1, + Load5: innerState.Load5, + Load15: innerState.Load15, + TCPConnCount: innerState.TcpConnCount, + UDPConnCount: innerState.UdpConnCount, + ProcessCount: innerState.ProcessCount, + Temperature: maxTemp, + Uptime: innerState.Uptime, + GPU: maxGPU, + }); err != nil { + log.Printf("NEZHA>> Failed to write server metrics to TSDB: %v", err) + } + } + // 应对 dashboard / agent 重启的情况,如果从未记录过,先打点,等到小时时间点时入库 if server.PrevTransferInSnapshot == 0 || server.PrevTransferOutSnapshot == 0 { server.PrevTransferInSnapshot = state.NetInTransfer diff --git 
a/service/singleton/frontend-templates.yaml b/service/singleton/frontend-templates.yaml index d257fbd..37ddd0b 100644 --- a/service/singleton/frontend-templates.yaml +++ b/service/singleton/frontend-templates.yaml @@ -2,14 +2,14 @@ name: "OfficialAdmin" repository: "https://github.com/nezhahq/admin-frontend" author: "nezhahq" - version: "v1.14.7" + version: "v2.0.3" is_admin: true is_official: true - path: "user-dist" name: "Official" repository: "https://github.com/hamster1963/nezha-dash-v1" author: "hamster1963" - version: "v1.33.0" + version: "v2.0.0" is_official: true - path: "nazhua-dist" name: "Nazhua" diff --git a/service/singleton/servicesentinel.go b/service/singleton/servicesentinel.go index 310c49a..9b7aef4 100644 --- a/service/singleton/servicesentinel.go +++ b/service/singleton/servicesentinel.go @@ -16,6 +16,7 @@ import ( "golang.org/x/exp/constraints" "github.com/nezhahq/nezha/model" + "github.com/nezhahq/nezha/pkg/tsdb" "github.com/nezhahq/nezha/pkg/utils" pb "github.com/nezhahq/nezha/proto" ) @@ -39,7 +40,7 @@ type ReportData struct { type _TodayStatsOfService struct { Up uint64 // 今日在线计数 Down uint64 // 今日离线计数 - Delay float32 // 今日平均延迟 + Delay float64 // 今日平均延迟 } type serviceResponseData = _TodayStatsOfService @@ -51,8 +52,9 @@ type serviceTaskStatus struct { } type pingStore struct { - count int - ping float32 + count int + ping float64 + successCount int } /* @@ -108,23 +110,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv year, month, day := time.Now().Date() today := time.Date(year, month, day, 0, 0, 0, 0, Loc) - - var mhs []model.ServiceHistory - // 加载当日记录 - DB.Where("created_at >= ? 
AND server_id = 0", today).Find(&mhs) - totalDelay := make(map[uint64]float32) - totalDelayCount := make(map[uint64]float32) - for _, mh := range mhs { - totalDelay[mh.ServiceID] += mh.AvgDelay - totalDelayCount[mh.ServiceID]++ - ss.serviceStatusToday[mh.ServiceID].Up += mh.Up - ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up - ss.serviceStatusToday[mh.ServiceID].Down += mh.Down - ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down - } - for id, delay := range totalDelay { - ss.serviceStatusToday[id].Delay = delay / float32(totalDelayCount[id]) - } + ss.loadTodayStats(today) // 启动服务监控器 go ss.worker() @@ -135,6 +121,12 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- *model.Service) (*Serv return nil, err } + // 每周日凌晨 4:00 执行系统存储维护 + _, err = CronShared.AddFunc("0 0 4 * * 0", PerformMaintenance) + if err != nil { + log.Printf("NEZHA>> Warning: failed to schedule maintenance task: %v", err) + } + return ss, nil } @@ -171,6 +163,16 @@ func (ss *ServiceSentinel) Dispatch(r ReportData) { ss.serviceReportChannel <- r } +// sortServices 按 DisplayIndex 降序、ID 升序排列服务列表 +func sortServices(services []*model.Service) { + slices.SortFunc(services, func(a, b *model.Service) int { + if a.DisplayIndex != b.DisplayIndex { + return cmp.Compare(b.DisplayIndex, a.DisplayIndex) + } + return cmp.Compare(a.ID, b.ID) + }) +} + func (ss *ServiceSentinel) UpdateServiceList() { ss.servicesLock.RLock() defer ss.servicesLock.RUnlock() @@ -179,9 +181,7 @@ func (ss *ServiceSentinel) UpdateServiceList() { defer ss.serviceListLock.Unlock() ss.serviceList = utils.MapValuesToSlice(ss.services) - slices.SortFunc(ss.serviceList, func(a, b *model.Service) int { - return cmp.Compare(a.ID, b.ID) - }) + sortServices(ss.serviceList) } // loadServiceHistory 加载服务监控器的历史状态信息 @@ -207,6 +207,7 @@ func (ss *ServiceSentinel) loadServiceHistory() error { ss.serviceStatusToday[service.ID] = &_TodayStatsOfService{} } ss.serviceList = services + sortServices(ss.serviceList) year, month, day := 
time.Now().Date() today := time.Date(year, month, day, 0, 0, 0, 0, Loc) @@ -215,33 +216,111 @@ func (ss *ServiceSentinel) loadServiceHistory() error { ss.monthlyStatus[service.ID] = &serviceResponseItem{ service: service, ServiceResponseItem: model.ServiceResponseItem{ - Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + Delay: &[30]float64{}, + Up: &[30]uint64{}, + Down: &[30]uint64{}, }, } } - // 加载服务监控历史记录 - var mhs []model.ServiceHistory - DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs) - var delayCount = make(map[int]int) - for _, mh := range mhs { - dayIndex := 28 - (int(today.Sub(mh.CreatedAt).Hours()) / 24) - if dayIndex < 0 { - continue - } - ss.monthlyStatus[mh.ServiceID].Delay[dayIndex] = (ss.monthlyStatus[mh.ServiceID].Delay[dayIndex]*float32(delayCount[dayIndex]) + mh.AvgDelay) / float32(delayCount[dayIndex]+1) - delayCount[dayIndex]++ - ss.monthlyStatus[mh.ServiceID].Up[dayIndex] += mh.Up - ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up - ss.monthlyStatus[mh.ServiceID].Down[dayIndex] += mh.Down - ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down + if TSDBEnabled() { + ss.loadMonthlyStatusFromTSDB(services, today) + } else { + ss.loadMonthlyStatusFromDB(today) } return nil } +func (ss *ServiceSentinel) loadMonthlyStatusFromTSDB(services []*model.Service, today time.Time) { + for _, service := range services { + dailyStats, err := TSDBShared.QueryServiceDailyStats(service.ID, today, 30) + if err != nil { + log.Printf("NEZHA>> Failed to load TSDB history for service %d: %v", service.ID, err) + continue + } + ms := ss.monthlyStatus[service.ID] + for i := 0; i < 29; i++ { + ms.Up[i] = dailyStats[i].Up + ms.TotalUp += 
dailyStats[i].Up + ms.Down[i] = dailyStats[i].Down + ms.TotalDown += dailyStats[i].Down + ms.Delay[i] = dailyStats[i].Delay + } + } +} + +func (ss *ServiceSentinel) loadMonthlyStatusFromDB(today time.Time) { + var mhs []model.ServiceHistory + DB.Where("created_at > ? AND created_at < ? AND server_id = 0", today.AddDate(0, 0, -29), today).Find(&mhs) + delayCount := make(map[uint64]map[int]int) + for _, mh := range mhs { + dayIndex := 28 - int(today.Sub(mh.CreatedAt).Hours())/24 + if dayIndex < 0 { + continue + } + ms := ss.monthlyStatus[mh.ServiceID] + if ms == nil { + continue + } + if delayCount[mh.ServiceID] == nil { + delayCount[mh.ServiceID] = make(map[int]int) + } + ms.Delay[dayIndex] = (ms.Delay[dayIndex]*float64(delayCount[mh.ServiceID][dayIndex]) + mh.AvgDelay) / float64(delayCount[mh.ServiceID][dayIndex]+1) + delayCount[mh.ServiceID][dayIndex]++ + ms.Up[dayIndex] += mh.Up + ms.TotalUp += mh.Up + ms.Down[dayIndex] += mh.Down + ms.TotalDown += mh.Down + } +} + +func (ss *ServiceSentinel) loadTodayStats(today time.Time) { + if TSDBEnabled() { + for serviceID, ms := range ss.monthlyStatus { + result, err := TSDBShared.QueryServiceHistory(serviceID, tsdb.Period1Day) + if err != nil { + log.Printf("NEZHA>> Failed to load TSDB today stats for service %d: %v", serviceID, err) + continue + } + var totalUp, totalDown uint64 + var totalDelay float64 + var delayCount int + for _, serverStats := range result.Servers { + totalUp += serverStats.Stats.TotalUp + totalDown += serverStats.Stats.TotalDown + if serverStats.Stats.AvgDelay > 0 { + totalDelay += serverStats.Stats.AvgDelay + delayCount++ + } + } + ss.serviceStatusToday[serviceID].Up = totalUp + ss.serviceStatusToday[serviceID].Down = totalDown + if delayCount > 0 { + ss.serviceStatusToday[serviceID].Delay = totalDelay / float64(delayCount) + } + ms.TotalUp += totalUp + ms.TotalDown += totalDown + } + } else { + var mhs []model.ServiceHistory + DB.Where("created_at >= ? 
AND server_id = 0", today).Find(&mhs) + totalDelay := make(map[uint64]float64) + totalDelayCount := make(map[uint64]int) + for _, mh := range mhs { + ss.serviceStatusToday[mh.ServiceID].Up += mh.Up + ss.monthlyStatus[mh.ServiceID].TotalUp += mh.Up + ss.serviceStatusToday[mh.ServiceID].Down += mh.Down + ss.monthlyStatus[mh.ServiceID].TotalDown += mh.Down + totalDelay[mh.ServiceID] += mh.AvgDelay + totalDelayCount[mh.ServiceID]++ + } + for id, delay := range totalDelay { + ss.serviceStatusToday[id].Delay = delay / float64(totalDelayCount[id]) + } + } +} + func (ss *ServiceSentinel) Update(m *model.Service) error { ss.serviceResponseDataStoreLock.Lock() defer ss.serviceResponseDataStoreLock.Unlock() @@ -266,9 +345,9 @@ func (ss *ServiceSentinel) Update(m *model.Service) error { ss.monthlyStatus[m.ID] = &serviceResponseItem{ service: m, ServiceResponseItem: model.ServiceResponseItem{ - Delay: &[30]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - Up: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - Down: &[30]uint64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + Delay: &[30]float64{}, + Up: &[30]uint64{}, + Down: &[30]uint64{}, }, } if ss.serviceCurrentStatusData[m.ID] == nil { @@ -406,6 +485,7 @@ func (ss *ServiceSentinel) worker() { mh := r.Data if mh.Type == model.TaskTypeTCPPing || mh.Type == model.TaskTypeICMPPing { + // TCP/ICMP Ping 使用平均值计算后再写入 serviceTcpMap, ok := ss.serviceResponsePing[mh.GetId()] if !ok { serviceTcpMap = make(map[uint64]*pingStore) @@ -416,28 +496,56 @@ func (ss *ServiceSentinel) worker() { ts = &pingStore{} } ts.count++ - ts.ping = (ts.ping*float32(ts.count-1) + mh.Delay) / float32(ts.count) + ts.ping = (ts.ping*float64(ts.count-1) + float64(mh.Delay)) / float64(ts.count) + if mh.Successful { + ts.successCount++ + } if ts.count == Conf.AvgPingCount { - if err := 
DB.Create(&model.ServiceHistory{ - ServiceID: mh.GetId(), - AvgDelay: ts.ping, - Data: mh.Data, - ServerID: r.Reporter, - }).Error; err != nil { - log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err) + if TSDBEnabled() { + if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: mh.GetId(), + ServerID: r.Reporter, + Timestamp: time.Now(), + Delay: ts.ping, + Successful: ts.successCount*2 >= ts.count, + }); err != nil { + log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err) + } + } else { + if err := DB.Create(&model.ServiceHistory{ + ServiceID: mh.GetId(), + AvgDelay: ts.ping, + Data: mh.Data, + ServerID: r.Reporter, + }).Error; err != nil { + log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err) + } } ts.count = 0 - ts.ping = mh.Delay + ts.ping = 0 + ts.successCount = 0 } serviceTcpMap[r.Reporter] = ts + } else { + if TSDBEnabled() { + if err := TSDBShared.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: mh.GetId(), + ServerID: r.Reporter, + Timestamp: time.Now(), + Delay: float64(mh.Delay), + Successful: mh.Successful, + }); err != nil { + log.Printf("NEZHA>> Failed to save service monitor metrics to TSDB: %v", err) + } + } } ss.serviceResponseDataStoreLock.Lock() // 写入当天状态 if mh.Successful { ss.serviceStatusToday[mh.GetId()].Delay = (ss.serviceStatusToday[mh. 
- GetId()].Delay*float32(ss.serviceStatusToday[mh.GetId()].Up) + - mh.Delay) / float32(ss.serviceStatusToday[mh.GetId()].Up+1) + GetId()].Delay*float64(ss.serviceStatusToday[mh.GetId()].Up) + + float64(mh.Delay)) / float64(ss.serviceStatusToday[mh.GetId()].Up+1) ss.serviceStatusToday[mh.GetId()].Up++ } else { ss.serviceStatusToday[mh.GetId()].Down++ @@ -463,7 +571,7 @@ func (ss *ServiceSentinel) worker() { rd := ss.serviceResponseDataStore[mh.GetId()] if cs.Successful { rd.Up++ - rd.Delay = (rd.Delay*float32(rd.Up-1) + cs.Delay) / float32(rd.Up) + rd.Delay = (rd.Delay*float64(rd.Up-1) + float64(cs.Delay)) / float64(rd.Up) } else { rd.Down++ } @@ -482,20 +590,20 @@ func (ss *ServiceSentinel) worker() { stateCode = GetStatusCode(upPercent) } - // 数据持久化 if len(ss.serviceCurrentStatusData[mh.GetId()].result) == _CurrentStatusSize { ss.serviceCurrentStatusData[mh.GetId()].t = currentTime - rd := ss.serviceResponseDataStore[mh.GetId()] - if err := DB.Create(&model.ServiceHistory{ - ServiceID: mh.GetId(), - AvgDelay: rd.Delay, - Data: mh.Data, - Up: rd.Up, - Down: rd.Down, - }).Error; err != nil { - log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err) + if !TSDBEnabled() { + rd := ss.serviceResponseDataStore[mh.GetId()] + if err := DB.Create(&model.ServiceHistory{ + ServiceID: mh.GetId(), + AvgDelay: rd.Delay, + Data: mh.Data, + Up: rd.Up, + Down: rd.Down, + }).Error; err != nil { + log.Printf("NEZHA>> Failed to save service monitor metrics: %v", err) + } } - ss.serviceCurrentStatusData[mh.GetId()].result = ss.serviceCurrentStatusData[mh.GetId()].result[:0] } diff --git a/service/singleton/servicesentinel_test.go b/service/singleton/servicesentinel_test.go new file mode 100644 index 0000000..bc0492c --- /dev/null +++ b/service/singleton/servicesentinel_test.go @@ -0,0 +1,336 @@ +package singleton + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + 
"gorm.io/driver/sqlite" + "gorm.io/gorm" + + "github.com/nezhahq/nezha/model" + "github.com/nezhahq/nezha/pkg/tsdb" +) + +func newTestSentinel(serviceIDs []uint64) *ServiceSentinel { + ss := &ServiceSentinel{ + serviceStatusToday: make(map[uint64]*_TodayStatsOfService), + monthlyStatus: make(map[uint64]*serviceResponseItem), + } + for _, id := range serviceIDs { + ss.serviceStatusToday[id] = &_TodayStatsOfService{} + ss.monthlyStatus[id] = &serviceResponseItem{ + service: &model.Service{Common: model.Common{ID: id}}, + ServiceResponseItem: model.ServiceResponseItem{ + Delay: &[30]float64{}, + Up: &[30]uint64{}, + Down: &[30]uint64{}, + }, + } + } + return ss +} + +func setupTestDB(t *testing.T) func() { + t.Helper() + var err error + DB, err = gorm.Open(sqlite.Open(":memory:"), &gorm.Config{}) + require.NoError(t, err) + require.NoError(t, DB.AutoMigrate(model.ServiceHistory{})) + return func() { DB = nil } +} + +func setupTestTSDB(t *testing.T) (*tsdb.TSDB, func()) { + t.Helper() + tempDir, err := os.MkdirTemp("", "tsdb_sentinel_test") + require.NoError(t, err) + config := &tsdb.Config{ + DataPath: filepath.Join(tempDir, "tsdb"), + RetentionDays: 30, + MinFreeDiskSpaceGB: 1, + DedupInterval: time.Second, + } + db, err := tsdb.Open(config) + require.NoError(t, err) + TSDBShared = db + return db, func() { + db.Close() + TSDBShared = nil + os.RemoveAll(tempDir) + } +} + +func TestLoadMonthlyStatusFromDB(t *testing.T) { + cleanup := setupTestDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + serviceID := uint64(1) + ss := newTestSentinel([]uint64{serviceID}) + + DB.Create(&model.ServiceHistory{ + ServiceID: serviceID, + ServerID: 0, + AvgDelay: 10.0, + Up: 5, + Down: 1, + CreatedAt: today.Add(-25 * time.Hour), + }) + DB.Create(&model.ServiceHistory{ + ServiceID: serviceID, + ServerID: 0, + AvgDelay: 20.0, + Up: 3, + Down: 2, + CreatedAt: today.Add(-25 * time.Hour), + }) + 
DB.Create(&model.ServiceHistory{ + ServiceID: serviceID, + ServerID: 0, + AvgDelay: 30.0, + Up: 10, + Down: 0, + CreatedAt: today.Add(-49 * time.Hour), + }) + + ss.loadMonthlyStatusFromDB(today) + + ms := ss.monthlyStatus[serviceID] + + // day -1: index 27, two records with AvgDelay 10 and 20 + assert.InDelta(t, 15.0, ms.Delay[27], 0.01) + assert.Equal(t, uint64(8), ms.Up[27]) + assert.Equal(t, uint64(3), ms.Down[27]) + + // day -2: index 26 + assert.InDelta(t, 30.0, ms.Delay[26], 0.01) + assert.Equal(t, uint64(10), ms.Up[26]) + assert.Equal(t, uint64(0), ms.Down[26]) + + // totals + assert.Equal(t, uint64(18), ms.TotalUp) + assert.Equal(t, uint64(3), ms.TotalDown) + + // today (index 29) should be untouched + assert.Equal(t, float64(0), ms.Delay[29]) + assert.Equal(t, uint64(0), ms.Up[29]) +} + +func TestLoadMonthlyStatusFromDB_IgnoresToday(t *testing.T) { + cleanup := setupTestDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + serviceID := uint64(1) + ss := newTestSentinel([]uint64{serviceID}) + + DB.Create(&model.ServiceHistory{ + ServiceID: serviceID, + ServerID: 0, + AvgDelay: 50.0, + Up: 100, + Down: 5, + CreatedAt: today.Add(2 * time.Hour), + }) + + ss.loadMonthlyStatusFromDB(today) + + ms := ss.monthlyStatus[serviceID] + assert.Equal(t, uint64(0), ms.TotalUp) + assert.Equal(t, uint64(0), ms.TotalDown) +} + +func TestLoadMonthlyStatusFromDB_UnknownServiceIgnored(t *testing.T) { + cleanup := setupTestDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + ss := newTestSentinel([]uint64{1}) + + DB.Create(&model.ServiceHistory{ + ServiceID: 999, + ServerID: 0, + AvgDelay: 10.0, + Up: 5, + Down: 1, + CreatedAt: today.Add(-25 * time.Hour), + }) + + ss.loadMonthlyStatusFromDB(today) + + ms := ss.monthlyStatus[uint64(1)] + assert.Equal(t, uint64(0), ms.TotalUp) +} + +func TestLoadTodayStatsFromDB(t *testing.T) { 
+ cleanup := setupTestDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + serviceID := uint64(1) + ss := newTestSentinel([]uint64{serviceID}) + + DB.Create(&model.ServiceHistory{ + ServiceID: serviceID, + ServerID: 0, + AvgDelay: 10.0, + Up: 5, + Down: 1, + CreatedAt: today.Add(1 * time.Hour), + }) + DB.Create(&model.ServiceHistory{ + ServiceID: serviceID, + ServerID: 0, + AvgDelay: 30.0, + Up: 3, + Down: 2, + CreatedAt: today.Add(2 * time.Hour), + }) + + ss.loadTodayStats(today) + + st := ss.serviceStatusToday[serviceID] + assert.Equal(t, uint64(8), st.Up) + assert.Equal(t, uint64(3), st.Down) + assert.InDelta(t, 20.0, st.Delay, 0.01) + + ms := ss.monthlyStatus[serviceID] + assert.Equal(t, uint64(8), ms.TotalUp) + assert.Equal(t, uint64(3), ms.TotalDown) +} + +func TestLoadMonthlyStatusFromTSDB(t *testing.T) { + db, cleanup := setupTestTSDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + serviceID := uint64(1) + services := []*model.Service{{Common: model.Common{ID: serviceID}}} + ss := newTestSentinel([]uint64{serviceID}) + + yesterday := today.Add(-25 * time.Hour) + for i := 0; i < 5; i++ { + ts := yesterday.Add(time.Duration(i) * time.Minute) + require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: serviceID, + ServerID: 1, + Timestamp: ts, + Delay: float64(10 + i), + Successful: true, + })) + } + for i := 0; i < 3; i++ { + require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: serviceID, + ServerID: 1, + Timestamp: yesterday.Add(time.Duration(i+10) * time.Minute), + Delay: float64(20 + i), + Successful: false, + })) + } + + db.Flush() + + ss.loadMonthlyStatusFromTSDB(services, today) + + ms := ss.monthlyStatus[serviceID] + // day -1: dayIndex 28 + assert.Equal(t, uint64(5), ms.Up[28]) + assert.Equal(t, uint64(3), ms.Down[28]) + assert.Equal(t, uint64(5), 
ms.TotalUp) + assert.Equal(t, uint64(3), ms.TotalDown) + assert.Greater(t, ms.Delay[28], float64(0)) + + // today (index 29) should be untouched + assert.Equal(t, uint64(0), ms.Up[29]) +} + +func TestLoadTodayStatsFromTSDB(t *testing.T) { + db, cleanup := setupTestTSDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + serviceID := uint64(1) + ss := newTestSentinel([]uint64{serviceID}) + + now := time.Now() + for i := 0; i < 4; i++ { + ts := now.Add(-time.Duration(i) * time.Minute) + require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: serviceID, + ServerID: 1, + Timestamp: ts, + Delay: float64(10 + i), + Successful: true, + })) + } + for i := 0; i < 2; i++ { + ts := now.Add(-time.Duration(i+10) * time.Minute) + require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: serviceID, + ServerID: 1, + Timestamp: ts, + Delay: 0, + Successful: false, + })) + } + + db.Flush() + + ss.loadTodayStats(today) + + st := ss.serviceStatusToday[serviceID] + assert.Greater(t, st.Up, uint64(0)) + assert.Greater(t, st.Down, uint64(0)) + + ms := ss.monthlyStatus[serviceID] + assert.Equal(t, st.Up, ms.TotalUp) + assert.Equal(t, st.Down, ms.TotalDown) +} + +func TestLoadMonthlyStatusFromTSDB_NoDoubleCountToday(t *testing.T) { + db, cleanup := setupTestTSDB(t) + defer cleanup() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.UTC) + + serviceID := uint64(1) + services := []*model.Service{{Common: model.Common{ID: serviceID}}} + ss := newTestSentinel([]uint64{serviceID}) + + now := time.Now() + for i := 0; i < 5; i++ { + require.NoError(t, db.WriteServiceMetrics(&tsdb.ServiceMetrics{ + ServiceID: serviceID, + ServerID: 1, + Timestamp: now.Add(-time.Duration(i) * time.Minute), + Delay: 10.0, + Successful: true, + })) + } + db.Flush() + + ss.loadMonthlyStatusFromTSDB(services, today) + totalAfterMonthly := 
ss.monthlyStatus[serviceID].TotalUp + + ss.loadTodayStats(today) + totalAfterToday := ss.monthlyStatus[serviceID].TotalUp + + assert.Equal(t, totalAfterMonthly+ss.serviceStatusToday[serviceID].Up, totalAfterToday) +} diff --git a/service/singleton/singleton.go b/service/singleton/singleton.go index e586d0b..b04be1a 100644 --- a/service/singleton/singleton.go +++ b/service/singleton/singleton.go @@ -87,12 +87,13 @@ func InitDBFromPath(path string) error { } err = DB.AutoMigrate(model.Server{}, model.User{}, model.ServerGroup{}, model.NotificationGroup{}, model.Notification{}, model.AlertRule{}, model.Service{}, model.NotificationGroupNotification{}, - model.ServiceHistory{}, model.Cron{}, model.Transfer{}, model.ServerGroupServer{}, + model.Cron{}, model.Transfer{}, model.ServerGroupServer{}, model.NAT{}, model.DDNSProfile{}, model.NotificationGroupNotification{}, model.WAF{}, model.Oauth2Bind{}) if err != nil { return err } + return nil } @@ -130,14 +131,9 @@ func RecordTransferHourlyUsage(servers ...*model.Server) { log.Printf("NEZHA>> Saved traffic metrics to database. Affected %d row(s), Error: %v", len(txs), DB.Create(txs).Error) } -// CleanServiceHistory 清理无效或过时的 监控记录 和 流量记录 -func CleanServiceHistory() { - // 清理已被删除的服务器的监控记录与流量记录 - DB.Unscoped().Delete(&model.ServiceHistory{}, "created_at < ? OR service_id NOT IN (SELECT `id` FROM services)", time.Now().AddDate(0, 0, -30)) - // 由于网络监控记录的数据较多,并且前端仅使用了 1 天的数据 - // 考虑到 sqlite 数据量问题,仅保留一天数据, - // server_id = 0 的数据会用于/service页面的可用性展示 - DB.Unscoped().Delete(&model.ServiceHistory{}, "(created_at < ? 
AND server_id != 0) OR service_id NOT IN (SELECT `id` FROM services)", time.Now().AddDate(0, 0, -1)) +// CleanMonitorHistory 清理流量记录(TSDB 有自己的保留策略) +func CleanMonitorHistory() { + // 清理已被删除的服务器的流量记录 DB.Unscoped().Delete(&model.Transfer{}, "server_id NOT IN (SELECT `id` FROM servers)") // 计算可清理流量记录的时长 var allServerKeep time.Time @@ -179,6 +175,28 @@ func CleanServiceHistory() { } } +// PerformMaintenance 执行系统维护(SQLite VACUUM 和 TSDB 维护) +func PerformMaintenance() { + log.Println("NEZHA>> Starting system maintenance...") + + // 1. SQLite 维护 + if DB != nil { + log.Println("NEZHA>> SQLite: Starting VACUUM...") + if err := DB.Exec("VACUUM").Error; err != nil { + log.Printf("NEZHA>> SQLite: VACUUM failed: %v", err) + } else { + log.Println("NEZHA>> SQLite: VACUUM completed") + } + } + + // 2. TSDB 维护 + if TSDBEnabled() { + TSDBShared.Maintenance() + } + + log.Println("NEZHA>> System maintenance completed") +} + // IPDesensitize 根据设置选择是否对IP进行打码处理 返回处理后的IP(关闭打码则返回原IP) func IPDesensitize(ip string) string { if Conf.EnablePlainIPInNotification { diff --git a/service/singleton/tsdb.go b/service/singleton/tsdb.go new file mode 100644 index 0000000..745e83d --- /dev/null +++ b/service/singleton/tsdb.go @@ -0,0 +1,73 @@ +package singleton + +import ( + "log" + "time" + + "github.com/nezhahq/nezha/model" + "github.com/nezhahq/nezha/pkg/tsdb" +) + +var TSDBShared *tsdb.TSDB + +func InitTSDB() error { + config := &tsdb.Config{ + RetentionDays: 30, + MinFreeDiskSpaceGB: 1, + MaxMemoryMB: 256, + } + + if Conf.TSDB.DataPath != "" { + config.DataPath = Conf.TSDB.DataPath + } + if Conf.TSDB.RetentionDays > 0 { + config.RetentionDays = Conf.TSDB.RetentionDays + } + if Conf.TSDB.MinFreeDiskSpaceGB > 0 { + config.MinFreeDiskSpaceGB = Conf.TSDB.MinFreeDiskSpaceGB + } + if Conf.TSDB.MaxMemoryMB > 0 { + config.MaxMemoryMB = Conf.TSDB.MaxMemoryMB + } + if Conf.TSDB.WriteBufferSize > 0 { + config.WriteBufferSize = Conf.TSDB.WriteBufferSize + } + if Conf.TSDB.WriteBufferFlushInterval > 0 { + 
config.WriteBufferFlushInterval = time.Duration(Conf.TSDB.WriteBufferFlushInterval) * time.Second + } + + if !config.Enabled() { + log.Println("NEZHA>> TSDB is disabled (tsdb.data_path not configured)") + if DB != nil { + return DB.AutoMigrate(model.ServiceHistory{}) + } + return nil + } + + var err error + TSDBShared, err = tsdb.Open(config) + if err != nil { + return err + } + + log.Println("NEZHA>> TSDB initialized successfully") + + if DB != nil && DB.Migrator().HasTable("service_histories") { + log.Println("NEZHA>> Dropping legacy service_histories table (TSDB is now enabled). Historical data will NOT be migrated.") + if err := DB.Migrator().DropTable("service_histories"); err != nil { + log.Printf("NEZHA>> Warning: failed to drop service_histories table: %v", err) + } + } + + return nil +} + +func TSDBEnabled() bool { + return TSDBShared != nil && !TSDBShared.IsClosed() +} + +func CloseTSDB() { + if TSDBShared != nil { + TSDBShared.Close() + } +}