mirror of https://github.com/coder/coder.git
feat(coderd/database): keep only 1 day of `workspace_agent_stats` after rollup (#12674)
This commit is contained in:
parent
4a6693a171
commit
e17e8aa3c9
|
@ -965,7 +965,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
|
|||
defer shutdownConns()
|
||||
|
||||
// Ensures that old database entries are cleaned up over time!
|
||||
purger := dbpurge.New(ctx, logger, options.Database)
|
||||
purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database)
|
||||
defer purger.Close()
|
||||
|
||||
// Updates workspace usage
|
||||
|
|
|
@ -1506,13 +1506,65 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error {
|
|||
q.mutex.Lock()
|
||||
defer q.mutex.Unlock()
|
||||
|
||||
/*
|
||||
DELETE FROM
|
||||
workspace_agent_stats
|
||||
WHERE
|
||||
created_at < (
|
||||
SELECT
|
||||
COALESCE(
|
||||
-- When generating initial template usage stats, all the
|
||||
-- raw agent stats are needed, after that only ~30 mins
|
||||
-- from last rollup is needed. Deployment stats seem to
|
||||
-- use between 15 mins and 1 hour of data. We keep a
|
||||
-- little bit more (1 day) just in case.
|
||||
MAX(start_time) - '1 days'::interval,
|
||||
-- Fall back to 6 months ago if there are no template
|
||||
-- usage stats so that we don't delete the data before
|
||||
-- it's rolled up.
|
||||
NOW() - '6 months'::interval
|
||||
)
|
||||
FROM
|
||||
template_usage_stats
|
||||
)
|
||||
AND created_at < (
|
||||
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
|
||||
-- can clear out the previous 6 months of data in ~60 iterations) whilst
|
||||
-- keeping the DB load relatively low.
|
||||
SELECT
|
||||
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
|
||||
FROM
|
||||
workspace_agent_stats
|
||||
);
|
||||
*/
|
||||
|
||||
now := dbtime.Now()
|
||||
sixMonthInterval := 6 * 30 * 24 * time.Hour
|
||||
sixMonthsAgo := now.Add(-sixMonthInterval)
|
||||
var limit time.Time
|
||||
// MAX
|
||||
for _, stat := range q.templateUsageStats {
|
||||
if stat.StartTime.After(limit) {
|
||||
limit = stat.StartTime.AddDate(0, 0, -1)
|
||||
}
|
||||
}
|
||||
// COALESCE
|
||||
if limit.IsZero() {
|
||||
limit = now.AddDate(0, -6, 0)
|
||||
}
|
||||
|
||||
var validStats []database.WorkspaceAgentStat
|
||||
var batchLimit time.Time
|
||||
for _, stat := range q.workspaceAgentStats {
|
||||
if stat.CreatedAt.Before(sixMonthsAgo) {
|
||||
if batchLimit.IsZero() || stat.CreatedAt.Before(batchLimit) {
|
||||
batchLimit = stat.CreatedAt
|
||||
}
|
||||
}
|
||||
if batchLimit.IsZero() {
|
||||
batchLimit = time.Now()
|
||||
} else {
|
||||
batchLimit = batchLimit.AddDate(0, 0, 3)
|
||||
}
|
||||
for _, stat := range q.workspaceAgentStats {
|
||||
if stat.CreatedAt.Before(limit) && stat.CreatedAt.Before(batchLimit) {
|
||||
continue
|
||||
}
|
||||
validStats = append(validStats, stat)
|
||||
|
|
|
@ -24,7 +24,6 @@ const (
|
|||
// This is for cleaning up old, unused resources from the database that take up space.
|
||||
func New(ctx context.Context, logger slog.Logger, db database.Store) io.Closer {
|
||||
closed := make(chan struct{})
|
||||
logger = logger.Named("dbpurge")
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(ctx)
|
||||
//nolint:gocritic // The system purges old db records without user input.
|
||||
|
|
|
@ -11,12 +11,14 @@ import (
|
|||
"go.uber.org/goleak"
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
"cdr.dev/slog"
|
||||
"cdr.dev/slog/sloggers/slogtest"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbgen"
|
||||
"github.com/coder/coder/v2/coderd/database/dbmem"
|
||||
"github.com/coder/coder/v2/coderd/database/dbpurge"
|
||||
"github.com/coder/coder/v2/coderd/database/dbrollup"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
"github.com/coder/coder/v2/provisionerd/proto"
|
||||
|
@ -40,27 +42,62 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
|
|||
t.Parallel()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
|
||||
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
|
||||
|
||||
now := dbtime.Now()
|
||||
|
||||
defer func() {
|
||||
if t.Failed() {
|
||||
t.Logf("Test failed, printing rows...")
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
wasRows, err := db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
|
||||
if err == nil {
|
||||
for _, row := range wasRows {
|
||||
t.Logf("workspace agent stat: %v", row)
|
||||
}
|
||||
}
|
||||
tusRows, err := db.GetTemplateUsageStats(context.Background(), database.GetTemplateUsageStatsParams{
|
||||
StartTime: now.AddDate(0, -7, 0),
|
||||
EndTime: now,
|
||||
})
|
||||
if err == nil {
|
||||
for _, row := range tusRows {
|
||||
t.Logf("template usage stat: %v", row)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
|
||||
defer cancel()
|
||||
|
||||
now := dbtime.Now()
|
||||
|
||||
// given
|
||||
// Let's use RxBytes to identify stat entries.
|
||||
// Stat inserted 6 months + 1 hour ago, should be deleted.
|
||||
first := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
|
||||
CreatedAt: now.Add(-6*30*24*time.Hour - time.Hour),
|
||||
CreatedAt: now.AddDate(0, -6, 0).Add(-time.Hour),
|
||||
ConnectionCount: 1,
|
||||
ConnectionMedianLatencyMS: 1,
|
||||
RxBytes: 1111,
|
||||
SessionCountSSH: 1,
|
||||
})
|
||||
|
||||
// Stat inserted 6 months - 1 hour ago, should not be deleted.
|
||||
// Stat inserted 6 months - 1 hour ago, should not be deleted before rollup.
|
||||
second := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
|
||||
CreatedAt: now.Add(-5*30*24*time.Hour + time.Hour),
|
||||
CreatedAt: now.AddDate(0, -6, 0).Add(time.Hour),
|
||||
ConnectionCount: 1,
|
||||
ConnectionMedianLatencyMS: 1,
|
||||
RxBytes: 2222,
|
||||
SessionCountSSH: 1,
|
||||
})
|
||||
|
||||
// Stat inserted 6 months - 1 day - 2 hours ago, should not be deleted at all.
|
||||
third := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
|
||||
CreatedAt: now.AddDate(0, -6, 0).AddDate(0, 0, 1).Add(2 * time.Hour),
|
||||
ConnectionCount: 1,
|
||||
ConnectionMedianLatencyMS: 1,
|
||||
RxBytes: 3333,
|
||||
SessionCountSSH: 1,
|
||||
})
|
||||
|
||||
// when
|
||||
|
@ -70,15 +107,39 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
|
|||
// then
|
||||
var stats []database.GetWorkspaceAgentStatsRow
|
||||
var err error
|
||||
require.Eventually(t, func() bool {
|
||||
require.Eventuallyf(t, func() bool {
|
||||
// Query all stats created not earlier than 7 months ago
|
||||
stats, err = db.GetWorkspaceAgentStats(ctx, now.Add(-7*30*24*time.Hour))
|
||||
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return !containsWorkspaceAgentStat(stats, first) &&
|
||||
containsWorkspaceAgentStat(stats, second)
|
||||
}, testutil.WaitShort, testutil.IntervalFast, stats)
|
||||
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats: %v", stats)
|
||||
|
||||
// when
|
||||
events := make(chan dbrollup.Event)
|
||||
rolluper := dbrollup.New(logger, db, dbrollup.WithEventChannel(events))
|
||||
defer rolluper.Close()
|
||||
|
||||
_, _ = <-events, <-events
|
||||
|
||||
// Start a new purger to immediately trigger delete after rollup.
|
||||
_ = closer.Close()
|
||||
closer = dbpurge.New(ctx, logger, db)
|
||||
defer closer.Close()
|
||||
|
||||
// then
|
||||
require.Eventuallyf(t, func() bool {
|
||||
// Query all stats created not earlier than 7 months ago
|
||||
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return !containsWorkspaceAgentStat(stats, first) &&
|
||||
!containsWorkspaceAgentStat(stats, second) &&
|
||||
containsWorkspaceAgentStat(stats, third)
|
||||
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats after rollup: %v", stats)
|
||||
}
|
||||
|
||||
func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool {
|
||||
|
|
|
@ -10111,7 +10111,35 @@ func (q *sqlQuerier) UpdateWorkspaceAgentStartupByID(ctx context.Context, arg Up
|
|||
}
|
||||
|
||||
const deleteOldWorkspaceAgentStats = `-- name: DeleteOldWorkspaceAgentStats :exec
|
||||
DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days'
|
||||
DELETE FROM
|
||||
workspace_agent_stats
|
||||
WHERE
|
||||
created_at < (
|
||||
SELECT
|
||||
COALESCE(
|
||||
-- When generating initial template usage stats, all the
|
||||
-- raw agent stats are needed, after that only ~30 mins
|
||||
-- from last rollup is needed. Deployment stats seem to
|
||||
-- use between 15 mins and 1 hour of data. We keep a
|
||||
-- little bit more (1 day) just in case.
|
||||
MAX(start_time) - '1 days'::interval,
|
||||
-- Fall back to 6 months ago if there are no template
|
||||
-- usage stats so that we don't delete the data before
|
||||
-- it's rolled up.
|
||||
NOW() - '6 months'::interval
|
||||
)
|
||||
FROM
|
||||
template_usage_stats
|
||||
)
|
||||
AND created_at < (
|
||||
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
|
||||
-- can clear out the previous 6 months of data in ~60 iterations) whilst
|
||||
-- keeping the DB load relatively low.
|
||||
SELECT
|
||||
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
|
||||
FROM
|
||||
workspace_agent_stats
|
||||
)
|
||||
`
|
||||
|
||||
func (q *sqlQuerier) DeleteOldWorkspaceAgentStats(ctx context.Context) error {
|
||||
|
|
|
@ -66,7 +66,35 @@ ORDER BY
|
|||
date ASC;
|
||||
|
||||
-- name: DeleteOldWorkspaceAgentStats :exec
|
||||
DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days';
|
||||
DELETE FROM
|
||||
workspace_agent_stats
|
||||
WHERE
|
||||
created_at < (
|
||||
SELECT
|
||||
COALESCE(
|
||||
-- When generating initial template usage stats, all the
|
||||
-- raw agent stats are needed, after that only ~30 mins
|
||||
-- from last rollup is needed. Deployment stats seem to
|
||||
-- use between 15 mins and 1 hour of data. We keep a
|
||||
-- little bit more (1 day) just in case.
|
||||
MAX(start_time) - '1 days'::interval,
|
||||
-- Fall back to 6 months ago if there are no template
|
||||
-- usage stats so that we don't delete the data before
|
||||
-- it's rolled up.
|
||||
NOW() - '6 months'::interval
|
||||
)
|
||||
FROM
|
||||
template_usage_stats
|
||||
)
|
||||
AND created_at < (
|
||||
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
|
||||
-- can clear out the previous 6 months of data in ~60 iterations) whilst
|
||||
-- keeping the DB load relatively low.
|
||||
SELECT
|
||||
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
|
||||
FROM
|
||||
workspace_agent_stats
|
||||
);
|
||||
|
||||
-- name: GetDeploymentWorkspaceAgentStats :one
|
||||
WITH agent_stats AS (
|
||||
|
|
Loading…
Reference in New Issue