feat(coderd/database): keep only 1 day of `workspace_agent_stats` after rollup (#12674)

This commit is contained in:
Mathias Fredriksson 2024-04-22 13:11:50 +03:00 committed by GitHub
parent 4a6693a171
commit e17e8aa3c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 184 additions and 16 deletions

View File

@ -965,7 +965,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
defer shutdownConns()
// Ensures that old database entries are cleaned up over time!
purger := dbpurge.New(ctx, logger, options.Database)
purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database)
defer purger.Close()
// Updates workspace usage

View File

@ -1506,13 +1506,65 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error {
q.mutex.Lock()
defer q.mutex.Unlock()
/*
DELETE FROM
workspace_agent_stats
WHERE
created_at < (
SELECT
COALESCE(
-- When generating initial template usage stats, all the
-- raw agent stats are needed, after that only ~30 mins
-- from last rollup is needed. Deployment stats seem to
-- use between 15 mins and 1 hour of data. We keep a
-- little bit more (1 day) just in case.
MAX(start_time) - '1 days'::interval,
-- Fall back to 6 months ago if there are no template
-- usage stats so that we don't delete the data before
-- it's rolled up.
NOW() - '6 months'::interval
)
FROM
template_usage_stats
)
AND created_at < (
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
-- can clear out the previous 6 months of data in ~60 iterations) whilst
-- keeping the DB load relatively low.
SELECT
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
FROM
workspace_agent_stats
);
*/
now := dbtime.Now()
sixMonthInterval := 6 * 30 * 24 * time.Hour
sixMonthsAgo := now.Add(-sixMonthInterval)
var limit time.Time
// MAX
for _, stat := range q.templateUsageStats {
if stat.StartTime.After(limit) {
limit = stat.StartTime.AddDate(0, 0, -1)
}
}
// COALESCE
if limit.IsZero() {
limit = now.AddDate(0, -6, 0)
}
var validStats []database.WorkspaceAgentStat
var batchLimit time.Time
for _, stat := range q.workspaceAgentStats {
if stat.CreatedAt.Before(sixMonthsAgo) {
if batchLimit.IsZero() || stat.CreatedAt.Before(batchLimit) {
batchLimit = stat.CreatedAt
}
}
if batchLimit.IsZero() {
batchLimit = time.Now()
} else {
batchLimit = batchLimit.AddDate(0, 0, 3)
}
for _, stat := range q.workspaceAgentStats {
if stat.CreatedAt.Before(limit) && stat.CreatedAt.Before(batchLimit) {
continue
}
validStats = append(validStats, stat)

View File

@ -24,7 +24,6 @@ const (
// This is for cleaning up old, unused resources from the database that take up space.
func New(ctx context.Context, logger slog.Logger, db database.Store) io.Closer {
closed := make(chan struct{})
logger = logger.Named("dbpurge")
ctx, cancelFunc := context.WithCancel(ctx)
//nolint:gocritic // The system purges old db records without user input.

View File

@ -11,12 +11,14 @@ import (
"go.uber.org/goleak"
"golang.org/x/exp/slices"
"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbmem"
"github.com/coder/coder/v2/coderd/database/dbpurge"
"github.com/coder/coder/v2/coderd/database/dbrollup"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/provisionerd/proto"
@ -40,27 +42,62 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
t.Parallel()
db, _ := dbtestutil.NewDB(t)
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
now := dbtime.Now()
defer func() {
if t.Failed() {
t.Logf("Test failed, printing rows...")
ctx := testutil.Context(t, testutil.WaitShort)
wasRows, err := db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
if err == nil {
for _, row := range wasRows {
t.Logf("workspace agent stat: %v", row)
}
}
tusRows, err := db.GetTemplateUsageStats(context.Background(), database.GetTemplateUsageStatsParams{
StartTime: now.AddDate(0, -7, 0),
EndTime: now,
})
if err == nil {
for _, row := range tusRows {
t.Logf("template usage stat: %v", row)
}
}
}
}()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
now := dbtime.Now()
// given
// Let's use RxBytes to identify stat entries.
// Stat inserted 6 months + 1 hour ago, should be deleted.
first := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
CreatedAt: now.Add(-6*30*24*time.Hour - time.Hour),
CreatedAt: now.AddDate(0, -6, 0).Add(-time.Hour),
ConnectionCount: 1,
ConnectionMedianLatencyMS: 1,
RxBytes: 1111,
SessionCountSSH: 1,
})
// Stat inserted 6 months - 1 hour ago, should not be deleted.
// Stat inserted 6 months - 1 hour ago, should not be deleted before rollup.
second := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
CreatedAt: now.Add(-5*30*24*time.Hour + time.Hour),
CreatedAt: now.AddDate(0, -6, 0).Add(time.Hour),
ConnectionCount: 1,
ConnectionMedianLatencyMS: 1,
RxBytes: 2222,
SessionCountSSH: 1,
})
// Stat inserted 6 months - 1 day - 2 hour ago, should not be deleted at all.
third := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
CreatedAt: now.AddDate(0, -6, 0).AddDate(0, 0, 1).Add(2 * time.Hour),
ConnectionCount: 1,
ConnectionMedianLatencyMS: 1,
RxBytes: 3333,
SessionCountSSH: 1,
})
// when
@ -70,15 +107,39 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
// then
var stats []database.GetWorkspaceAgentStatsRow
var err error
require.Eventually(t, func() bool {
require.Eventuallyf(t, func() bool {
// Query all stats created not earlier than 7 months ago
stats, err = db.GetWorkspaceAgentStats(ctx, now.Add(-7*30*24*time.Hour))
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
if err != nil {
return false
}
return !containsWorkspaceAgentStat(stats, first) &&
containsWorkspaceAgentStat(stats, second)
}, testutil.WaitShort, testutil.IntervalFast, stats)
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats: %v", stats)
// when
events := make(chan dbrollup.Event)
rolluper := dbrollup.New(logger, db, dbrollup.WithEventChannel(events))
defer rolluper.Close()
_, _ = <-events, <-events
// Start a new purger to immediately trigger delete after rollup.
_ = closer.Close()
closer = dbpurge.New(ctx, logger, db)
defer closer.Close()
// then
require.Eventuallyf(t, func() bool {
// Query all stats created not earlier than 7 months ago
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
if err != nil {
return false
}
return !containsWorkspaceAgentStat(stats, first) &&
!containsWorkspaceAgentStat(stats, second) &&
containsWorkspaceAgentStat(stats, third)
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats after rollup: %v", stats)
}
func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool {

View File

@ -10111,7 +10111,35 @@ func (q *sqlQuerier) UpdateWorkspaceAgentStartupByID(ctx context.Context, arg Up
}
const deleteOldWorkspaceAgentStats = `-- name: DeleteOldWorkspaceAgentStats :exec
DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days'
DELETE FROM
workspace_agent_stats
WHERE
created_at < (
SELECT
COALESCE(
-- When generating initial template usage stats, all the
-- raw agent stats are needed, after that only ~30 mins
-- from last rollup is needed. Deployment stats seem to
-- use between 15 mins and 1 hour of data. We keep a
-- little bit more (1 day) just in case.
MAX(start_time) - '1 days'::interval,
-- Fall back to 6 months ago if there are no template
-- usage stats so that we don't delete the data before
-- it's rolled up.
NOW() - '6 months'::interval
)
FROM
template_usage_stats
)
AND created_at < (
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
-- can clear out the previous 6 months of data in ~60 iterations) whilst
-- keeping the DB load relatively low.
SELECT
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
FROM
workspace_agent_stats
)
`
func (q *sqlQuerier) DeleteOldWorkspaceAgentStats(ctx context.Context) error {

View File

@ -66,7 +66,35 @@ ORDER BY
date ASC;
-- name: DeleteOldWorkspaceAgentStats :exec
DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days';
DELETE FROM
workspace_agent_stats
WHERE
created_at < (
SELECT
COALESCE(
-- When generating initial template usage stats, all the
-- raw agent stats are needed, after that only ~30 mins
-- from last rollup is needed. Deployment stats seem to
-- use between 15 mins and 1 hour of data. We keep a
-- little bit more (1 day) just in case.
MAX(start_time) - '1 days'::interval,
-- Fall back to 6 months ago if there are no template
-- usage stats so that we don't delete the data before
-- it's rolled up.
NOW() - '6 months'::interval
)
FROM
template_usage_stats
)
AND created_at < (
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
-- can clear out the previous 6 months of data in ~60 iterations) whilst
-- keeping the DB load relatively low.
SELECT
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
FROM
workspace_agent_stats
);
-- name: GetDeploymentWorkspaceAgentStats :one
WITH agent_stats AS (