From e17e8aa3c99eedf8fdf49fef54d56c99780b2493 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 22 Apr 2024 13:11:50 +0300 Subject: [PATCH] feat(coderd/database): keep only 1 day of `workspace_agent_stats` after rollup (#12674) --- cli/server.go | 2 +- coderd/database/dbmem/dbmem.go | 58 +++++++++++++- coderd/database/dbpurge/dbpurge.go | 1 - coderd/database/dbpurge/dbpurge_test.go | 79 ++++++++++++++++--- coderd/database/queries.sql.go | 30 ++++++- .../database/queries/workspaceagentstats.sql | 30 ++++++- 6 files changed, 184 insertions(+), 16 deletions(-) diff --git a/cli/server.go b/cli/server.go index d278dbea35..5b804f9278 100644 --- a/cli/server.go +++ b/cli/server.go @@ -965,7 +965,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer shutdownConns() // Ensures that old database entries are cleaned up over time! - purger := dbpurge.New(ctx, logger, options.Database) + purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database) defer purger.Close() // Updates workspace usage diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 9ce98ff14c..fcc3140133 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1506,13 +1506,65 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error { q.mutex.Lock() defer q.mutex.Unlock() + /* + DELETE FROM + workspace_agent_stats + WHERE + created_at < ( + SELECT + COALESCE( + -- When generating initial template usage stats, all the + -- raw agent stats are needed, after that only ~30 mins + -- from last rollup is needed. Deployment stats seem to + -- use between 15 mins and 1 hour of data. We keep a + -- little bit more (1 day) just in case. + MAX(start_time) - '1 days'::interval, + -- Fall back to 6 months ago if there are no template + -- usage stats so that we don't delete the data before + -- it's rolled up. + NOW() - '6 months'::interval + ) + FROM + template_usage_stats + ) + AND created_at < ( + -- Delete at most in batches of 3 days (with a batch size of 3 days, we + -- can clear out the previous 6 months of data in ~60 iterations) whilst + -- keeping the DB load relatively low. + SELECT + COALESCE(MIN(created_at) + '3 days'::interval, NOW()) + FROM + workspace_agent_stats + ); + */ + now := dbtime.Now() - sixMonthInterval := 6 * 30 * 24 * time.Hour - sixMonthsAgo := now.Add(-sixMonthInterval) + var limit time.Time + // MAX + for _, stat := range q.templateUsageStats { + if stat.StartTime.After(limit) { + limit = stat.StartTime.AddDate(0, 0, -1) + } + } + // COALESCE + if limit.IsZero() { + limit = now.AddDate(0, -6, 0) + } var validStats []database.WorkspaceAgentStat + var batchLimit time.Time for _, stat := range q.workspaceAgentStats { - if stat.CreatedAt.Before(sixMonthsAgo) { + if batchLimit.IsZero() || stat.CreatedAt.Before(batchLimit) { + batchLimit = stat.CreatedAt + } + } + if batchLimit.IsZero() { + batchLimit = time.Now() + } else { + batchLimit = batchLimit.AddDate(0, 0, 3) + } + for _, stat := range q.workspaceAgentStats { + if stat.CreatedAt.Before(limit) && stat.CreatedAt.Before(batchLimit) { continue } validStats = append(validStats, stat) diff --git a/coderd/database/dbpurge/dbpurge.go b/coderd/database/dbpurge/dbpurge.go index c4b5a609a3..d3fc56a8c5 100644 --- a/coderd/database/dbpurge/dbpurge.go +++ b/coderd/database/dbpurge/dbpurge.go @@ -24,7 +24,6 @@ const ( // This is for cleaning up old, unused resources from the database that take up space. func New(ctx context.Context, logger slog.Logger, db database.Store) io.Closer { closed := make(chan struct{}) - logger = logger.Named("dbpurge") ctx, cancelFunc := context.WithCancel(ctx) //nolint:gocritic // The system purges old db records without user input. diff --git a/coderd/database/dbpurge/dbpurge_test.go b/coderd/database/dbpurge/dbpurge_test.go index 9e949f270d..4255409ba6 100644 --- a/coderd/database/dbpurge/dbpurge_test.go +++ b/coderd/database/dbpurge/dbpurge_test.go @@ -11,12 +11,14 @@ import ( "go.uber.org/goleak" "golang.org/x/exp/slices" + "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbmem" "github.com/coder/coder/v2/coderd/database/dbpurge" + "github.com/coder/coder/v2/coderd/database/dbrollup" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/provisionerd/proto" @@ -40,27 +42,62 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) { t.Parallel() db, _ := dbtestutil.NewDB(t) - logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug) + + now := dbtime.Now() + + defer func() { + if t.Failed() { + t.Logf("Test failed, printing rows...") + ctx := testutil.Context(t, testutil.WaitShort) + wasRows, err := db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0)) + if err == nil { + for _, row := range wasRows { + t.Logf("workspace agent stat: %v", row) + } + } + tusRows, err := db.GetTemplateUsageStats(context.Background(), database.GetTemplateUsageStatsParams{ + StartTime: now.AddDate(0, -7, 0), + EndTime: now, + }) + if err == nil { + for _, row := range tusRows { + t.Logf("template usage stat: %v", row) + } + } + } + }() ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) defer cancel() - now := dbtime.Now() - // given // Let's use RxBytes to identify stat entries. // Stat inserted 6 months + 1 hour ago, should be deleted. first := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{ - CreatedAt: now.Add(-6*30*24*time.Hour - time.Hour), + CreatedAt: now.AddDate(0, -6, 0).Add(-time.Hour), + ConnectionCount: 1, ConnectionMedianLatencyMS: 1, RxBytes: 1111, + SessionCountSSH: 1, }) - // Stat inserted 6 months - 1 hour ago, should not be deleted. + // Stat inserted 6 months - 1 hour ago, should not be deleted before rollup. second := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{ - CreatedAt: now.Add(-5*30*24*time.Hour + time.Hour), + CreatedAt: now.AddDate(0, -6, 0).Add(time.Hour), + ConnectionCount: 1, ConnectionMedianLatencyMS: 1, RxBytes: 2222, + SessionCountSSH: 1, + }) + + // Stat inserted 6 months - 1 day - 2 hour ago, should not be deleted at all. + third := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{ + CreatedAt: now.AddDate(0, -6, 0).AddDate(0, 0, 1).Add(2 * time.Hour), + ConnectionCount: 1, + ConnectionMedianLatencyMS: 1, + RxBytes: 3333, + SessionCountSSH: 1, }) // when @@ -70,15 +107,39 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) { // then var stats []database.GetWorkspaceAgentStatsRow var err error - require.Eventually(t, func() bool { + require.Eventuallyf(t, func() bool { // Query all stats created not earlier than 7 months ago - stats, err = db.GetWorkspaceAgentStats(ctx, now.Add(-7*30*24*time.Hour)) + stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0)) if err != nil { return false } return !containsWorkspaceAgentStat(stats, first) && containsWorkspaceAgentStat(stats, second) - }, testutil.WaitShort, testutil.IntervalFast, stats) + }, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats: %v", stats) + + // when + events := make(chan dbrollup.Event) + rolluper := dbrollup.New(logger, db, dbrollup.WithEventChannel(events)) + defer rolluper.Close() + + _, _ = <-events, <-events + + // Start a new purger to immediately trigger delete after rollup. + _ = closer.Close() + closer = dbpurge.New(ctx, logger, db) + defer closer.Close() + + // then + require.Eventuallyf(t, func() bool { + // Query all stats created not earlier than 7 months ago + stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0)) + if err != nil { + return false + } + return !containsWorkspaceAgentStat(stats, first) && + !containsWorkspaceAgentStat(stats, second) && + containsWorkspaceAgentStat(stats, third) + }, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats after rollup: %v", stats) } func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 5b2b54929d..8594436116 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -10111,7 +10111,35 @@ func (q *sqlQuerier) UpdateWorkspaceAgentStartupByID(ctx context.Context, arg Up } const deleteOldWorkspaceAgentStats = `-- name: DeleteOldWorkspaceAgentStats :exec -DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days' +DELETE FROM + workspace_agent_stats +WHERE + created_at < ( + SELECT + COALESCE( + -- When generating initial template usage stats, all the + -- raw agent stats are needed, after that only ~30 mins + -- from last rollup is needed. Deployment stats seem to + -- use between 15 mins and 1 hour of data. We keep a + -- little bit more (1 day) just in case. + MAX(start_time) - '1 days'::interval, + -- Fall back to 6 months ago if there are no template + -- usage stats so that we don't delete the data before + -- it's rolled up. + NOW() - '6 months'::interval + ) + FROM + template_usage_stats + ) + AND created_at < ( + -- Delete at most in batches of 3 days (with a batch size of 3 days, we + -- can clear out the previous 6 months of data in ~60 iterations) whilst + -- keeping the DB load relatively low. + SELECT + COALESCE(MIN(created_at) + '3 days'::interval, NOW()) + FROM + workspace_agent_stats + ) ` func (q *sqlQuerier) DeleteOldWorkspaceAgentStats(ctx context.Context) error { diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 4b7f86fba4..5623fd2cf9 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -66,7 +66,35 @@ ORDER BY date ASC; -- name: DeleteOldWorkspaceAgentStats :exec -DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days'; +DELETE FROM + workspace_agent_stats +WHERE + created_at < ( + SELECT + COALESCE( + -- When generating initial template usage stats, all the + -- raw agent stats are needed, after that only ~30 mins + -- from last rollup is needed. Deployment stats seem to + -- use between 15 mins and 1 hour of data. We keep a + -- little bit more (1 day) just in case. + MAX(start_time) - '1 days'::interval, + -- Fall back to 6 months ago if there are no template + -- usage stats so that we don't delete the data before + -- it's rolled up. + NOW() - '6 months'::interval + ) + FROM + template_usage_stats + ) + AND created_at < ( + -- Delete at most in batches of 3 days (with a batch size of 3 days, we + -- can clear out the previous 6 months of data in ~60 iterations) whilst + -- keeping the DB load relatively low. + SELECT + COALESCE(MIN(created_at) + '3 days'::interval, NOW()) + FROM + workspace_agent_stats + ); -- name: GetDeploymentWorkspaceAgentStats :one WITH agent_stats AS (