From b183236482ffece232895a402e2c1a66e9fc4635 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson
Date: Mon, 25 Mar 2024 17:42:02 +0200
Subject: [PATCH] feat(coderd/database): use `template_usage_stats` in
 `*ByTemplate` insights queries (#12668)

This PR updates the `*ByTemplate` insights queries used for generating
Prometheus metrics to behave the same way as the new rollup query and the
rewritten insights queries that use the rolled-up data.
---
 coderd/database/dbmem/dbmem.go                |   2 +-
 coderd/database/querier.go                    |   4 +
 coderd/database/queries.sql.go                | 185 ++++++++-----
 coderd/database/queries/insights.sql          | 174 ++++++++-----
 .../insights/metricscollector.go              |   2 +-
 .../insights/metricscollector_test.go         | 246 +++++++-----------
 .../insights/testdata/insights-metrics.json   |   6 +-
 7 files changed, 331 insertions(+), 288 deletions(-)

diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go
index 366bf32491..abf99d7fbb 100644
--- a/coderd/database/dbmem/dbmem.go
+++ b/coderd/database/dbmem/dbmem.go
@@ -3280,7 +3280,7 @@ func (q *FakeQuerier) GetTemplateAppInsightsByTemplate(ctx context.Context, arg
 	for _, usageKey := range usageKeys {
 		r := database.GetTemplateAppInsightsByTemplateRow{
 			TemplateID:  usageKey.TemplateID,
-			DisplayName: sql.NullString{String: usageKey.DisplayName, Valid: true},
+			DisplayName: usageKey.DisplayName,
 			SlugOrPort:  usageKey.Slug,
 		}
 		for _, mUserUsage := range usageByTemplateAppUser[usageKey] {
diff --git a/coderd/database/querier.go b/coderd/database/querier.go
index 929af475eb..bf1a1909fe 100644
--- a/coderd/database/querier.go
+++ b/coderd/database/querier.go
@@ -173,6 +173,8 @@ type sqlcQuerier interface {
 	// timeframe. The result can be filtered on template_ids, meaning only user data
 	// from workspaces based on those templates will be included.
 	GetTemplateAppInsights(ctx context.Context, arg GetTemplateAppInsightsParams) ([]GetTemplateAppInsightsRow, error)
+	// GetTemplateAppInsightsByTemplate is used for Prometheus metrics. Keep
+	// in sync with GetTemplateAppInsights and UpsertTemplateUsageStats.
 	GetTemplateAppInsightsByTemplate(ctx context.Context, arg GetTemplateAppInsightsByTemplateParams) ([]GetTemplateAppInsightsByTemplateRow, error)
 	GetTemplateAverageBuildTime(ctx context.Context, arg GetTemplateAverageBuildTimeParams) (GetTemplateAverageBuildTimeRow, error)
 	GetTemplateByID(ctx context.Context, id uuid.UUID) (Template, error)
@@ -193,6 +195,8 @@ type sqlcQuerier interface {
 	// that interval will be shorter than a full one. If there is no data for a selected
 	// interval/template, it will be included in the results with 0 active users.
 	GetTemplateInsightsByInterval(ctx context.Context, arg GetTemplateInsightsByIntervalParams) ([]GetTemplateInsightsByIntervalRow, error)
+	// GetTemplateInsightsByTemplate is used for Prometheus metrics. Keep
+	// in sync with GetTemplateInsights and UpsertTemplateUsageStats.
GetTemplateInsightsByTemplate(ctx context.Context, arg GetTemplateInsightsByTemplateParams) ([]GetTemplateInsightsByTemplateRow, error) // GetTemplateParameterInsights does for each template in a given timeframe, // look for the latest workspace build (for every workspace) that has been diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index fb76533d1a..72a75f53b3 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1840,51 +1840,67 @@ func (q *sqlQuerier) GetTemplateAppInsights(ctx context.Context, arg GetTemplate } const getTemplateAppInsightsByTemplate = `-- name: GetTemplateAppInsightsByTemplate :many -WITH app_stats_by_user_and_agent AS ( - SELECT - s.start_time, - 60 as seconds, - w.template_id, - was.user_id, - was.agent_id, - was.slug_or_port, - wa.display_name, - (wa.slug IS NOT NULL)::boolean AS is_app - FROM workspace_app_stats was - JOIN workspaces w ON ( - w.id = was.workspace_id +WITH + -- This CTE is used to explode app usage into minute buckets, then + -- flatten the users app usage within the template so that usage in + -- multiple workspaces under one template is only counted once for + -- every minute. + app_insights AS ( + SELECT + w.template_id, + was.user_id, + -- Both app stats and agent stats track web terminal usage, but + -- by different means. The app stats value should be more + -- accurate so we don't want to discard it just yet. + CASE + WHEN was.access_method = 'terminal' + THEN '[terminal]' -- Unique name, app names can't contain brackets. + ELSE was.slug_or_port + END::text AS app_name, + COALESCE(wa.display_name, '') AS display_name, + (wa.slug IS NOT NULL)::boolean AS is_app, + COUNT(DISTINCT s.minute_bucket) AS app_minutes + FROM + workspace_app_stats AS was + JOIN + workspaces AS w + ON + w.id = was.workspace_id + -- We do a left join here because we want to include user IDs that have used + -- e.g. ports when counting active users. + LEFT JOIN + workspace_apps wa + ON + wa.agent_id = was.agent_id + AND wa.slug = was.slug_or_port + -- Generate a series of minute buckets for each session for computing the + -- mintes/bucket. + CROSS JOIN + generate_series( + date_trunc('minute', was.session_started_at), + -- Subtract 1 microsecond to avoid creating an extra series. + date_trunc('minute', was.session_ended_at - '1 microsecond'::interval), + '1 minute'::interval + ) AS s(minute_bucket) + WHERE + s.minute_bucket >= $1::timestamptz + AND s.minute_bucket < $2::timestamptz + GROUP BY + w.template_id, was.user_id, was.access_method, was.slug_or_port, wa.display_name, wa.slug ) - -- We do a left join here because we want to include user IDs that have used - -- e.g. ports when counting active users. - LEFT JOIN workspace_apps wa ON ( - wa.agent_id = was.agent_id - AND wa.slug = was.slug_or_port - ) - -- This table contains both 1 minute entries and >1 minute entries, - -- to calculate this with our uniqueness constraints, we generate series - -- for the longer intervals. - CROSS JOIN LATERAL generate_series( - date_trunc('minute', was.session_started_at), - -- Subtract 1 microsecond to avoid creating an extra series. - date_trunc('minute', was.session_ended_at - '1 microsecond'::interval), - '1 minute'::interval - ) s(start_time) - WHERE - s.start_time >= $1::timestamptz - -- Subtract one minute because the series only contains the start time. 
- AND s.start_time < ($2::timestamptz) - '1 minute'::interval - GROUP BY s.start_time, w.template_id, was.user_id, was.agent_id, was.slug_or_port, wa.display_name, wa.slug -) SELECT template_id, - display_name, - slug_or_port, - COALESCE(COUNT(DISTINCT user_id))::bigint AS active_users, - SUM(seconds) AS usage_seconds -FROM app_stats_by_user_and_agent -WHERE is_app IS TRUE -GROUP BY template_id, display_name, slug_or_port + app_name AS slug_or_port, + display_name AS display_name, + COUNT(DISTINCT user_id)::bigint AS active_users, + (SUM(app_minutes) * 60)::bigint AS usage_seconds +FROM + app_insights +WHERE + is_app IS TRUE +GROUP BY + template_id, slug_or_port, display_name ` type GetTemplateAppInsightsByTemplateParams struct { @@ -1893,13 +1909,15 @@ type GetTemplateAppInsightsByTemplateParams struct { } type GetTemplateAppInsightsByTemplateRow struct { - TemplateID uuid.UUID `db:"template_id" json:"template_id"` - DisplayName sql.NullString `db:"display_name" json:"display_name"` - SlugOrPort string `db:"slug_or_port" json:"slug_or_port"` - ActiveUsers int64 `db:"active_users" json:"active_users"` - UsageSeconds int64 `db:"usage_seconds" json:"usage_seconds"` + TemplateID uuid.UUID `db:"template_id" json:"template_id"` + SlugOrPort string `db:"slug_or_port" json:"slug_or_port"` + DisplayName string `db:"display_name" json:"display_name"` + ActiveUsers int64 `db:"active_users" json:"active_users"` + UsageSeconds int64 `db:"usage_seconds" json:"usage_seconds"` } +// GetTemplateAppInsightsByTemplate is used for Prometheus metrics. Keep +// in sync with GetTemplateAppInsights and UpsertTemplateUsageStats. func (q *sqlQuerier) GetTemplateAppInsightsByTemplate(ctx context.Context, arg GetTemplateAppInsightsByTemplateParams) ([]GetTemplateAppInsightsByTemplateRow, error) { rows, err := q.db.QueryContext(ctx, getTemplateAppInsightsByTemplate, arg.StartTime, arg.EndTime) if err != nil { @@ -1911,8 +1929,8 @@ func (q *sqlQuerier) GetTemplateAppInsightsByTemplate(ctx context.Context, arg G var i GetTemplateAppInsightsByTemplateRow if err := rows.Scan( &i.TemplateID, - &i.DisplayName, &i.SlugOrPort, + &i.DisplayName, &i.ActiveUsers, &i.UsageSeconds, ); err != nil { @@ -2120,32 +2138,57 @@ func (q *sqlQuerier) GetTemplateInsightsByInterval(ctx context.Context, arg GetT } const getTemplateInsightsByTemplate = `-- name: GetTemplateInsightsByTemplate :many -WITH agent_stats_by_interval_and_user AS ( - SELECT - date_trunc('minute', was.created_at) AS created_at_trunc, - was.template_id, - was.user_id, - CASE WHEN SUM(was.session_count_vscode) > 0 THEN 60 ELSE 0 END AS usage_vscode_seconds, - CASE WHEN SUM(was.session_count_jetbrains) > 0 THEN 60 ELSE 0 END AS usage_jetbrains_seconds, - CASE WHEN SUM(was.session_count_reconnecting_pty) > 0 THEN 60 ELSE 0 END AS usage_reconnecting_pty_seconds, - CASE WHEN SUM(was.session_count_ssh) > 0 THEN 60 ELSE 0 END AS usage_ssh_seconds - FROM workspace_agent_stats was - WHERE - was.created_at >= $1::timestamptz - AND was.created_at < $2::timestamptz - AND was.connection_count > 0 - GROUP BY created_at_trunc, was.template_id, was.user_id -) +WITH + -- This CTE is used to truncate agent usage into minute buckets, then + -- flatten the users agent usage within the template so that usage in + -- multiple workspaces under one template is only counted once for + -- every minute (per user). 
+ insights AS ( + SELECT + template_id, + user_id, + COUNT(DISTINCT CASE WHEN session_count_ssh > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS ssh_mins, + -- TODO(mafredri): Enable when we have the column. + -- COUNT(DISTINCT CASE WHEN session_count_sftp > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS sftp_mins, + COUNT(DISTINCT CASE WHEN session_count_reconnecting_pty > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS reconnecting_pty_mins, + COUNT(DISTINCT CASE WHEN session_count_vscode > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS vscode_mins, + COUNT(DISTINCT CASE WHEN session_count_jetbrains > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS jetbrains_mins, + -- NOTE(mafredri): The agent stats are currently very unreliable, and + -- sometimes the connections are missing, even during active sessions. + -- Since we can't fully rely on this, we check for "any connection + -- within this bucket". A better solution here would be preferable. + MAX(connection_count) > 0 AS has_connection + FROM + workspace_agent_stats + WHERE + created_at >= $1::timestamptz + AND created_at < $2::timestamptz + -- Inclusion criteria to filter out empty results. + AND ( + session_count_ssh > 0 + -- TODO(mafredri): Enable when we have the column. + -- OR session_count_sftp > 0 + OR session_count_reconnecting_pty > 0 + OR session_count_vscode > 0 + OR session_count_jetbrains > 0 + ) + GROUP BY + template_id, user_id + ) SELECT template_id, - COALESCE(COUNT(DISTINCT user_id))::bigint AS active_users, - COALESCE(SUM(usage_vscode_seconds), 0)::bigint AS usage_vscode_seconds, - COALESCE(SUM(usage_jetbrains_seconds), 0)::bigint AS usage_jetbrains_seconds, - COALESCE(SUM(usage_reconnecting_pty_seconds), 0)::bigint AS usage_reconnecting_pty_seconds, - COALESCE(SUM(usage_ssh_seconds), 0)::bigint AS usage_ssh_seconds -FROM agent_stats_by_interval_and_user -GROUP BY template_id + COUNT(DISTINCT user_id)::bigint AS active_users, + (SUM(vscode_mins) * 60)::bigint AS usage_vscode_seconds, + (SUM(jetbrains_mins) * 60)::bigint AS usage_jetbrains_seconds, + (SUM(reconnecting_pty_mins) * 60)::bigint AS usage_reconnecting_pty_seconds, + (SUM(ssh_mins) * 60)::bigint AS usage_ssh_seconds +FROM + insights +WHERE + has_connection +GROUP BY + template_id ` type GetTemplateInsightsByTemplateParams struct { @@ -2162,6 +2205,8 @@ type GetTemplateInsightsByTemplateRow struct { UsageSshSeconds int64 `db:"usage_ssh_seconds" json:"usage_ssh_seconds"` } +// GetTemplateInsightsByTemplate is used for Prometheus metrics. Keep +// in sync with GetTemplateInsights and UpsertTemplateUsageStats. 
func (q *sqlQuerier) GetTemplateInsightsByTemplate(ctx context.Context, arg GetTemplateInsightsByTemplateParams) ([]GetTemplateInsightsByTemplateRow, error) { rows, err := q.db.QueryContext(ctx, getTemplateInsightsByTemplate, arg.StartTime, arg.EndTime) if err != nil { diff --git a/coderd/database/queries/insights.sql b/coderd/database/queries/insights.sql index 8491cdc7c4..08960afcb4 100644 --- a/coderd/database/queries/insights.sql +++ b/coderd/database/queries/insights.sql @@ -145,32 +145,59 @@ FROM insights; -- name: GetTemplateInsightsByTemplate :many -WITH agent_stats_by_interval_and_user AS ( - SELECT - date_trunc('minute', was.created_at) AS created_at_trunc, - was.template_id, - was.user_id, - CASE WHEN SUM(was.session_count_vscode) > 0 THEN 60 ELSE 0 END AS usage_vscode_seconds, - CASE WHEN SUM(was.session_count_jetbrains) > 0 THEN 60 ELSE 0 END AS usage_jetbrains_seconds, - CASE WHEN SUM(was.session_count_reconnecting_pty) > 0 THEN 60 ELSE 0 END AS usage_reconnecting_pty_seconds, - CASE WHEN SUM(was.session_count_ssh) > 0 THEN 60 ELSE 0 END AS usage_ssh_seconds - FROM workspace_agent_stats was - WHERE - was.created_at >= @start_time::timestamptz - AND was.created_at < @end_time::timestamptz - AND was.connection_count > 0 - GROUP BY created_at_trunc, was.template_id, was.user_id -) +-- GetTemplateInsightsByTemplate is used for Prometheus metrics. Keep +-- in sync with GetTemplateInsights and UpsertTemplateUsageStats. +WITH + -- This CTE is used to truncate agent usage into minute buckets, then + -- flatten the users agent usage within the template so that usage in + -- multiple workspaces under one template is only counted once for + -- every minute (per user). + insights AS ( + SELECT + template_id, + user_id, + COUNT(DISTINCT CASE WHEN session_count_ssh > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS ssh_mins, + -- TODO(mafredri): Enable when we have the column. + -- COUNT(DISTINCT CASE WHEN session_count_sftp > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS sftp_mins, + COUNT(DISTINCT CASE WHEN session_count_reconnecting_pty > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS reconnecting_pty_mins, + COUNT(DISTINCT CASE WHEN session_count_vscode > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS vscode_mins, + COUNT(DISTINCT CASE WHEN session_count_jetbrains > 0 THEN date_trunc('minute', created_at) ELSE NULL END) AS jetbrains_mins, + -- NOTE(mafredri): The agent stats are currently very unreliable, and + -- sometimes the connections are missing, even during active sessions. + -- Since we can't fully rely on this, we check for "any connection + -- within this bucket". A better solution here would be preferable. + MAX(connection_count) > 0 AS has_connection + FROM + workspace_agent_stats + WHERE + created_at >= @start_time::timestamptz + AND created_at < @end_time::timestamptz + -- Inclusion criteria to filter out empty results. + AND ( + session_count_ssh > 0 + -- TODO(mafredri): Enable when we have the column. 
+ -- OR session_count_sftp > 0 + OR session_count_reconnecting_pty > 0 + OR session_count_vscode > 0 + OR session_count_jetbrains > 0 + ) + GROUP BY + template_id, user_id + ) SELECT template_id, - COALESCE(COUNT(DISTINCT user_id))::bigint AS active_users, - COALESCE(SUM(usage_vscode_seconds), 0)::bigint AS usage_vscode_seconds, - COALESCE(SUM(usage_jetbrains_seconds), 0)::bigint AS usage_jetbrains_seconds, - COALESCE(SUM(usage_reconnecting_pty_seconds), 0)::bigint AS usage_reconnecting_pty_seconds, - COALESCE(SUM(usage_ssh_seconds), 0)::bigint AS usage_ssh_seconds -FROM agent_stats_by_interval_and_user -GROUP BY template_id; + COUNT(DISTINCT user_id)::bigint AS active_users, + (SUM(vscode_mins) * 60)::bigint AS usage_vscode_seconds, + (SUM(jetbrains_mins) * 60)::bigint AS usage_jetbrains_seconds, + (SUM(reconnecting_pty_mins) * 60)::bigint AS usage_reconnecting_pty_seconds, + (SUM(ssh_mins) * 60)::bigint AS usage_ssh_seconds +FROM + insights +WHERE + has_connection +GROUP BY + template_id; -- name: GetTemplateAppInsights :many -- GetTemplateAppInsights returns the aggregate usage of each app in a given @@ -267,51 +294,70 @@ GROUP BY t.template_ids, ai.app_name, ai.display_name, ai.icon, ai.is_app; -- name: GetTemplateAppInsightsByTemplate :many -WITH app_stats_by_user_and_agent AS ( - SELECT - s.start_time, - 60 as seconds, - w.template_id, - was.user_id, - was.agent_id, - was.slug_or_port, - wa.display_name, - (wa.slug IS NOT NULL)::boolean AS is_app - FROM workspace_app_stats was - JOIN workspaces w ON ( - w.id = was.workspace_id +-- GetTemplateAppInsightsByTemplate is used for Prometheus metrics. Keep +-- in sync with GetTemplateAppInsights and UpsertTemplateUsageStats. +WITH + -- This CTE is used to explode app usage into minute buckets, then + -- flatten the users app usage within the template so that usage in + -- multiple workspaces under one template is only counted once for + -- every minute. + app_insights AS ( + SELECT + w.template_id, + was.user_id, + -- Both app stats and agent stats track web terminal usage, but + -- by different means. The app stats value should be more + -- accurate so we don't want to discard it just yet. + CASE + WHEN was.access_method = 'terminal' + THEN '[terminal]' -- Unique name, app names can't contain brackets. + ELSE was.slug_or_port + END::text AS app_name, + COALESCE(wa.display_name, '') AS display_name, + (wa.slug IS NOT NULL)::boolean AS is_app, + COUNT(DISTINCT s.minute_bucket) AS app_minutes + FROM + workspace_app_stats AS was + JOIN + workspaces AS w + ON + w.id = was.workspace_id + -- We do a left join here because we want to include user IDs that have used + -- e.g. ports when counting active users. + LEFT JOIN + workspace_apps wa + ON + wa.agent_id = was.agent_id + AND wa.slug = was.slug_or_port + -- Generate a series of minute buckets for each session for computing the + -- mintes/bucket. + CROSS JOIN + generate_series( + date_trunc('minute', was.session_started_at), + -- Subtract 1 microsecond to avoid creating an extra series. + date_trunc('minute', was.session_ended_at - '1 microsecond'::interval), + '1 minute'::interval + ) AS s(minute_bucket) + WHERE + s.minute_bucket >= @start_time::timestamptz + AND s.minute_bucket < @end_time::timestamptz + GROUP BY + w.template_id, was.user_id, was.access_method, was.slug_or_port, wa.display_name, wa.slug ) - -- We do a left join here because we want to include user IDs that have used - -- e.g. ports when counting active users. 
- LEFT JOIN workspace_apps wa ON ( - wa.agent_id = was.agent_id - AND wa.slug = was.slug_or_port - ) - -- This table contains both 1 minute entries and >1 minute entries, - -- to calculate this with our uniqueness constraints, we generate series - -- for the longer intervals. - CROSS JOIN LATERAL generate_series( - date_trunc('minute', was.session_started_at), - -- Subtract 1 microsecond to avoid creating an extra series. - date_trunc('minute', was.session_ended_at - '1 microsecond'::interval), - '1 minute'::interval - ) s(start_time) - WHERE - s.start_time >= @start_time::timestamptz - -- Subtract one minute because the series only contains the start time. - AND s.start_time < (@end_time::timestamptz) - '1 minute'::interval - GROUP BY s.start_time, w.template_id, was.user_id, was.agent_id, was.slug_or_port, wa.display_name, wa.slug -) SELECT template_id, - display_name, - slug_or_port, - COALESCE(COUNT(DISTINCT user_id))::bigint AS active_users, - SUM(seconds) AS usage_seconds -FROM app_stats_by_user_and_agent -WHERE is_app IS TRUE -GROUP BY template_id, display_name, slug_or_port; + app_name AS slug_or_port, + display_name AS display_name, + COUNT(DISTINCT user_id)::bigint AS active_users, + (SUM(app_minutes) * 60)::bigint AS usage_seconds +FROM + app_insights +WHERE + is_app IS TRUE +GROUP BY + template_id, slug_or_port, display_name; + -- name: GetTemplateInsightsByInterval :many -- GetTemplateInsightsByInterval returns all intervals between start and end diff --git a/coderd/prometheusmetrics/insights/metricscollector.go b/coderd/prometheusmetrics/insights/metricscollector.go index 1534d09cd4..7dcf6025f2 100644 --- a/coderd/prometheusmetrics/insights/metricscollector.go +++ b/coderd/prometheusmetrics/insights/metricscollector.go @@ -194,7 +194,7 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { // Custom apps for _, appRow := range data.apps { metricsCh <- prometheus.MustNewConstMetric(applicationsUsageSecondsDesc, prometheus.GaugeValue, float64(appRow.UsageSeconds), data.templateNames[appRow.TemplateID], - appRow.DisplayName.String, appRow.SlugOrPort) + appRow.DisplayName, appRow.SlugOrPort) } // Built-in apps diff --git a/coderd/prometheusmetrics/insights/metricscollector_test.go b/coderd/prometheusmetrics/insights/metricscollector_test.go index 47591815a5..598c154db0 100644 --- a/coderd/prometheusmetrics/insights/metricscollector_test.go +++ b/coderd/prometheusmetrics/insights/metricscollector_test.go @@ -3,12 +3,13 @@ package insights_test import ( "context" "encoding/json" - "io" + "fmt" "os" "strings" "testing" "time" + "github.com/google/go-cmp/cmp" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" io_prometheus_client "github.com/prometheus/client_model/go" @@ -17,17 +18,14 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/v2/agent" - "github.com/coder/coder/v2/agent/agenttest" "github.com/coder/coder/v2/coderd/coderdtest" + "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/prometheusmetrics/insights" "github.com/coder/coder/v2/coderd/workspaceapps" - "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/agentsdk" - "github.com/coder/coder/v2/provisioner/echo" - "github.com/coder/coder/v2/provisionersdk/proto" "github.com/coder/coder/v2/testutil" ) @@ -35,7 +33,7 @@ func 
TestCollectInsights(t *testing.T) { t.Parallel() logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) - db, ps := dbtestutil.NewDB(t) + db, ps := dbtestutil.NewDB(t, dbtestutil.WithDumpOnFailure()) options := &coderdtest.Options{ IncludeProvisionerDaemon: true, @@ -43,8 +41,10 @@ func TestCollectInsights(t *testing.T) { Database: db, Pubsub: ps, } - client := coderdtest.New(t, options) - client.SetLogger(logger.Named("client").Leveled(slog.LevelDebug)) + ownerClient := coderdtest.New(t, options) + ownerClient.SetLogger(logger.Named("ownerClient").Leveled(slog.LevelDebug)) + owner := coderdtest.CreateFirstUser(t, ownerClient) + client, user := coderdtest.CreateAnotherUser(t, ownerClient, owner.OrganizationID) // Given // Initialize metrics collector @@ -54,49 +54,55 @@ func TestCollectInsights(t *testing.T) { registry := prometheus.NewRegistry() registry.Register(mc) - // Create two users, one that will appear in the report and another that - // won't (due to not having/using a workspace). - user := coderdtest.CreateFirstUser(t, client) - _, _ = coderdtest.CreateAnotherUser(t, client, user.OrganizationID) - authToken := uuid.NewString() - version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ - Parse: echo.ParseComplete, - ProvisionPlan: provisionPlanWithParameters(), - ProvisionApply: provisionApplyWithAgentAndApp(authToken), - }) - template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) { - ctr.Name = "golden-template" - }) - require.Empty(t, template.BuildTimeStats[codersdk.WorkspaceTransitionStart]) - - coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) - workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { - cwr.RichParameterValues = []codersdk.WorkspaceBuildParameter{ - {Name: "first_parameter", Value: "Foobar"}, - {Name: "second_parameter", Value: "true"}, - {Name: "third_parameter", Value: "789"}, - } - }) - coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + var ( + orgID = owner.OrganizationID + tpl = dbgen.Template(t, db, database.Template{OrganizationID: orgID, CreatedBy: user.ID, Name: "golden-template"}) + ver = dbgen.TemplateVersion(t, db, database.TemplateVersion{OrganizationID: orgID, CreatedBy: user.ID, TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true}}) + param1 = dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{TemplateVersionID: ver.ID, Name: "first_parameter"}) + param2 = dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{TemplateVersionID: ver.ID, Name: "second_parameter", Type: "bool"}) + param3 = dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{TemplateVersionID: ver.ID, Name: "third_parameter", Type: "number"}) + workspace1 = dbgen.Workspace(t, db, database.Workspace{OrganizationID: orgID, TemplateID: tpl.ID, OwnerID: user.ID}) + workspace2 = dbgen.Workspace(t, db, database.Workspace{OrganizationID: orgID, TemplateID: tpl.ID, OwnerID: user.ID}) + job1 = dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: orgID}) + job2 = dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: orgID}) + build1 = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{TemplateVersionID: ver.ID, WorkspaceID: workspace1.ID, JobID: job1.ID}) + build2 = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{TemplateVersionID: ver.ID, WorkspaceID: 
workspace2.ID, JobID: job2.ID}) + res1 = dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: build1.JobID}) + res2 = dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: build2.JobID}) + agent1 = dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ResourceID: res1.ID}) + agent2 = dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ResourceID: res2.ID}) + app1 = dbgen.WorkspaceApp(t, db, database.WorkspaceApp{AgentID: agent1.ID, Slug: "golden-slug", DisplayName: "Golden Slug"}) + app2 = dbgen.WorkspaceApp(t, db, database.WorkspaceApp{AgentID: agent2.ID, Slug: "golden-slug", DisplayName: "Golden Slug"}) + _ = dbgen.WorkspaceBuildParameters(t, db, []database.WorkspaceBuildParameter{ + {WorkspaceBuildID: build1.ID, Name: param1.Name, Value: "Foobar"}, + {WorkspaceBuildID: build1.ID, Name: param2.Name, Value: "true"}, + {WorkspaceBuildID: build1.ID, Name: param3.Name, Value: "789"}, + }) + _ = dbgen.WorkspaceBuildParameters(t, db, []database.WorkspaceBuildParameter{ + {WorkspaceBuildID: build2.ID, Name: param1.Name, Value: "Baz"}, + {WorkspaceBuildID: build2.ID, Name: param2.Name, Value: "true"}, + {WorkspaceBuildID: build2.ID, Name: param3.Name, Value: "999"}, + }) + ) // Start an agent so that we can generate stats. - agentClient := agentsdk.New(client.URL) - agentClient.SetSessionToken(authToken) - agentClient.SDK.SetLogger(logger.Leveled(slog.LevelDebug).Named("agent")) - - _ = agenttest.New(t, client.URL, authToken, func(o *agent.Options) { - o.Client = agentClient - }) - resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) + var agentClients []*agentsdk.Client + for i, agent := range []database.WorkspaceAgent{agent1, agent2} { + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(agent.AuthToken.String()) + agentClient.SDK.SetLogger(logger.Leveled(slog.LevelDebug).Named(fmt.Sprintf("agent%d", i+1))) + agentClients = append(agentClients, agentClient) + } // Fake app stats - _, err = agentClient.PostStats(context.Background(), &agentsdk.Stats{ - // ConnectionsByProto can't be nil, otherwise stats get rejected - ConnectionsByProto: map[string]int64{"TCP": 1}, + _, err = agentClients[0].PostStats(context.Background(), &agentsdk.Stats{ // ConnectionCount must be positive as database query ignores stats with no active connections at the time frame - ConnectionCount: 74, - // SessionCountJetBrains, SessionCountVSCode must be positive, but the exact value is ignored. + ConnectionsByProto: map[string]int64{"TCP": 1}, + ConnectionCount: 1, + ConnectionMedianLatencyMS: 15, + // Session counts must be positive, but the exact value is ignored. // Database query approximates it to 60s of usage. + SessionCountSSH: 99, SessionCountJetBrains: 47, SessionCountVSCode: 34, }) @@ -104,17 +110,42 @@ func TestCollectInsights(t *testing.T) { // Fake app usage reporter := workspaceapps.NewStatsDBReporter(db, workspaceapps.DefaultStatsDBReporterBatchSize) + refTime := time.Now().Add(-3 * time.Minute).Truncate(time.Minute) //nolint:gocritic // This is a test. 
err = reporter.Report(dbauthz.AsSystemRestricted(context.Background()), []workspaceapps.StatsReport{ { - UserID: user.UserID, - WorkspaceID: workspace.ID, - AgentID: resources[0].Agents[0].ID, - AccessMethod: "terminal", - SlugOrPort: "golden-slug", + UserID: user.ID, + WorkspaceID: workspace1.ID, + AgentID: agent1.ID, + AccessMethod: "path", + SlugOrPort: app1.Slug, SessionID: uuid.New(), - SessionStartedAt: time.Now().Add(-3 * time.Minute), - SessionEndedAt: time.Now().Add(-time.Minute).Add(-time.Second), + SessionStartedAt: refTime, + SessionEndedAt: refTime.Add(2 * time.Minute).Add(-time.Second), + Requests: 1, + }, + // Same usage on differrent workspace/agent in same template, + // should not be counted as extra. + { + UserID: user.ID, + WorkspaceID: workspace2.ID, + AgentID: agent2.ID, + AccessMethod: "path", + SlugOrPort: app2.Slug, + SessionID: uuid.New(), + SessionStartedAt: refTime, + SessionEndedAt: refTime.Add(2 * time.Minute).Add(-time.Second), + Requests: 1, + }, + { + UserID: user.ID, + WorkspaceID: workspace2.ID, + AgentID: agent2.ID, + AccessMethod: "path", + SlugOrPort: app2.Slug, + SessionID: uuid.New(), + SessionStartedAt: refTime.Add(2 * time.Minute), + SessionEndedAt: refTime.Add(2 * time.Minute).Add(30 * time.Second), Requests: 1, }, }) @@ -128,34 +159,6 @@ func TestCollectInsights(t *testing.T) { require.NoError(t, err) defer closeFunc() - // Connect to the agent to generate usage/latency stats. - conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ - Logger: logger.Named("client"), - }) - require.NoError(t, err) - defer conn.Close() - - sshConn, err := conn.SSHClient(ctx) - require.NoError(t, err) - defer sshConn.Close() - - sess, err := sshConn.NewSession() - require.NoError(t, err) - defer sess.Close() - - r, w := io.Pipe() - defer r.Close() - defer w.Close() - sess.Stdin = r - sess.Stdout = io.Discard - err = sess.Start("cat") - require.NoError(t, err) - - defer func() { - _ = sess.Close() - _ = sshConn.Close() - }() - goldenFile, err := os.ReadFile("testdata/insights-metrics.json") require.NoError(t, err) golden := map[string]int{} @@ -163,13 +166,16 @@ func TestCollectInsights(t *testing.T) { require.NoError(t, err) collected := map[string]int{} - assert.Eventuallyf(t, func() bool { + ok := assert.Eventuallyf(t, func() bool { // When metrics, err := registry.Gather() - require.NoError(t, err) + if !assert.NoError(t, err) { + return false + } // Then for _, metric := range metrics { + t.Logf("metric: %s: %#v", metric.GetName(), metric) switch metric.GetName() { case "coderd_insights_applications_usage_seconds", "coderd_insights_templates_active_users", "coderd_insights_parameters": for _, m := range metric.Metric { @@ -180,12 +186,16 @@ func TestCollectInsights(t *testing.T) { collected[key] = int(m.Gauge.GetValue()) } default: - require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + assert.Failf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - return insightsMetricsAreEqual(golden, collected) - }, testutil.WaitMedium, testutil.IntervalFast, "template insights are inconsistent with golden files, got: %v", collected) + return assert.ObjectsAreEqualValues(golden, collected) + }, testutil.WaitMedium, testutil.IntervalFast, "template insights are inconsistent with golden files") + if !ok { + diff := cmp.Diff(golden, collected) + assert.Empty(t, diff, "template insights are inconsistent with golden files (-golden +collected)") + } } func metricLabelAsString(m 
*io_prometheus_client.Metric) string { @@ -195,67 +205,3 @@ func metricLabelAsString(m *io_prometheus_client.Metric) string { } return strings.Join(labels, ",") } - -func provisionPlanWithParameters() []*proto.Response { - return []*proto.Response{ - { - Type: &proto.Response_Plan{ - Plan: &proto.PlanComplete{ - Parameters: []*proto.RichParameter{ - {Name: "first_parameter", Type: "string", Mutable: true}, - {Name: "second_parameter", Type: "bool", Mutable: true}, - {Name: "third_parameter", Type: "number", Mutable: true}, - }, - }, - }, - }, - } -} - -func provisionApplyWithAgentAndApp(authToken string) []*proto.Response { - return []*proto.Response{{ - Type: &proto.Response_Apply{ - Apply: &proto.ApplyComplete{ - Resources: []*proto.Resource{{ - Name: "example", - Type: "aws_instance", - Agents: []*proto.Agent{{ - Id: uuid.NewString(), - Name: "example", - Auth: &proto.Agent_Token{ - Token: authToken, - }, - Apps: []*proto.App{ - { - Slug: "golden-slug", - DisplayName: "Golden Slug", - SharingLevel: proto.AppSharingLevel_OWNER, - Url: "http://localhost:1234", - }, - }, - }}, - }}, - }, - }, - }} -} - -// insightsMetricsAreEqual patches collected metrics to be used -// in comparison with golden metrics using `assert.ObjectsAreEqualValues`. -// Collected metrics must be patched as sometimes they may slip -// due to timestamp truncation. -// See: -// https://github.com/coder/coder/blob/92ef0baff3b632c52c2335aae1d643a3cc49e26a/coderd/database/dbmem/dbmem.go#L2463 -// https://github.com/coder/coder/blob/9b6433e3a7c788b7e87b7d8f539ea111957a0cf1/coderd/database/queries/insights.sql#L246 -func insightsMetricsAreEqual(golden, collected map[string]int) bool { - greaterOrEqualKeys := []string{ - "coderd_insights_applications_usage_seconds[application_name=Golden Slug,slug=golden-slug,template_name=golden-template]", - "coderd_insights_applications_usage_seconds[application_name=SSH,slug=,template_name=golden-template]", - } - for _, key := range greaterOrEqualKeys { - if v, ok := collected[key]; ok && v > golden[key] { - collected[key] = golden[key] - } - } - return assert.ObjectsAreEqualValues(golden, collected) -} diff --git a/coderd/prometheusmetrics/insights/testdata/insights-metrics.json b/coderd/prometheusmetrics/insights/testdata/insights-metrics.json index 13753474e5..e672ed304a 100644 --- a/coderd/prometheusmetrics/insights/testdata/insights-metrics.json +++ b/coderd/prometheusmetrics/insights/testdata/insights-metrics.json @@ -3,9 +3,11 @@ "coderd_insights_applications_usage_seconds[application_name=Visual Studio Code,slug=,template_name=golden-template]": 60, "coderd_insights_applications_usage_seconds[application_name=Web Terminal,slug=,template_name=golden-template]": 0, "coderd_insights_applications_usage_seconds[application_name=SSH,slug=,template_name=golden-template]": 60, - "coderd_insights_applications_usage_seconds[application_name=Golden Slug,slug=golden-slug,template_name=golden-template]": 120, + "coderd_insights_applications_usage_seconds[application_name=Golden Slug,slug=golden-slug,template_name=golden-template]": 180, "coderd_insights_parameters[parameter_name=first_parameter,parameter_type=string,parameter_value=Foobar,template_name=golden-template]": 1, - "coderd_insights_parameters[parameter_name=second_parameter,parameter_type=bool,parameter_value=true,template_name=golden-template]": 1, + "coderd_insights_parameters[parameter_name=first_parameter,parameter_type=string,parameter_value=Baz,template_name=golden-template]": 1, + 
"coderd_insights_parameters[parameter_name=second_parameter,parameter_type=bool,parameter_value=true,template_name=golden-template]": 2, "coderd_insights_parameters[parameter_name=third_parameter,parameter_type=number,parameter_value=789,template_name=golden-template]": 1, + "coderd_insights_parameters[parameter_name=third_parameter,parameter_type=number,parameter_value=999,template_name=golden-template]": 1, "coderd_insights_templates_active_users[template_name=golden-template]": 1 }