diff --git a/cli/server.go b/cli/server.go index 6788639193..e02a891022 100644 --- a/cli/server.go +++ b/cli/server.go @@ -229,13 +229,13 @@ func enablePrometheus( afterCtx(ctx, closeInsightsMetricsCollector) if vals.Prometheus.CollectAgentStats { - closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0) + closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value()) if err != nil { return nil, xerrors.Errorf("register agent stats prometheus metric: %w", err) } afterCtx(ctx, closeAgentStatsFunc) - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0) + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value()) if err != nil { return nil, xerrors.Errorf("can't initialize metrics aggregator: %w", err) } diff --git a/cli/testdata/coder_server_--help.golden b/cli/testdata/coder_server_--help.golden index 632a96a470..3c3c0f4031 100644 --- a/cli/testdata/coder_server_--help.golden +++ b/cli/testdata/coder_server_--help.golden @@ -123,6 +123,11 @@ INTROSPECTION / PROMETHEUS OPTIONS: --prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112) The bind address to serve prometheus metrics. + --prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name) + When collecting agent stats, aggregate metrics by a given set of + comma-separated labels to reduce cardinality. Accepted values are + agent_name, template_name, username, workspace_name. + --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS Collect agent stats (may increase charges for metrics storage). diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 8996387ff4..14ad5df67f 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -188,6 +188,15 @@ introspection: # Collect agent stats (may increase charges for metrics storage). # (default: , type: bool) collect_agent_stats: false + # When collecting agent stats, aggregate metrics by a given set of comma-separated + # labels to reduce cardinality. Accepted values are agent_name, template_name, + # username, workspace_name. + # (default: agent_name,template_name,username,workspace_name, type: string-array) + aggregate_agent_stats_by: + - agent_name + - template_name + - username + - workspace_name # Collect database metrics (may increase charges for metrics storage). # (default: false, type: bool) collect_db_metrics: false diff --git a/coderd/agentmetrics/labels.go b/coderd/agentmetrics/labels.go new file mode 100644 index 0000000000..7257f1bb61 --- /dev/null +++ b/coderd/agentmetrics/labels.go @@ -0,0 +1,38 @@ +package agentmetrics + +import ( + "strings" + + "golang.org/x/xerrors" +) + +const ( + LabelAgentName = "agent_name" + LabelTemplateName = "template_name" + LabelUsername = "username" + LabelWorkspaceName = "workspace_name" +) + +var ( + LabelAll = []string{LabelAgentName, LabelTemplateName, LabelUsername, LabelWorkspaceName} + LabelAgentStats = []string{LabelAgentName, LabelUsername, LabelWorkspaceName} +) + +// ValidateAggregationLabels ensures a given set of labels are valid aggregation labels. +func ValidateAggregationLabels(labels []string) error { + acceptable := LabelAll + + seen := make(map[string]any, len(acceptable)) + for _, label := range acceptable { + seen[label] = nil + } + + for _, label := range labels { + if _, found := seen[label]; !found { + return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable", + label, strings.Join(acceptable, ", ")) + } + } + + return nil +} diff --git a/coderd/agentmetrics/labels_test.go b/coderd/agentmetrics/labels_test.go new file mode 100644 index 0000000000..b383ca0b25 --- /dev/null +++ b/coderd/agentmetrics/labels_test.go @@ -0,0 +1,57 @@ +package agentmetrics_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/agentmetrics" +) + +func TestValidateAggregationLabels(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + labels []string + expectedErr bool + }{ + { + name: "empty list is valid", + }, + { + name: "single valid entry", + labels: []string{agentmetrics.LabelTemplateName}, + }, + { + name: "multiple valid entries", + labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername}, + }, + { + name: "repeated valid entries are not invalid", + labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername, agentmetrics.LabelUsername, agentmetrics.LabelUsername}, + }, + { + name: "empty entry is invalid", + labels: []string{""}, + expectedErr: true, + }, + { + name: "all valid entries", + labels: agentmetrics.LabelAll, + }, + } + + for _, tc := range tests { + tc := tc + + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + err := agentmetrics.ValidateAggregationLabels(tc.labels) + if tc.expectedErr { + require.Error(t, err) + } + }) + } +} diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index 1ed25f09fc..886c1835ce 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -10952,6 +10952,12 @@ const docTemplate = `{ "address": { "$ref": "#/definitions/clibase.HostPort" }, + "aggregate_agent_stats_by": { + "type": "array", + "items": { + "type": "string" + } + }, "collect_agent_stats": { "type": "boolean" }, diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index fce1864aa8..9d8d121864 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -9860,6 +9860,12 @@ "address": { "$ref": "#/definitions/clibase.HostPort" }, + "aggregate_agent_stats_by": { + "type": "array", + "items": { + "type": "string" + } + }, "collect_agent_stats": { "type": "boolean" }, diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 40ad6c7b2f..44ade677d5 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -8,8 +8,11 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" "golang.org/x/xerrors" + "github.com/coder/coder/v2/coderd/agentmetrics" + "cdr.dev/slog" agentproto "github.com/coder/coder/v2/agent/proto" @@ -43,9 +46,10 @@ type MetricsAggregator struct { collectCh chan (chan []prometheus.Metric) updateCh chan updateRequest - storeSizeGauge prometheus.Gauge - updateHistogram prometheus.Histogram - cleanupHistogram prometheus.Histogram + storeSizeGauge prometheus.Gauge + updateHistogram prometheus.Histogram + cleanupHistogram prometheus.Histogram + aggregateByLabels []string } type updateRequest struct { @@ -68,6 +72,8 @@ type annotatedMetric struct { templateName string expiryDate time.Time + + aggregateByLabels []string } type metricKey struct { @@ -102,13 +108,28 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey { var _ prometheus.Collector = new(MetricsAggregator) func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { - labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) - labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) + var ( + baseLabelNames = am.aggregateByLabels + baseLabelValues []string + extraLabels = am.Labels + ) - labels = append(labels, agentMetricsLabels...) - labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName) + for _, label := range baseLabelNames { + val, err := am.getFieldByLabel(label) + if err != nil { + return nil, err + } - for _, l := range am.Labels { + baseLabelValues = append(baseLabelValues, val) + } + + labels := make([]string, 0, len(baseLabelNames)+len(extraLabels)) + labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels)) + + labels = append(labels, baseLabelNames...) + labelValues = append(labelValues, baseLabelValues...) + + for _, l := range extraLabels { labels = append(labels, l.Name) labelValues = append(labelValues, l.Value) } @@ -118,10 +139,48 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { if err != nil { return nil, err } + return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil } -func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) { +// getFieldByLabel returns the related field value for a given label +func (am *annotatedMetric) getFieldByLabel(label string) (string, error) { + var labelVal string + switch label { + case agentmetrics.LabelWorkspaceName: + labelVal = am.workspaceName + case agentmetrics.LabelTemplateName: + labelVal = am.templateName + case agentmetrics.LabelAgentName: + labelVal = am.agentName + case agentmetrics.LabelUsername: + labelVal = am.username + default: + return "", xerrors.Errorf("unexpected label: %q", label) + } + + return labelVal, nil +} + +func (am *annotatedMetric) shallowCopy() annotatedMetric { + stats := &agentproto.Stats_Metric{ + Name: am.Name, + Type: am.Type, + Value: am.Value, + Labels: am.Labels, + } + + return annotatedMetric{ + Stats_Metric: stats, + username: am.username, + workspaceName: am.workspaceName, + agentName: am.agentName, + templateName: am.templateName, + expiryDate: am.expiryDate, + } +} + +func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) { metricsCleanupInterval := defaultMetricsCleanupInterval if duration > 0 { metricsCleanupInterval = duration @@ -174,9 +233,66 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, storeSizeGauge: storeSizeGauge, updateHistogram: updateHistogram, cleanupHistogram: cleanupHistogram, + + aggregateByLabels: aggregateByLabels, }, nil } +// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels. +type labelAggregator struct { + aggregations map[string]float64 + metrics map[string]annotatedMetric +} + +func newLabelAggregator(size int) *labelAggregator { + return &labelAggregator{ + aggregations: make(map[string]float64, size), + metrics: make(map[string]annotatedMetric, size), + } +} + +func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { + // Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering. + labelSet := make(model.LabelSet, len(labels)) + + for _, label := range labels { + val, err := am.getFieldByLabel(label) + if err != nil { + return err + } + + labelSet[model.LabelName(label)] = model.LabelValue(val) + } + + // Memoize based on the metric name & the unique combination of labels. + key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint()) + + // Aggregate the value based on the key. + a.aggregations[key] += am.Value + + metric, found := a.metrics[key] + if !found { + // Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers. + metric = am.shallowCopy() + } + + // Store the metric. + metric.aggregateByLabels = labels + metric.Value = a.aggregations[key] + + a.metrics[key] = metric + + return nil +} + +func (a *labelAggregator) listMetrics() []annotatedMetric { + var out []annotatedMetric + for _, am := range a.metrics { + out = append(out, am) + } + return out +} + func (ma *MetricsAggregator) Run(ctx context.Context) func() { ctx, cancelFunc := context.WithCancel(ctx) done := make(chan struct{}) @@ -216,8 +332,38 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { case outputCh := <-ma.collectCh: ma.log.Debug(ctx, "collect metrics") + var input []annotatedMetric output := make([]prometheus.Metric, 0, len(ma.store)) - for _, m := range ma.store { + + if len(ma.aggregateByLabels) == 0 { + ma.aggregateByLabels = agentmetrics.LabelAll + } + + // If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. + // This results in higher cardinality, but may be desirable in larger deployments. + // + // Default behavior. + if len(ma.aggregateByLabels) == len(agentmetrics.LabelAll) { + for _, m := range ma.store { + // Aggregate by all available metrics. + m.aggregateByLabels = defaultAgentMetricsLabels + input = append(input, m) + } + } else { + // However, if custom aggregations have been chosen, we need to aggregate the values from the annotated + // metrics because we cannot register multiple metric series with the same labels. + la := newLabelAggregator(len(ma.store)) + + for _, m := range ma.store { + if err := la.aggregate(m, ma.aggregateByLabels); err != nil { + ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err)) + } + } + + input = la.listMetrics() + } + + for _, m := range input { promMetric, err := m.asPrometheus() if err != nil { ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err)) @@ -225,6 +371,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { } output = append(output, promMetric) } + outputCh <- output close(outputCh) case <-cleanupTicker.C: @@ -260,7 +407,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { } -var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel} +var defaultAgentMetricsLabels = []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName} // AgentMetricLabels are the labels used to decorate an agent's metrics. // This list should match the list of labels in agentMetricsLabels. diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index bc17dc9be7..412dfae1e7 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -2,6 +2,7 @@ package prometheusmetrics_test import ( "context" + "fmt" "sort" "strings" "sync/atomic" @@ -14,6 +15,7 @@ import ( "github.com/stretchr/testify/require" "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/agentmetrics" agentproto "github.com/coder/coder/v2/agent/proto" "github.com/coder/coder/v2/coderd/prometheusmetrics" @@ -40,7 +42,7 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { // given registry := prometheus.NewRegistry() - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour) // time.Hour, so metrics won't expire + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, nil) // time.Hour, so metrics won't expire require.NoError(t, err) ctx, cancelFunc := context.WithCancel(context.Background()) @@ -93,54 +95,54 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { } commonLabels := []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, } expected := []*agentproto.Stats_Metric{ {Name: "a_counter_one", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, {Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: -9, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "lizz", Value: "rizz"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: 4, Labels: commonLabels}, {Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 2, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "foobar", Value: "Foobaz"}, {Name: "hello", Value: "world"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 5, Labels: commonLabels}, {Name: "d_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 6, Labels: commonLabels}, {Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 17, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "cat", Value: "do,=g"}, {Name: "hello", Value: "wo,,rld"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 15, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "foobar", Value: "Foo,ba=z"}, {Name: "halo", Value: "wor\\,d=1,e=\\,2"}, {Name: "hello", Value: "wo,,r=d"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "f_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "foobar", Value: "foobaz"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, } @@ -175,6 +177,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a return false } + // ensure stable iteration order + sort.Slice(expected, func(i, j int) bool { + return expected[i].Name < expected[j].Name + }) + sort.Slice(actual, func(i, j int) bool { m1 := prometheusMetricToString(t, actual[i]) m2 := prometheusMetricToString(t, actual[j]) @@ -199,9 +206,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a dtoLabels := asMetricAgentLabels(d.GetLabel()) // dto labels are sorted in alphabetical order. - sort.Slice(e.Labels, func(i, j int) bool { + sortFn := func(i, j int) bool { return e.Labels[i].Name < e.Labels[j].Name - }) + } + sort.Slice(e.Labels, sortFn) + sort.Slice(dtoLabels, sortFn) require.Equal(t, e.Labels, dtoLabels, d.String()) } return true @@ -253,7 +262,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { // given registry := prometheus.NewRegistry() - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond) + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, agentmetrics.LabelAll) require.NoError(t, err) ctx, cancelFunc := context.WithCancel(context.Background()) @@ -291,18 +300,354 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { }, testutil.WaitShort, testutil.IntervalFast) } +func TestLabelsAggregation(t *testing.T) { + t.Parallel() + + type statCollection struct { + labels prometheusmetrics.AgentMetricLabels + metrics []*agentproto.Stats_Metric + } + + commonLabels := []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, + } + + tests := []struct { + name string + given []statCollection + expected []*agentproto.Stats_Metric + aggregateOn []string + }{ + { + name: "label aggregations not specified, keep all (high cardinality, default behavior)", + aggregateOn: agentmetrics.LabelAll, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4, Labels: commonLabels}, + }, + }, + { + // Scenario: 2 users are using the same agent and we've configured the deployment to aggregate on the "agent_name" label. + name: "single label aggregation, aggregating to single metric", + aggregateOn: []string{agentmetrics.LabelAgentName}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We only observed one agent_name value, so all metrics are aggregated to a single series. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + }}, + }, + }, + { + // Scenario: as above, but we're aggregating on two invariant labels. + name: "multiple label aggregation, aggregating to single metric", + aggregateOn: []string{agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We only observed one agent_name & template_name tuple, so all metrics are aggregated to a single series. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, + }}, + }, + }, + { + // Scenario: aggregating on a label which is unique across all metrics. + name: "single label aggregation, aggregating to multiple metrics", + aggregateOn: []string{agentmetrics.LabelUsername}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We observed two unique username values, and therefore we have a metric for each. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelUsername, Value: "user1"}, + }}, + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelUsername, Value: "user2"}, + }}, + }, + }, + { + // Scenario: aggregating on a label which is unique across all metrics, plus two invariant labels. + name: "multiple label aggregation, aggregating to multiple metrics", + aggregateOn: []string{agentmetrics.LabelUsername, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We observed two unique username values, and therefore we have a metric for each. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelUsername, Value: "user1"}, + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, + }}, + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelUsername, Value: "user2"}, + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, + }}, + }, + }, + { + name: "extra labels are retained, even with label aggregations", + aggregateOn: []string{agentmetrics.LabelUsername}, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ + {Name: "lizz", Value: "rizz"}, + }}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelUsername, Value: testUsername}, + }}, + {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ + {Name: "lizz", Value: "rizz"}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + }}, + }, + }, + { + // Both counters and gauges should have all their values summed to produce the correct output. + name: "counters & gauges behave identically", + aggregateOn: []string{agentmetrics.LabelTemplateName}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "username1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 3}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "username2", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 2}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, + }}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, + }}, + }, + }, + { + // Scenario: validation fails and an invalid label is selected for aggregation. + name: "invalid label aggregation", + aggregateOn: []string{"nonsense"}, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + }, + // Nothing will be returned. + expected: []*agentproto.Stats_Metric{}, + }, + { + // Scenario: validation fails and an empty list is given for aggregation. + name: "empty label aggregation list", + aggregateOn: []string{}, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + }, + // Default aggregation will be used. + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, + }, + }, + } + + for _, tc := range tests { + tc := tc + + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + // given + registry := prometheus.NewRegistry() + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, tc.aggregateOn) // time.Hour, so metrics won't expire + require.NoError(t, err) + + ctx, cancelFunc := context.WithCancel(context.Background()) + t.Cleanup(cancelFunc) + + closeFunc := metricsAggregator.Run(ctx) + t.Cleanup(closeFunc) + + // when + for _, sc := range tc.given { + metricsAggregator.Update(ctx, sc.labels, sc.metrics) + } + + // then + require.Eventually(t, func() bool { + var actual []prometheus.Metric + metricsCh := make(chan prometheus.Metric) + + done := make(chan struct{}, 1) + defer close(done) + go func() { + for m := range metricsCh { + actual = append(actual, m) + } + done <- struct{}{} + }() + metricsAggregator.Collect(metricsCh) + close(metricsCh) + <-done + return verifyCollectedMetrics(t, tc.expected, actual) + }, testutil.WaitMedium, testutil.IntervalSlow) + }) + } +} + func Benchmark_MetricsAggregator_Run(b *testing.B) { + benchmarkRunner(b, agentmetrics.LabelAll) +} + +func Benchmark_MetricsAggregator_RunWithAggregations(b *testing.B) { + for i := 1; i <= len(agentmetrics.LabelAll); i++ { + b.Run(fmt.Sprintf("%d labels", i), func(b *testing.B) { + benchmarkRunner(b, agentmetrics.LabelAll[0:i]) + }) + } +} + +func benchmarkRunner(b *testing.B, aggregateByLabels []string) { + b.ReportAllocs() + // Number of metrics to generate and send in each iteration. // Hard-coded to 1024 to avoid overflowing the queue in the metrics aggregator. numMetrics := 1024 // given registry := prometheus.NewRegistry() - metricsAggregator := must(prometheusmetrics.NewMetricsAggregator( - slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), - registry, - time.Hour, - )) + metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, aggregateByLabels)) ctx, cancelFunc := context.WithCancel(context.Background()) b.Cleanup(cancelFunc) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index e1928fec5f..36bf9b887d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -10,27 +10,20 @@ import ( "sync/atomic" "time" - "github.com/coder/coder/v2/codersdk" - "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "tailscale.com/tailcfg" "cdr.dev/slog" + "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbtime" + "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/tailnet" ) -const ( - templateNameLabel = "template_name" - agentNameLabel = "agent_name" - usernameLabel = "username" - workspaceNameLabel = "workspace_name" -) - // ActiveUsers tracks the number of users that have authenticated within the past hour. func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { if duration == 0 { @@ -156,7 +149,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{usernameLabel, workspaceNameLabel, templateNameLabel, "template_version"})) + }, []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelTemplateName, "template_version"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err @@ -167,7 +160,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "status", "lifecycle_state", "tailnet_node"})) + }, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "status", "lifecycle_state", "tailnet_node"})) err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err @@ -178,7 +171,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "derp_region", "preferred"})) + }, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err @@ -189,7 +182,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "apps", Help: "Agent applications with statuses.", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "app_name", "health"})) + }, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "app_name", "health"})) err = registerer.Register(agentsAppsGauge) if err != nil { return nil, err @@ -335,11 +328,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis }, nil } -func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) { +func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) { if duration == 0 { duration = 1 * time.Minute } + if len(aggregateByLabels) == 0 { + aggregateByLabels = agentmetrics.LabelAgentStats + } + + aggregateByLabels = filterAcceptableAgentLabels(aggregateByLabels) + metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "prometheusmetrics", @@ -357,7 +356,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "tx_bytes", Help: "Agent Tx bytes", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsTxBytesGauge) if err != nil { return nil, err @@ -368,7 +367,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "rx_bytes", Help: "Agent Rx bytes", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsRxBytesGauge) if err != nil { return nil, err @@ -379,7 +378,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "connection_count", Help: "The number of established connections by agent", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsConnectionCountGauge) if err != nil { return nil, err @@ -390,7 +389,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "connection_median_latency_seconds", Help: "The median agent connection latency in seconds", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsConnectionMedianLatencyGauge) if err != nil { return nil, err @@ -401,7 +400,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_jetbrains", Help: "The number of session established by JetBrains", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountJetBrainsGauge) if err != nil { return nil, err @@ -412,7 +411,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_reconnecting_pty", Help: "The number of session established by reconnecting PTY", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge) if err != nil { return nil, err @@ -423,7 +422,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_ssh", Help: "The number of session established by SSH", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountSSHGauge) if err != nil { return nil, err @@ -434,7 +433,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_vscode", Help: "The number of session established by VSCode", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountVSCodeGauge) if err != nil { return nil, err @@ -466,16 +465,28 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R logger.Error(ctx, "can't get agent stats", slog.Error(err)) } else { for _, agentStat := range stats { - agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + var labelValues []string + for _, label := range aggregateByLabels { + switch label { + case agentmetrics.LabelUsername: + labelValues = append(labelValues, agentStat.Username) + case agentmetrics.LabelWorkspaceName: + labelValues = append(labelValues, agentStat.WorkspaceName) + case agentmetrics.LabelAgentName: + labelValues = append(labelValues, agentStat.AgentName) + } + } - agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...) + agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...) - agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...) + agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...) + + agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...) + agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...) + agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...) + agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...) } if len(stats) > 0 { @@ -504,3 +515,17 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R <-done }, nil } + +// filterAcceptableAgentLabels handles a slightly messy situation whereby `prometheus-aggregate-agent-stats-by` can control on +// which labels agent stats are aggregated, but for these specific metrics in this file there is no `template` label value, +// and therefore we have to exclude it from the list of acceptable labels. +func filterAcceptableAgentLabels(labels []string) []string { + out := make([]string, 0, len(labels)) + for _, label := range labels { + if label != agentmetrics.LabelTemplateName { + out = append(out, label) + } + } + + return out +} diff --git a/coderd/prometheusmetrics/prometheusmetrics_internal_test.go b/coderd/prometheusmetrics/prometheusmetrics_internal_test.go new file mode 100644 index 0000000000..5eaf1d92ed --- /dev/null +++ b/coderd/prometheusmetrics/prometheusmetrics_internal_test.go @@ -0,0 +1,40 @@ +package prometheusmetrics + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/agentmetrics" +) + +func TestFilterAcceptableAgentLabels(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input []string + expected []string + }{ + { + name: "template label is ignored", + input: []string{agentmetrics.LabelTemplateName}, + expected: []string{}, + }, + { + name: "all other labels are returned", + input: agentmetrics.LabelAll, + expected: []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName}, + }, + } + + for _, tc := range tests { + tc := tc + + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + require.Equal(t, tc.expected, filterAcceptableAgentLabels(tc.input)) + }) + } +} diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 645f179256..47ddb830c4 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -11,10 +11,6 @@ import ( "testing" "time" - "github.com/coder/coder/v2/coderd/batchstats" - "github.com/coder/coder/v2/coderd/database/dbtestutil" - "github.com/coder/coder/v2/coderd/database/dbtime" - "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" @@ -24,10 +20,14 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/agentmetrics" + "github.com/coder/coder/v2/coderd/batchstats" "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbmem" + "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/agentsdk" @@ -451,7 +451,7 @@ func TestAgentStats(t *testing.T) { // and it doesn't depend on the real time. closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{ IgnoreErrors: true, - }), registry, db, time.Now().Add(-time.Minute), time.Millisecond) + }), registry, db, time.Now().Add(-time.Minute), time.Millisecond, agentmetrics.LabelAll) require.NoError(t, err) t.Cleanup(closeFunc) diff --git a/codersdk/deployment.go b/codersdk/deployment.go index ef4feeab06..b2be661e9a 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "flag" + "fmt" "net/http" "os" "path/filepath" @@ -18,6 +19,7 @@ import ( "github.com/coder/coder/v2/buildinfo" "github.com/coder/coder/v2/cli/clibase" + "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/coder/coder/v2/coderd/workspaceapps/appurl" ) @@ -255,10 +257,11 @@ type DERPConfig struct { } type PrometheusConfig struct { - Enable clibase.Bool `json:"enable" typescript:",notnull"` - Address clibase.HostPort `json:"address" typescript:",notnull"` - CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"` - CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"` + Enable clibase.Bool `json:"enable" typescript:",notnull"` + Address clibase.HostPort `json:"address" typescript:",notnull"` + CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"` + CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"` + AggregateAgentStatsBy clibase.StringArray `json:"aggregate_agent_stats_by" typescript:",notnull"` } type PprofConfig struct { @@ -942,6 +945,22 @@ when required by your organization's security policy.`, Group: &deploymentGroupIntrospectionPrometheus, YAML: "collect_agent_stats", }, + { + Name: "Prometheus Aggregate Agent Stats By", + Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %s.", strings.Join(agentmetrics.LabelAll, ", ")), + Flag: "prometheus-aggregate-agent-stats-by", + Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY", + Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error { + if value == nil { + return nil + } + + return agentmetrics.ValidateAggregationLabels(value.Value()) + }), + Group: &deploymentGroupIntrospectionPrometheus, + YAML: "aggregate_agent_stats_by", + Default: strings.Join(agentmetrics.LabelAll, ","), + }, { Name: "Prometheus Collect Database Metrics", Description: "Collect database metrics (may increase charges for metrics storage).", diff --git a/docs/api/general.md b/docs/api/general.md index 7649729197..b21adb8acf 100644 --- a/docs/api/general.md +++ b/docs/api/general.md @@ -317,6 +317,7 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \ "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true diff --git a/docs/api/schemas.md b/docs/api/schemas.md index 890cd0fd85..6ed655e391 100644 --- a/docs/api/schemas.md +++ b/docs/api/schemas.md @@ -2786,6 +2786,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true @@ -3154,6 +3155,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true @@ -4783,6 +4785,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true @@ -4791,12 +4794,13 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in ### Properties -| Name | Type | Required | Restrictions | Description | -| --------------------- | ------------------------------------ | -------- | ------------ | ----------- | -| `address` | [clibase.HostPort](#clibasehostport) | false | | | -| `collect_agent_stats` | boolean | false | | | -| `collect_db_metrics` | boolean | false | | | -| `enable` | boolean | false | | | +| Name | Type | Required | Restrictions | Description | +| -------------------------- | ------------------------------------ | -------- | ------------ | ----------- | +| `address` | [clibase.HostPort](#clibasehostport) | false | | | +| `aggregate_agent_stats_by` | array of string | false | | | +| `collect_agent_stats` | boolean | false | | | +| `collect_db_metrics` | boolean | false | | | +| `enable` | boolean | false | | | ## codersdk.ProvisionerConfig diff --git a/docs/cli/server.md b/docs/cli/server.md index 5a32845378..6c44cd70d3 100644 --- a/docs/cli/server.md +++ b/docs/cli/server.md @@ -742,6 +742,17 @@ URL of a PostgreSQL database. If empty, PostgreSQL binaries will be downloaded f The bind address to serve prometheus metrics. +### --prometheus-aggregate-agent-stats-by + +| | | +| ----------- | -------------------------------------------------------------- | +| Type | string-array | +| Environment | $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY | +| YAML | introspection.prometheus.aggregate_agent_stats_by | +| Default | agent_name,template_name,username,workspace_name | + +When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are agent_name, template_name, username, workspace_name. + ### --prometheus-collect-agent-stats | | | diff --git a/enterprise/cli/testdata/coder_server_--help.golden b/enterprise/cli/testdata/coder_server_--help.golden index 8e9ccac868..30c2f778e8 100644 --- a/enterprise/cli/testdata/coder_server_--help.golden +++ b/enterprise/cli/testdata/coder_server_--help.golden @@ -124,6 +124,11 @@ INTROSPECTION / PROMETHEUS OPTIONS: --prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112) The bind address to serve prometheus metrics. + --prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name) + When collecting agent stats, aggregate metrics by a given set of + comma-separated labels to reduce cardinality. Accepted values are + agent_name, template_name, username, workspace_name. + --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS Collect agent stats (may increase charges for metrics storage). diff --git a/flake.nix b/flake.nix index cd82bfd84e..f906f3c3d5 100644 --- a/flake.nix +++ b/flake.nix @@ -26,6 +26,7 @@ bat cairo curl + delve drpc.defaultPackage.${system} gcc gdk diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index d5ce9dd5ec..f129ea54a5 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -925,6 +925,7 @@ export interface PrometheusConfig { readonly address: string; readonly collect_agent_stats: boolean; readonly collect_db_metrics: boolean; + readonly aggregate_agent_stats_by: string[]; } // From codersdk/deployment.go