feat: make agent stats' cardinality configurable (#12535)

This commit is contained in:
Danny Kopping 2024-03-13 12:03:36 +02:00 committed by GitHub
parent e45d511f28
commit 7a7105ad66
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 811 additions and 91 deletions

View File

@ -229,13 +229,13 @@ func enablePrometheus(
afterCtx(ctx, closeInsightsMetricsCollector)
if vals.Prometheus.CollectAgentStats {
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0)
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
if err != nil {
return nil, xerrors.Errorf("register agent stats prometheus metric: %w", err)
}
afterCtx(ctx, closeAgentStatsFunc)
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0)
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
if err != nil {
return nil, xerrors.Errorf("can't initialize metrics aggregator: %w", err)
}

View File

@ -123,6 +123,11 @@ INTROSPECTION / PROMETHEUS OPTIONS:
--prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112)
The bind address to serve prometheus metrics.
--prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name)
When collecting agent stats, aggregate metrics by a given set of
comma-separated labels to reduce cardinality. Accepted values are
agent_name, template_name, username, workspace_name.
--prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS
Collect agent stats (may increase charges for metrics storage).

View File

@ -188,6 +188,15 @@ introspection:
# Collect agent stats (may increase charges for metrics storage).
# (default: <unset>, type: bool)
collect_agent_stats: false
# When collecting agent stats, aggregate metrics by a given set of comma-separated
# labels to reduce cardinality. Accepted values are agent_name, template_name,
# username, workspace_name.
# (default: agent_name,template_name,username,workspace_name, type: string-array)
aggregate_agent_stats_by:
- agent_name
- template_name
- username
- workspace_name
# Collect database metrics (may increase charges for metrics storage).
# (default: false, type: bool)
collect_db_metrics: false

View File

@ -0,0 +1,38 @@
package agentmetrics
import (
"strings"
"golang.org/x/xerrors"
)
const (
// Names of the labels that agent metrics can be aggregated by.
LabelAgentName = "agent_name"
LabelTemplateName = "template_name"
LabelUsername = "username"
LabelWorkspaceName = "workspace_name"
)
var (
// LabelAll lists every aggregation label; ValidateAggregationLabels accepts only these values.
LabelAll = []string{LabelAgentName, LabelTemplateName, LabelUsername, LabelWorkspaceName}
// LabelAgentStats is the label set used for agent stats when no aggregation labels are supplied.
// Unlike LabelAll, it does not include LabelTemplateName.
LabelAgentStats = []string{LabelAgentName, LabelUsername, LabelWorkspaceName}
)
// ValidateAggregationLabels checks that every entry in labels is one of the
// labels listed in LabelAll. It returns an error describing the first entry
// that is not; a nil or empty list, and repeated valid entries, are accepted.
func ValidateAggregationLabels(labels []string) error {
	permitted := LabelAll

	for _, candidate := range labels {
		known := false
		for _, allowed := range permitted {
			if candidate == allowed {
				known = true
				break
			}
		}
		if known {
			continue
		}

		return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable",
			candidate, strings.Join(permitted, ", "))
	}

	return nil
}

View File

@ -0,0 +1,57 @@
package agentmetrics_test
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/coder/coder/v2/coderd/agentmetrics"
)
// TestValidateAggregationLabels exercises ValidateAggregationLabels with valid,
// repeated, empty and invalid label lists. Every case asserts an outcome: an
// error where one is expected, and no error otherwise, so a regression that
// rejects valid labels is caught as well.
func TestValidateAggregationLabels(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name        string
		labels      []string
		expectedErr bool
	}{
		{
			name: "empty list is valid",
		},
		{
			name:   "single valid entry",
			labels: []string{agentmetrics.LabelTemplateName},
		},
		{
			name:   "multiple valid entries",
			labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername},
		},
		{
			name:   "repeated valid entries are not invalid",
			labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername, agentmetrics.LabelUsername, agentmetrics.LabelUsername},
		},
		{
			name:        "empty entry is invalid",
			labels:      []string{""},
			expectedErr: true,
		},
		{
			name:   "all valid entries",
			labels: agentmetrics.LabelAll,
		},
	}

	for _, tc := range tests {
		tc := tc

		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			err := agentmetrics.ValidateAggregationLabels(tc.labels)
			if tc.expectedErr {
				require.Error(t, err)
				return
			}

			// Valid inputs must be accepted; without this assertion the
			// non-error cases would pass regardless of the result.
			require.NoError(t, err)
		})
	}
}

6
coderd/apidoc/docs.go generated
View File

@ -10952,6 +10952,12 @@ const docTemplate = `{
"address": {
"$ref": "#/definitions/clibase.HostPort"
},
"aggregate_agent_stats_by": {
"type": "array",
"items": {
"type": "string"
}
},
"collect_agent_stats": {
"type": "boolean"
},

View File

@ -9860,6 +9860,12 @@
"address": {
"$ref": "#/definitions/clibase.HostPort"
},
"aggregate_agent_stats_by": {
"type": "array",
"items": {
"type": "string"
}
},
"collect_agent_stats": {
"type": "boolean"
},

View File

@ -8,8 +8,11 @@ import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/agentmetrics"
"cdr.dev/slog"
agentproto "github.com/coder/coder/v2/agent/proto"
@ -43,9 +46,10 @@ type MetricsAggregator struct {
collectCh chan (chan []prometheus.Metric)
updateCh chan updateRequest
storeSizeGauge prometheus.Gauge
updateHistogram prometheus.Histogram
cleanupHistogram prometheus.Histogram
storeSizeGauge prometheus.Gauge
updateHistogram prometheus.Histogram
cleanupHistogram prometheus.Histogram
aggregateByLabels []string
}
type updateRequest struct {
@ -68,6 +72,8 @@ type annotatedMetric struct {
templateName string
expiryDate time.Time
aggregateByLabels []string
}
type metricKey struct {
@ -102,13 +108,28 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey {
var _ prometheus.Collector = new(MetricsAggregator)
func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels))
labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels))
var (
baseLabelNames = am.aggregateByLabels
baseLabelValues []string
extraLabels = am.Labels
)
labels = append(labels, agentMetricsLabels...)
labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName)
for _, label := range baseLabelNames {
val, err := am.getFieldByLabel(label)
if err != nil {
return nil, err
}
for _, l := range am.Labels {
baseLabelValues = append(baseLabelValues, val)
}
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels))
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels))
labels = append(labels, baseLabelNames...)
labelValues = append(labelValues, baseLabelValues...)
for _, l := range extraLabels {
labels = append(labels, l.Name)
labelValues = append(labelValues, l.Value)
}
@ -118,10 +139,48 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
if err != nil {
return nil, err
}
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil
}
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) {
// getFieldByLabel maps an aggregation label name to the value this metric
// carries for it. Any label other than the four agentmetrics label constants
// handled below results in an error.
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) {
	switch label {
	case agentmetrics.LabelAgentName:
		return am.agentName, nil
	case agentmetrics.LabelTemplateName:
		return am.templateName, nil
	case agentmetrics.LabelUsername:
		return am.username, nil
	case agentmetrics.LabelWorkspaceName:
		return am.workspaceName, nil
	}

	return "", xerrors.Errorf("unexpected label: %q", label)
}
// shallowCopy returns a copy of am backed by a freshly allocated
// agentproto.Stats_Metric, so that changing the copy's Name, Type or Value
// does not affect the original. The copy is shallow: the Labels slice and the
// aggregateByLabels slice are shared with the original rather than cloned.
func (am *annotatedMetric) shallowCopy() annotatedMetric {
	stats := &agentproto.Stats_Metric{
		Name:   am.Name,
		Type:   am.Type,
		Value:  am.Value,
		Labels: am.Labels,
	}

	return annotatedMetric{
		Stats_Metric: stats,

		username:      am.username,
		workspaceName: am.workspaceName,
		agentName:     am.agentName,
		templateName:  am.templateName,

		expiryDate: am.expiryDate,

		// Carry the aggregation labels over as well so the copy is faithful;
		// previously this field was silently dropped.
		aggregateByLabels: am.aggregateByLabels,
	}
}
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) {
metricsCleanupInterval := defaultMetricsCleanupInterval
if duration > 0 {
metricsCleanupInterval = duration
@ -174,9 +233,66 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer,
storeSizeGauge: storeSizeGauge,
updateHistogram: updateHistogram,
cleanupHistogram: cleanupHistogram,
aggregateByLabels: aggregateByLabels,
}, nil
}
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels.
// Both maps are indexed by the series key computed in aggregate.
type labelAggregator struct {
// aggregations holds the running sum of the values aggregated under each series key.
aggregations map[string]float64
// metrics holds, for each series key, the annotated metric to emit; its Value is kept equal to the running sum.
metrics map[string]annotatedMetric
}
// newLabelAggregator returns an empty labelAggregator whose internal maps are
// pre-sized for size entries.
func newLabelAggregator(size int) *labelAggregator {
	la := new(labelAggregator)
	la.aggregations = make(map[string]float64, size)
	la.metrics = make(map[string]annotatedMetric, size)
	return la
}
// aggregate adds am's value to the running total of the series it belongs to
// and records the metric that will be emitted for that series.
//
// A series is identified by the metric name, the values of the given
// aggregation labels, and the metric's own extra labels (am.Labels). The extra
// labels are part of the identity because they are emitted with the metric:
// without them, metrics that differ only in their extra labels would be summed
// together and reported under whichever extra labels were seen first.
//
// An error is returned, and nothing is recorded, if any entry in labels is not
// a label known to getFieldByLabel.
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error {
	// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering.
	labelSet := make(model.LabelSet, len(labels)+len(am.Labels))

	for _, l := range am.Labels {
		if l == nil {
			continue
		}
		labelSet[model.LabelName(l.Name)] = model.LabelValue(l.Value)
	}

	// Aggregation labels are written last so that they take precedence over an
	// identically named extra label.
	for _, label := range labels {
		val, err := am.getFieldByLabel(label)
		if err != nil {
			return err
		}

		labelSet[model.LabelName(label)] = model.LabelValue(val)
	}

	// Memoize based on the metric name & the unique combination of labels.
	key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint())

	// Aggregate the value based on the key.
	a.aggregations[key] += am.Value

	metric, found := a.metrics[key]
	if !found {
		// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers.
		metric = am.shallowCopy()
	}

	// Store the metric.
	metric.aggregateByLabels = labels
	metric.Value = a.aggregations[key]

	a.metrics[key] = metric

	return nil
}
// listMetrics returns the aggregated metrics, one per series recorded by
// aggregate. The order of the returned slice is unspecified because it follows
// map iteration order.
func (a *labelAggregator) listMetrics() []annotatedMetric {
	// The final length is known up front, so allocate once instead of growing.
	out := make([]annotatedMetric, 0, len(a.metrics))
	for _, am := range a.metrics {
		out = append(out, am)
	}
	return out
}
func (ma *MetricsAggregator) Run(ctx context.Context) func() {
ctx, cancelFunc := context.WithCancel(ctx)
done := make(chan struct{})
@ -216,8 +332,38 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
case outputCh := <-ma.collectCh:
ma.log.Debug(ctx, "collect metrics")
var input []annotatedMetric
output := make([]prometheus.Metric, 0, len(ma.store))
for _, m := range ma.store {
if len(ma.aggregateByLabels) == 0 {
ma.aggregateByLabels = agentmetrics.LabelAll
}
// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation.
// This results in higher cardinality, but may be desirable in larger deployments.
//
// Default behavior.
if len(ma.aggregateByLabels) == len(agentmetrics.LabelAll) {
for _, m := range ma.store {
// Aggregate by all available labels.
m.aggregateByLabels = defaultAgentMetricsLabels
input = append(input, m)
}
} else {
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated
// metrics because we cannot register multiple metric series with the same labels.
la := newLabelAggregator(len(ma.store))
for _, m := range ma.store {
if err := la.aggregate(m, ma.aggregateByLabels); err != nil {
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err))
}
}
input = la.listMetrics()
}
for _, m := range input {
promMetric, err := m.asPrometheus()
if err != nil {
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err))
@ -225,6 +371,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
}
output = append(output, promMetric)
}
outputCh <- output
close(outputCh)
case <-cleanupTicker.C:
@ -260,7 +407,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
}
var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel}
var defaultAgentMetricsLabels = []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}
// AgentMetricLabels are the labels used to decorate an agent's metrics.
// This list should match the list of labels in defaultAgentMetricsLabels.

View File

@ -2,6 +2,7 @@ package prometheusmetrics_test
import (
"context"
"fmt"
"sort"
"strings"
"sync/atomic"
@ -14,6 +15,7 @@ import (
"github.com/stretchr/testify/require"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/coderd/agentmetrics"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
@ -40,7 +42,7 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) {
// given
registry := prometheus.NewRegistry()
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour) // time.Hour, so metrics won't expire
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, nil) // time.Hour, so metrics won't expire
require.NoError(t, err)
ctx, cancelFunc := context.WithCancel(context.Background())
@ -93,54 +95,54 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) {
}
commonLabels := []*agentproto.Stats_Metric_Label{
{Name: "agent_name", Value: testAgentName},
{Name: "username", Value: testUsername},
{Name: "workspace_name", Value: testWorkspaceName},
{Name: "template_name", Value: testTemplateName},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}
expected := []*agentproto.Stats_Metric{
{Name: "a_counter_one", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
{Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: -9, Labels: []*agentproto.Stats_Metric_Label{
{Name: "agent_name", Value: testAgentName},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: "lizz", Value: "rizz"},
{Name: "username", Value: testUsername},
{Name: "workspace_name", Value: testWorkspaceName},
{Name: "template_name", Value: testTemplateName},
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}},
{Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: 4, Labels: commonLabels},
{Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 2, Labels: []*agentproto.Stats_Metric_Label{
{Name: "agent_name", Value: testAgentName},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: "foobar", Value: "Foobaz"},
{Name: "hello", Value: "world"},
{Name: "username", Value: testUsername},
{Name: "workspace_name", Value: testWorkspaceName},
{Name: "template_name", Value: testTemplateName},
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}},
{Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 5, Labels: commonLabels},
{Name: "d_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 6, Labels: commonLabels},
{Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 17, Labels: []*agentproto.Stats_Metric_Label{
{Name: "agent_name", Value: testAgentName},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: "cat", Value: "do,=g"},
{Name: "hello", Value: "wo,,rld"},
{Name: "username", Value: testUsername},
{Name: "workspace_name", Value: testWorkspaceName},
{Name: "template_name", Value: testTemplateName},
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}},
{Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 15, Labels: []*agentproto.Stats_Metric_Label{
{Name: "agent_name", Value: testAgentName},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: "foobar", Value: "Foo,ba=z"},
{Name: "halo", Value: "wor\\,d=1,e=\\,2"},
{Name: "hello", Value: "wo,,r=d"},
{Name: "username", Value: testUsername},
{Name: "workspace_name", Value: testWorkspaceName},
{Name: "template_name", Value: testTemplateName},
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}},
{Name: "f_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 8, Labels: []*agentproto.Stats_Metric_Label{
{Name: "agent_name", Value: testAgentName},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: "foobar", Value: "foobaz"},
{Name: "username", Value: testUsername},
{Name: "workspace_name", Value: testWorkspaceName},
{Name: "template_name", Value: testTemplateName},
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}},
}
@ -175,6 +177,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a
return false
}
// ensure stable iteration order
sort.Slice(expected, func(i, j int) bool {
return expected[i].Name < expected[j].Name
})
sort.Slice(actual, func(i, j int) bool {
m1 := prometheusMetricToString(t, actual[i])
m2 := prometheusMetricToString(t, actual[j])
@ -199,9 +206,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a
dtoLabels := asMetricAgentLabels(d.GetLabel())
// dto labels are sorted in alphabetical order.
sort.Slice(e.Labels, func(i, j int) bool {
sortFn := func(i, j int) bool {
return e.Labels[i].Name < e.Labels[j].Name
})
}
sort.Slice(e.Labels, sortFn)
sort.Slice(dtoLabels, sortFn)
require.Equal(t, e.Labels, dtoLabels, d.String())
}
return true
@ -253,7 +262,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) {
// given
registry := prometheus.NewRegistry()
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond)
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, agentmetrics.LabelAll)
require.NoError(t, err)
ctx, cancelFunc := context.WithCancel(context.Background())
@ -291,18 +300,354 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) {
}, testutil.WaitShort, testutil.IntervalFast)
}
func TestLabelsAggregation(t *testing.T) {
t.Parallel()
type statCollection struct {
labels prometheusmetrics.AgentMetricLabels
metrics []*agentproto.Stats_Metric
}
commonLabels := []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelUsername, Value: testUsername},
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
}
tests := []struct {
name string
given []statCollection
expected []*agentproto.Stats_Metric
aggregateOn []string
}{
{
name: "label aggregations not specified, keep all (high cardinality, default behavior)",
aggregateOn: agentmetrics.LabelAll,
given: []statCollection{
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
},
},
},
expected: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4, Labels: commonLabels},
},
},
{
// Scenario: 2 users are using the same agent and we've configured the deployment to aggregate on the "agent_name" label.
name: "single label aggregation, aggregating to single metric",
aggregateOn: []string{agentmetrics.LabelAgentName},
given: []statCollection{
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user1",
AgentName: "agent1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user2",
AgentName: "agent1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
},
},
},
expected: []*agentproto.Stats_Metric{
// We only observed one agent_name value, so all metrics are aggregated to a single series.
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
}},
},
},
{
// Scenario: as above, but we're aggregating on two invariant labels.
name: "multiple label aggregation, aggregating to single metric",
aggregateOn: []string{agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName},
given: []statCollection{
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user1",
AgentName: "agent1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user2",
AgentName: "agent1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
},
},
},
expected: []*agentproto.Stats_Metric{
// We only observed one agent_name & template_name tuple, so all metrics are aggregated to a single series.
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
}},
},
},
{
// Scenario: aggregating on a label which is unique across all metrics.
name: "single label aggregation, aggregating to multiple metrics",
aggregateOn: []string{agentmetrics.LabelUsername},
given: []statCollection{
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user1",
AgentName: "agent1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user2",
AgentName: "agent1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
},
},
},
expected: []*agentproto.Stats_Metric{
// We observed two unique username values, and therefore we have a metric for each.
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelUsername, Value: "user1"},
}},
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelUsername, Value: "user2"},
}},
},
},
{
// Scenario: aggregating on a label which is unique across all metrics, plus two invariant labels.
name: "multiple label aggregation, aggregating to multiple metrics",
aggregateOn: []string{agentmetrics.LabelUsername, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName},
given: []statCollection{
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user1",
AgentName: "agent1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "user2",
AgentName: "agent1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
},
},
},
expected: []*agentproto.Stats_Metric{
// We observed two unique username values, and therefore we have a metric for each.
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelUsername, Value: "user1"},
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
}},
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelUsername, Value: "user2"},
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
}},
},
},
{
name: "extra labels are retained, even with label aggregations",
aggregateOn: []string{agentmetrics.LabelUsername},
given: []statCollection{
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
{Name: "lizz", Value: "rizz"},
}},
},
},
},
expected: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelUsername, Value: testUsername},
}},
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
{Name: "lizz", Value: "rizz"},
{Name: agentmetrics.LabelUsername, Value: testUsername},
}},
},
},
{
// Both counters and gauges should have all their values summed to produce the correct output.
name: "counters & gauges behave identically",
aggregateOn: []string{agentmetrics.LabelTemplateName},
given: []statCollection{
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "username1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 3},
},
},
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "username2",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 2},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
},
},
},
expected: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
}},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
}},
},
},
{
// Scenario: validation fails and an invalid label is selected for aggregation.
name: "invalid label aggregation",
aggregateOn: []string{"nonsense"},
given: []statCollection{
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
},
// Nothing will be returned.
expected: []*agentproto.Stats_Metric{},
},
{
// Scenario: validation fails and an empty list is given for aggregation.
name: "empty label aggregation list",
aggregateOn: []string{},
given: []statCollection{
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
},
// Default aggregation will be used.
expected: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
},
},
}
for _, tc := range tests {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
// given
registry := prometheus.NewRegistry()
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, tc.aggregateOn) // time.Hour, so metrics won't expire
require.NoError(t, err)
ctx, cancelFunc := context.WithCancel(context.Background())
t.Cleanup(cancelFunc)
closeFunc := metricsAggregator.Run(ctx)
t.Cleanup(closeFunc)
// when
for _, sc := range tc.given {
metricsAggregator.Update(ctx, sc.labels, sc.metrics)
}
// then
require.Eventually(t, func() bool {
var actual []prometheus.Metric
metricsCh := make(chan prometheus.Metric)
done := make(chan struct{}, 1)
defer close(done)
go func() {
for m := range metricsCh {
actual = append(actual, m)
}
done <- struct{}{}
}()
metricsAggregator.Collect(metricsCh)
close(metricsCh)
<-done
return verifyCollectedMetrics(t, tc.expected, actual)
}, testutil.WaitMedium, testutil.IntervalSlow)
})
}
}
// Benchmark_MetricsAggregator_Run benchmarks the metrics aggregator using every label in
// agentmetrics.LabelAll as the aggregation labels.
func Benchmark_MetricsAggregator_Run(b *testing.B) {
benchmarkRunner(b, agentmetrics.LabelAll)
}
// Benchmark_MetricsAggregator_RunWithAggregations runs the aggregator benchmark
// once for each leading subset of agentmetrics.LabelAll, from a single label up
// to the full list, as a sub-benchmark named after the number of labels.
func Benchmark_MetricsAggregator_RunWithAggregations(b *testing.B) {
	for count := 1; count <= len(agentmetrics.LabelAll); count++ {
		name := fmt.Sprintf("%d labels", count)
		b.Run(name, func(b *testing.B) {
			subset := agentmetrics.LabelAll[:count]
			benchmarkRunner(b, subset)
		})
	}
}
func benchmarkRunner(b *testing.B, aggregateByLabels []string) {
b.ReportAllocs()
// Number of metrics to generate and send in each iteration.
// Hard-coded to 1024 to avoid overflowing the queue in the metrics aggregator.
numMetrics := 1024
// given
registry := prometheus.NewRegistry()
metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(
slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}),
registry,
time.Hour,
))
metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, aggregateByLabels))
ctx, cancelFunc := context.WithCancel(context.Background())
b.Cleanup(cancelFunc)

View File

@ -10,27 +10,20 @@ import (
"sync/atomic"
"time"
"github.com/coder/coder/v2/codersdk"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"tailscale.com/tailcfg"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet"
)
const (
templateNameLabel = "template_name"
agentNameLabel = "agent_name"
usernameLabel = "username"
workspaceNameLabel = "workspace_name"
)
// ActiveUsers tracks the number of users that have authenticated within the past hour.
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
@ -156,7 +149,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
Subsystem: "agents",
Name: "up",
Help: "The number of active agents per workspace.",
}, []string{usernameLabel, workspaceNameLabel, templateNameLabel, "template_version"}))
}, []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelTemplateName, "template_version"}))
err := registerer.Register(agentsGauge)
if err != nil {
return nil, err
@ -167,7 +160,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
Subsystem: "agents",
Name: "connections",
Help: "Agent connections with statuses.",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "status", "lifecycle_state", "tailnet_node"}))
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "status", "lifecycle_state", "tailnet_node"}))
err = registerer.Register(agentsConnectionsGauge)
if err != nil {
return nil, err
@ -178,7 +171,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
Subsystem: "agents",
Name: "connection_latencies_seconds",
Help: "Agent connection latencies in seconds.",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "derp_region", "preferred"}))
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "derp_region", "preferred"}))
err = registerer.Register(agentsConnectionLatenciesGauge)
if err != nil {
return nil, err
@ -189,7 +182,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
Subsystem: "agents",
Name: "apps",
Help: "Agent applications with statuses.",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "app_name", "health"}))
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "app_name", "health"}))
err = registerer.Register(agentsAppsGauge)
if err != nil {
return nil, err
@ -335,11 +328,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
}, nil
}
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) {
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
if duration == 0 {
duration = 1 * time.Minute
}
if len(aggregateByLabels) == 0 {
aggregateByLabels = agentmetrics.LabelAgentStats
}
aggregateByLabels = filterAcceptableAgentLabels(aggregateByLabels)
metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "prometheusmetrics",
@ -357,7 +356,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "tx_bytes",
Help: "Agent Tx bytes",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsTxBytesGauge)
if err != nil {
return nil, err
@ -368,7 +367,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "rx_bytes",
Help: "Agent Rx bytes",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsRxBytesGauge)
if err != nil {
return nil, err
@ -379,7 +378,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "connection_count",
Help: "The number of established connections by agent",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsConnectionCountGauge)
if err != nil {
return nil, err
@ -390,7 +389,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "connection_median_latency_seconds",
Help: "The median agent connection latency in seconds",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsConnectionMedianLatencyGauge)
if err != nil {
return nil, err
@ -401,7 +400,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "session_count_jetbrains",
Help: "The number of session established by JetBrains",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsSessionCountJetBrainsGauge)
if err != nil {
return nil, err
@ -412,7 +411,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "session_count_reconnecting_pty",
Help: "The number of session established by reconnecting PTY",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge)
if err != nil {
return nil, err
@ -423,7 +422,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "session_count_ssh",
Help: "The number of session established by SSH",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsSessionCountSSHGauge)
if err != nil {
return nil, err
@ -434,7 +433,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
Subsystem: "agentstats",
Name: "session_count_vscode",
Help: "The number of session established by VSCode",
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
}, aggregateByLabels))
err = registerer.Register(agentStatsSessionCountVSCodeGauge)
if err != nil {
return nil, err
@ -466,16 +465,28 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
logger.Error(ctx, "can't get agent stats", slog.Error(err))
} else {
for _, agentStat := range stats {
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
var labelValues []string
for _, label := range aggregateByLabels {
switch label {
case agentmetrics.LabelUsername:
labelValues = append(labelValues, agentStat.Username)
case agentmetrics.LabelWorkspaceName:
labelValues = append(labelValues, agentStat.WorkspaceName)
case agentmetrics.LabelAgentName:
labelValues = append(labelValues, agentStat.AgentName)
}
}
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...)
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...)
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...)
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...)
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...)
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...)
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...)
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...)
}
if len(stats) > 0 {
@ -504,3 +515,17 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
<-done
}, nil
}
// filterAcceptableAgentLabels returns a new slice holding every entry of
// labels except the template name label, preserving the input order.
//
// The `prometheus-aggregate-agent-stats-by` option selects the labels agent
// stats are aggregated by, but the agent stats metrics in this file carry no
// template label value, so that label is dropped here.
//
// The result is always non-nil, even when every input label is removed.
func filterAcceptableAgentLabels(labels []string) []string {
	accepted := make([]string, 0, len(labels))
	for i := range labels {
		if labels[i] == agentmetrics.LabelTemplateName {
			continue
		}
		accepted = append(accepted, labels[i])
	}
	return accepted
}

View File

@ -0,0 +1,40 @@
package prometheusmetrics
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/coder/coder/v2/coderd/agentmetrics"
)
// TestFilterAcceptableAgentLabels checks that filterAcceptableAgentLabels
// removes the template name label and returns all remaining labels.
func TestFilterAcceptableAgentLabels(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name     string
		input    []string
		expected []string
	}

	cases := []testCase{
		{
			name:     "template label is ignored",
			input:    []string{agentmetrics.LabelTemplateName},
			expected: []string{},
		},
		{
			name:  "all other labels are returned",
			input: agentmetrics.LabelAll,
			expected: []string{
				agentmetrics.LabelAgentName,
				agentmetrics.LabelUsername,
				agentmetrics.LabelWorkspaceName,
			},
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel subtest closure does not
		// observe a later case.
		c := cases[i]
		t.Run(c.name, func(t *testing.T) {
			t.Parallel()

			got := filterAcceptableAgentLabels(c.input)
			require.Equal(t, c.expected, got)
		})
	}
}

View File

@ -11,10 +11,6 @@ import (
"testing"
"time"
"github.com/coder/coder/v2/coderd/batchstats"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
@ -24,10 +20,14 @@ import (
"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/coder/coder/v2/coderd/batchstats"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbmem"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
@ -451,7 +451,7 @@ func TestAgentStats(t *testing.T) {
// and it doesn't depend on the real time.
closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{
IgnoreErrors: true,
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond)
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond, agentmetrics.LabelAll)
require.NoError(t, err)
t.Cleanup(closeFunc)

View File

@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
@ -18,6 +19,7 @@ import (
"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/cli/clibase"
"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
)
@ -255,10 +257,11 @@ type DERPConfig struct {
}
// PrometheusConfig groups the deployment options for Prometheus metrics.
type PrometheusConfig struct {
Enable clibase.Bool `json:"enable" typescript:",notnull"`
Address clibase.HostPort `json:"address" typescript:",notnull"`
CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"`
CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"`
Enable clibase.Bool `json:"enable" typescript:",notnull"`
Address clibase.HostPort `json:"address" typescript:",notnull"`
CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"`
CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"`
AggregateAgentStatsBy clibase.StringArray `json:"aggregate_agent_stats_by" typescript:",notnull"`
}
type PprofConfig struct {
@ -942,6 +945,22 @@ when required by your organization's security policy.`,
Group: &deploymentGroupIntrospectionPrometheus,
YAML: "collect_agent_stats",
},
{
Name: "Prometheus Aggregate Agent Stats By",
Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %s.", strings.Join(agentmetrics.LabelAll, ", ")),
Flag: "prometheus-aggregate-agent-stats-by",
Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY",
Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error {
if value == nil {
return nil
}
return agentmetrics.ValidateAggregationLabels(value.Value())
}),
Group: &deploymentGroupIntrospectionPrometheus,
YAML: "aggregate_agent_stats_by",
Default: strings.Join(agentmetrics.LabelAll, ","),
},
{
Name: "Prometheus Collect Database Metrics",
Description: "Collect database metrics (may increase charges for metrics storage).",

1
docs/api/general.md generated
View File

@ -317,6 +317,7 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \
"host": "string",
"port": "string"
},
"aggregate_agent_stats_by": ["string"],
"collect_agent_stats": true,
"collect_db_metrics": true,
"enable": true

16
docs/api/schemas.md generated
View File

@ -2786,6 +2786,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
"host": "string",
"port": "string"
},
"aggregate_agent_stats_by": ["string"],
"collect_agent_stats": true,
"collect_db_metrics": true,
"enable": true
@ -3154,6 +3155,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
"host": "string",
"port": "string"
},
"aggregate_agent_stats_by": ["string"],
"collect_agent_stats": true,
"collect_db_metrics": true,
"enable": true
@ -4783,6 +4785,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
"host": "string",
"port": "string"
},
"aggregate_agent_stats_by": ["string"],
"collect_agent_stats": true,
"collect_db_metrics": true,
"enable": true
@ -4791,12 +4794,13 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
### Properties
| Name | Type | Required | Restrictions | Description |
| --------------------- | ------------------------------------ | -------- | ------------ | ----------- |
| `address` | [clibase.HostPort](#clibasehostport) | false | | |
| `collect_agent_stats` | boolean | false | | |
| `collect_db_metrics` | boolean | false | | |
| `enable` | boolean | false | | |
| Name | Type | Required | Restrictions | Description |
| -------------------------- | ------------------------------------ | -------- | ------------ | ----------- |
| `address` | [clibase.HostPort](#clibasehostport) | false | | |
| `aggregate_agent_stats_by` | array of string | false | | |
| `collect_agent_stats` | boolean | false | | |
| `collect_db_metrics` | boolean | false | | |
| `enable` | boolean | false | | |
## codersdk.ProvisionerConfig

11
docs/cli/server.md generated
View File

@ -742,6 +742,17 @@ URL of a PostgreSQL database. If empty, PostgreSQL binaries will be downloaded f
The bind address to serve prometheus metrics.
### --prometheus-aggregate-agent-stats-by
| | |
| ----------- | -------------------------------------------------------------- |
| Type | <code>string-array</code> |
| Environment | <code>$CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY</code> |
| YAML | <code>introspection.prometheus.aggregate_agent_stats_by</code> |
| Default | <code>agent_name,template_name,username,workspace_name</code> |
When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are agent_name, template_name, username, workspace_name.
### --prometheus-collect-agent-stats
| | |

View File

@ -124,6 +124,11 @@ INTROSPECTION / PROMETHEUS OPTIONS:
--prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112)
The bind address to serve prometheus metrics.
--prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name)
When collecting agent stats, aggregate metrics by a given set of
comma-separated labels to reduce cardinality. Accepted values are
agent_name, template_name, username, workspace_name.
--prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS
Collect agent stats (may increase charges for metrics storage).

View File

@ -26,6 +26,7 @@
bat
cairo
curl
delve
drpc.defaultPackage.${system}
gcc
gdk

View File

@ -925,6 +925,7 @@ export interface PrometheusConfig {
readonly address: string;
readonly collect_agent_stats: boolean;
readonly collect_db_metrics: boolean;
readonly aggregate_agent_stats_by: string[];
}
// From codersdk/deployment.go