mirror of https://github.com/coder/coder.git
feat: make agent stats' cardinality configurable (#12535)
This commit is contained in:
parent
e45d511f28
commit
7a7105ad66
|
@ -229,13 +229,13 @@ func enablePrometheus(
|
|||
afterCtx(ctx, closeInsightsMetricsCollector)
|
||||
|
||||
if vals.Prometheus.CollectAgentStats {
|
||||
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0)
|
||||
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("register agent stats prometheus metric: %w", err)
|
||||
}
|
||||
afterCtx(ctx, closeAgentStatsFunc)
|
||||
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0)
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("can't initialize metrics aggregator: %w", err)
|
||||
}
|
||||
|
|
|
@ -123,6 +123,11 @@ INTROSPECTION / PROMETHEUS OPTIONS:
|
|||
--prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112)
|
||||
The bind address to serve prometheus metrics.
|
||||
|
||||
--prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name)
|
||||
When collecting agent stats, aggregate metrics by a given set of
|
||||
comma-separated labels to reduce cardinality. Accepted values are
|
||||
agent_name, template_name, username, workspace_name.
|
||||
|
||||
--prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS
|
||||
Collect agent stats (may increase charges for metrics storage).
|
||||
|
||||
|
|
|
@ -188,6 +188,15 @@ introspection:
|
|||
# Collect agent stats (may increase charges for metrics storage).
|
||||
# (default: <unset>, type: bool)
|
||||
collect_agent_stats: false
|
||||
# When collecting agent stats, aggregate metrics by a given set of comma-separated
|
||||
# labels to reduce cardinality. Accepted values are agent_name, template_name,
|
||||
# username, workspace_name.
|
||||
# (default: agent_name,template_name,username,workspace_name, type: string-array)
|
||||
aggregate_agent_stats_by:
|
||||
- agent_name
|
||||
- template_name
|
||||
- username
|
||||
- workspace_name
|
||||
# Collect database metrics (may increase charges for metrics storage).
|
||||
# (default: false, type: bool)
|
||||
collect_db_metrics: false
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package agentmetrics
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
)
|
||||
|
||||
// Names of the individual labels that make up LabelAll, the set accepted by
// ValidateAggregationLabels.
const (
|
||||
// LabelAgentName is the "agent_name" label.
LabelAgentName = "agent_name"
|
||||
// LabelTemplateName is the "template_name" label.
LabelTemplateName = "template_name"
|
||||
// LabelUsername is the "username" label.
LabelUsername = "username"
|
||||
// LabelWorkspaceName is the "workspace_name" label.
LabelWorkspaceName = "workspace_name"
|
||||
)
|
||||
|
||||
var (
|
||||
// LabelAll lists every label name declared in this package. It is the set
// ValidateAggregationLabels treats as acceptable.
LabelAll = []string{LabelAgentName, LabelTemplateName, LabelUsername, LabelWorkspaceName}
|
||||
// LabelAgentStats is LabelAll without LabelTemplateName.
// NOTE(review): presumably the label set used for the agent stats metrics;
// its consumer is not visible here — confirm.
LabelAgentStats = []string{LabelAgentName, LabelUsername, LabelWorkspaceName}
|
||||
)
|
||||
|
||||
// ValidateAggregationLabels ensures a given set of labels are valid aggregation labels.
|
||||
func ValidateAggregationLabels(labels []string) error {
|
||||
acceptable := LabelAll
|
||||
|
||||
seen := make(map[string]any, len(acceptable))
|
||||
for _, label := range acceptable {
|
||||
seen[label] = nil
|
||||
}
|
||||
|
||||
for _, label := range labels {
|
||||
if _, found := seen[label]; !found {
|
||||
return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable",
|
||||
label, strings.Join(acceptable, ", "))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
package agentmetrics_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
)
|
||||
|
||||
func TestValidateAggregationLabels(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
labels []string
|
||||
expectedErr bool
|
||||
}{
|
||||
{
|
||||
name: "empty list is valid",
|
||||
},
|
||||
{
|
||||
name: "single valid entry",
|
||||
labels: []string{agentmetrics.LabelTemplateName},
|
||||
},
|
||||
{
|
||||
name: "multiple valid entries",
|
||||
labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername},
|
||||
},
|
||||
{
|
||||
name: "repeated valid entries are not invalid",
|
||||
labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername, agentmetrics.LabelUsername, agentmetrics.LabelUsername},
|
||||
},
|
||||
{
|
||||
name: "empty entry is invalid",
|
||||
labels: []string{""},
|
||||
expectedErr: true,
|
||||
},
|
||||
{
|
||||
name: "all valid entries",
|
||||
labels: agentmetrics.LabelAll,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
err := agentmetrics.ValidateAggregationLabels(tc.labels)
|
||||
if tc.expectedErr {
|
||||
require.Error(t, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
|
@ -10952,6 +10952,12 @@ const docTemplate = `{
|
|||
"address": {
|
||||
"$ref": "#/definitions/clibase.HostPort"
|
||||
},
|
||||
"aggregate_agent_stats_by": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"collect_agent_stats": {
|
||||
"type": "boolean"
|
||||
},
|
||||
|
|
|
@ -9860,6 +9860,12 @@
|
|||
"address": {
|
||||
"$ref": "#/definitions/clibase.HostPort"
|
||||
},
|
||||
"aggregate_agent_stats_by": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"collect_agent_stats": {
|
||||
"type": "boolean"
|
||||
},
|
||||
|
|
|
@ -8,8 +8,11 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/model"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
|
||||
"cdr.dev/slog"
|
||||
|
||||
agentproto "github.com/coder/coder/v2/agent/proto"
|
||||
|
@ -46,6 +49,7 @@ type MetricsAggregator struct {
|
|||
storeSizeGauge prometheus.Gauge
|
||||
updateHistogram prometheus.Histogram
|
||||
cleanupHistogram prometheus.Histogram
|
||||
aggregateByLabels []string
|
||||
}
|
||||
|
||||
type updateRequest struct {
|
||||
|
@ -68,6 +72,8 @@ type annotatedMetric struct {
|
|||
templateName string
|
||||
|
||||
expiryDate time.Time
|
||||
|
||||
aggregateByLabels []string
|
||||
}
|
||||
|
||||
type metricKey struct {
|
||||
|
@ -102,13 +108,28 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey {
|
|||
var _ prometheus.Collector = new(MetricsAggregator)
|
||||
|
||||
func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
|
||||
labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels))
|
||||
labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels))
|
||||
var (
|
||||
baseLabelNames = am.aggregateByLabels
|
||||
baseLabelValues []string
|
||||
extraLabels = am.Labels
|
||||
)
|
||||
|
||||
labels = append(labels, agentMetricsLabels...)
|
||||
labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName)
|
||||
for _, label := range baseLabelNames {
|
||||
val, err := am.getFieldByLabel(label)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, l := range am.Labels {
|
||||
baseLabelValues = append(baseLabelValues, val)
|
||||
}
|
||||
|
||||
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels))
|
||||
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels))
|
||||
|
||||
labels = append(labels, baseLabelNames...)
|
||||
labelValues = append(labelValues, baseLabelValues...)
|
||||
|
||||
for _, l := range extraLabels {
|
||||
labels = append(labels, l.Name)
|
||||
labelValues = append(labelValues, l.Value)
|
||||
}
|
||||
|
@ -118,10 +139,48 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
|
|||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil
|
||||
}
|
||||
|
||||
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) {
|
||||
// getFieldByLabel returns the related field value for a given label
|
||||
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) {
|
||||
var labelVal string
|
||||
switch label {
|
||||
case agentmetrics.LabelWorkspaceName:
|
||||
labelVal = am.workspaceName
|
||||
case agentmetrics.LabelTemplateName:
|
||||
labelVal = am.templateName
|
||||
case agentmetrics.LabelAgentName:
|
||||
labelVal = am.agentName
|
||||
case agentmetrics.LabelUsername:
|
||||
labelVal = am.username
|
||||
default:
|
||||
return "", xerrors.Errorf("unexpected label: %q", label)
|
||||
}
|
||||
|
||||
return labelVal, nil
|
||||
}
|
||||
|
||||
func (am *annotatedMetric) shallowCopy() annotatedMetric {
|
||||
stats := &agentproto.Stats_Metric{
|
||||
Name: am.Name,
|
||||
Type: am.Type,
|
||||
Value: am.Value,
|
||||
Labels: am.Labels,
|
||||
}
|
||||
|
||||
return annotatedMetric{
|
||||
Stats_Metric: stats,
|
||||
username: am.username,
|
||||
workspaceName: am.workspaceName,
|
||||
agentName: am.agentName,
|
||||
templateName: am.templateName,
|
||||
expiryDate: am.expiryDate,
|
||||
}
|
||||
}
|
||||
|
||||
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) {
|
||||
metricsCleanupInterval := defaultMetricsCleanupInterval
|
||||
if duration > 0 {
|
||||
metricsCleanupInterval = duration
|
||||
|
@ -174,9 +233,66 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer,
|
|||
storeSizeGauge: storeSizeGauge,
|
||||
updateHistogram: updateHistogram,
|
||||
cleanupHistogram: cleanupHistogram,
|
||||
|
||||
aggregateByLabels: aggregateByLabels,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels.
|
||||
type labelAggregator struct {
|
||||
aggregations map[string]float64
|
||||
metrics map[string]annotatedMetric
|
||||
}
|
||||
|
||||
func newLabelAggregator(size int) *labelAggregator {
|
||||
return &labelAggregator{
|
||||
aggregations: make(map[string]float64, size),
|
||||
metrics: make(map[string]annotatedMetric, size),
|
||||
}
|
||||
}
|
||||
|
||||
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error {
|
||||
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering.
|
||||
labelSet := make(model.LabelSet, len(labels))
|
||||
|
||||
for _, label := range labels {
|
||||
val, err := am.getFieldByLabel(label)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
labelSet[model.LabelName(label)] = model.LabelValue(val)
|
||||
}
|
||||
|
||||
// Memoize based on the metric name & the unique combination of labels.
|
||||
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint())
|
||||
|
||||
// Aggregate the value based on the key.
|
||||
a.aggregations[key] += am.Value
|
||||
|
||||
metric, found := a.metrics[key]
|
||||
if !found {
|
||||
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers.
|
||||
metric = am.shallowCopy()
|
||||
}
|
||||
|
||||
// Store the metric.
|
||||
metric.aggregateByLabels = labels
|
||||
metric.Value = a.aggregations[key]
|
||||
|
||||
a.metrics[key] = metric
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (a *labelAggregator) listMetrics() []annotatedMetric {
|
||||
var out []annotatedMetric
|
||||
for _, am := range a.metrics {
|
||||
out = append(out, am)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (ma *MetricsAggregator) Run(ctx context.Context) func() {
|
||||
ctx, cancelFunc := context.WithCancel(ctx)
|
||||
done := make(chan struct{})
|
||||
|
@ -216,8 +332,38 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
|
|||
case outputCh := <-ma.collectCh:
|
||||
ma.log.Debug(ctx, "collect metrics")
|
||||
|
||||
var input []annotatedMetric
|
||||
output := make([]prometheus.Metric, 0, len(ma.store))
|
||||
|
||||
if len(ma.aggregateByLabels) == 0 {
|
||||
ma.aggregateByLabels = agentmetrics.LabelAll
|
||||
}
|
||||
|
||||
// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation.
|
||||
// This results in higher cardinality, but may be desirable in larger deployments.
|
||||
//
|
||||
// Default behavior.
|
||||
if len(ma.aggregateByLabels) == len(agentmetrics.LabelAll) {
|
||||
for _, m := range ma.store {
|
||||
// Aggregate by all available labels.
|
||||
m.aggregateByLabels = defaultAgentMetricsLabels
|
||||
input = append(input, m)
|
||||
}
|
||||
} else {
|
||||
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated
|
||||
// metrics because we cannot register multiple metric series with the same labels.
|
||||
la := newLabelAggregator(len(ma.store))
|
||||
|
||||
for _, m := range ma.store {
|
||||
if err := la.aggregate(m, ma.aggregateByLabels); err != nil {
|
||||
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
input = la.listMetrics()
|
||||
}
|
||||
|
||||
for _, m := range input {
|
||||
promMetric, err := m.asPrometheus()
|
||||
if err != nil {
|
||||
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err))
|
||||
|
@ -225,6 +371,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
|
|||
}
|
||||
output = append(output, promMetric)
|
||||
}
|
||||
|
||||
outputCh <- output
|
||||
close(outputCh)
|
||||
case <-cleanupTicker.C:
|
||||
|
@ -260,7 +407,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
|
|||
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
|
||||
}
|
||||
|
||||
var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel}
|
||||
var defaultAgentMetricsLabels = []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}
|
||||
|
||||
// AgentMetricLabels are the labels used to decorate an agent's metrics.
|
||||
// This list should match the list of labels in agentMetricsLabels.
|
||||
|
|
|
@ -2,6 +2,7 @@ package prometheusmetrics_test
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
|
@ -14,6 +15,7 @@ import (
|
|||
"github.com/stretchr/testify/require"
|
||||
|
||||
"cdr.dev/slog/sloggers/slogtest"
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
|
||||
agentproto "github.com/coder/coder/v2/agent/proto"
|
||||
"github.com/coder/coder/v2/coderd/prometheusmetrics"
|
||||
|
@ -40,7 +42,7 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) {
|
|||
|
||||
// given
|
||||
registry := prometheus.NewRegistry()
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour) // time.Hour, so metrics won't expire
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, nil) // time.Hour, so metrics won't expire
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
|
@ -93,54 +95,54 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) {
|
|||
}
|
||||
|
||||
commonLabels := []*agentproto.Stats_Metric_Label{
|
||||
{Name: "agent_name", Value: testAgentName},
|
||||
{Name: "username", Value: testUsername},
|
||||
{Name: "workspace_name", Value: testWorkspaceName},
|
||||
{Name: "template_name", Value: testTemplateName},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}
|
||||
expected := []*agentproto.Stats_Metric{
|
||||
{Name: "a_counter_one", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
|
||||
{Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: -9, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "agent_name", Value: testAgentName},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: "lizz", Value: "rizz"},
|
||||
{Name: "username", Value: testUsername},
|
||||
{Name: "workspace_name", Value: testWorkspaceName},
|
||||
{Name: "template_name", Value: testTemplateName},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}},
|
||||
{Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: 4, Labels: commonLabels},
|
||||
{Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 2, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "agent_name", Value: testAgentName},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: "foobar", Value: "Foobaz"},
|
||||
{Name: "hello", Value: "world"},
|
||||
{Name: "username", Value: testUsername},
|
||||
{Name: "workspace_name", Value: testWorkspaceName},
|
||||
{Name: "template_name", Value: testTemplateName},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}},
|
||||
{Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 5, Labels: commonLabels},
|
||||
{Name: "d_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 6, Labels: commonLabels},
|
||||
{Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 17, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "agent_name", Value: testAgentName},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: "cat", Value: "do,=g"},
|
||||
{Name: "hello", Value: "wo,,rld"},
|
||||
{Name: "username", Value: testUsername},
|
||||
{Name: "workspace_name", Value: testWorkspaceName},
|
||||
{Name: "template_name", Value: testTemplateName},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}},
|
||||
{Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 15, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "agent_name", Value: testAgentName},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: "foobar", Value: "Foo,ba=z"},
|
||||
{Name: "halo", Value: "wor\\,d=1,e=\\,2"},
|
||||
{Name: "hello", Value: "wo,,r=d"},
|
||||
{Name: "username", Value: testUsername},
|
||||
{Name: "workspace_name", Value: testWorkspaceName},
|
||||
{Name: "template_name", Value: testTemplateName},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}},
|
||||
{Name: "f_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 8, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "agent_name", Value: testAgentName},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: "foobar", Value: "foobaz"},
|
||||
{Name: "username", Value: testUsername},
|
||||
{Name: "workspace_name", Value: testWorkspaceName},
|
||||
{Name: "template_name", Value: testTemplateName},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}},
|
||||
}
|
||||
|
||||
|
@ -175,6 +177,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a
|
|||
return false
|
||||
}
|
||||
|
||||
// ensure stable iteration order
|
||||
sort.Slice(expected, func(i, j int) bool {
|
||||
return expected[i].Name < expected[j].Name
|
||||
})
|
||||
|
||||
sort.Slice(actual, func(i, j int) bool {
|
||||
m1 := prometheusMetricToString(t, actual[i])
|
||||
m2 := prometheusMetricToString(t, actual[j])
|
||||
|
@ -199,9 +206,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a
|
|||
|
||||
dtoLabels := asMetricAgentLabels(d.GetLabel())
|
||||
// dto labels are sorted in alphabetical order.
|
||||
sort.Slice(e.Labels, func(i, j int) bool {
|
||||
sortFn := func(i, j int) bool {
|
||||
return e.Labels[i].Name < e.Labels[j].Name
|
||||
})
|
||||
}
|
||||
sort.Slice(e.Labels, sortFn)
|
||||
sort.Slice(dtoLabels, sortFn)
|
||||
require.Equal(t, e.Labels, dtoLabels, d.String())
|
||||
}
|
||||
return true
|
||||
|
@ -253,7 +262,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) {
|
|||
|
||||
// given
|
||||
registry := prometheus.NewRegistry()
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond)
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, agentmetrics.LabelAll)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
|
@ -291,18 +300,354 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) {
|
|||
}, testutil.WaitShort, testutil.IntervalFast)
|
||||
}
|
||||
|
||||
func TestLabelsAggregation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
type statCollection struct {
|
||||
labels prometheusmetrics.AgentMetricLabels
|
||||
metrics []*agentproto.Stats_Metric
|
||||
}
|
||||
|
||||
commonLabels := []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
{Name: agentmetrics.LabelAgentName, Value: testAgentName},
|
||||
{Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: testTemplateName},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
given []statCollection
|
||||
expected []*agentproto.Stats_Metric
|
||||
aggregateOn []string
|
||||
}{
|
||||
{
|
||||
name: "label aggregations not specified, keep all (high cardinality, default behavior)",
|
||||
aggregateOn: agentmetrics.LabelAll,
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: testLabels,
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: testLabels,
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
|
||||
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4, Labels: commonLabels},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Scenario: 2 users are using the same agent and we've configured the deployment to aggregate on the "agent_name" label.
|
||||
name: "single label aggregation, aggregating to single metric",
|
||||
aggregateOn: []string{agentmetrics.LabelAgentName},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user1",
|
||||
AgentName: "agent1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user2",
|
||||
AgentName: "agent1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
// We only observed one agent_name value, so all metrics are aggregated to a single series.
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Scenario: as above, but we're aggregating on two invariant labels.
|
||||
name: "multiple label aggregation, aggregating to single metric",
|
||||
aggregateOn: []string{agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user1",
|
||||
AgentName: "agent1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user2",
|
||||
AgentName: "agent1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
// We only observed one agent_name & template_name tuple, so all metrics are aggregated to a single series.
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Scenario: aggregating on a label which is unique across all metrics.
|
||||
name: "single label aggregation, aggregating to multiple metrics",
|
||||
aggregateOn: []string{agentmetrics.LabelUsername},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user1",
|
||||
AgentName: "agent1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user2",
|
||||
AgentName: "agent1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
// We observed two unique username values, and therefore we have a metric for each.
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelUsername, Value: "user1"},
|
||||
}},
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelUsername, Value: "user2"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Scenario: aggregating on a label which is unique across all metrics, plus two invariant labels.
|
||||
name: "multiple label aggregation, aggregating to multiple metrics",
|
||||
aggregateOn: []string{agentmetrics.LabelUsername, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user1",
|
||||
AgentName: "agent1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "user2",
|
||||
AgentName: "agent1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
// We observed two unique username values, and therefore we have a metric for each.
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelUsername, Value: "user1"},
|
||||
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
|
||||
}},
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelUsername, Value: "user2"},
|
||||
{Name: agentmetrics.LabelAgentName, Value: "agent1"},
|
||||
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "extra labels are retained, even with label aggregations",
|
||||
aggregateOn: []string{agentmetrics.LabelUsername},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: testLabels,
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: testLabels,
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "lizz", Value: "rizz"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
}},
|
||||
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: "lizz", Value: "rizz"},
|
||||
{Name: agentmetrics.LabelUsername, Value: testUsername},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Both counters and gauges should have all their values summed to produce the correct output.
|
||||
name: "counters & gauges behave identically",
|
||||
aggregateOn: []string{agentmetrics.LabelTemplateName},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "username1",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 3},
|
||||
},
|
||||
},
|
||||
{
|
||||
labels: prometheusmetrics.AgentMetricLabels{
|
||||
Username: "username2",
|
||||
TemplateName: "template1",
|
||||
},
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 2},
|
||||
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
|
||||
}},
|
||||
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
|
||||
{Name: agentmetrics.LabelTemplateName, Value: "template1"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Scenario: validation fails and an invalid label is selected for aggregation.
|
||||
name: "invalid label aggregation",
|
||||
aggregateOn: []string{"nonsense"},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: testLabels,
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
},
|
||||
// Nothing will be returned.
|
||||
expected: []*agentproto.Stats_Metric{},
|
||||
},
|
||||
{
|
||||
// Scenario: validation fails and an empty list is given for aggregation.
|
||||
name: "empty label aggregation list",
|
||||
aggregateOn: []string{},
|
||||
given: []statCollection{
|
||||
{
|
||||
labels: testLabels,
|
||||
metrics: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
|
||||
},
|
||||
},
|
||||
},
|
||||
// Default aggregation will be used.
|
||||
expected: []*agentproto.Stats_Metric{
|
||||
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// given
|
||||
registry := prometheus.NewRegistry()
|
||||
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, tc.aggregateOn) // time.Hour, so metrics won't expire
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancelFunc)
|
||||
|
||||
closeFunc := metricsAggregator.Run(ctx)
|
||||
t.Cleanup(closeFunc)
|
||||
|
||||
// when
|
||||
for _, sc := range tc.given {
|
||||
metricsAggregator.Update(ctx, sc.labels, sc.metrics)
|
||||
}
|
||||
|
||||
// then
|
||||
require.Eventually(t, func() bool {
|
||||
var actual []prometheus.Metric
|
||||
metricsCh := make(chan prometheus.Metric)
|
||||
|
||||
done := make(chan struct{}, 1)
|
||||
defer close(done)
|
||||
go func() {
|
||||
for m := range metricsCh {
|
||||
actual = append(actual, m)
|
||||
}
|
||||
done <- struct{}{}
|
||||
}()
|
||||
metricsAggregator.Collect(metricsCh)
|
||||
close(metricsCh)
|
||||
<-done
|
||||
return verifyCollectedMetrics(t, tc.expected, actual)
|
||||
}, testutil.WaitMedium, testutil.IntervalSlow)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Benchmark_MetricsAggregator_Run(b *testing.B) {
|
||||
benchmarkRunner(b, agentmetrics.LabelAll)
|
||||
}
|
||||
|
||||
func Benchmark_MetricsAggregator_RunWithAggregations(b *testing.B) {
|
||||
for i := 1; i <= len(agentmetrics.LabelAll); i++ {
|
||||
b.Run(fmt.Sprintf("%d labels", i), func(b *testing.B) {
|
||||
benchmarkRunner(b, agentmetrics.LabelAll[0:i])
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkRunner(b *testing.B, aggregateByLabels []string) {
|
||||
b.ReportAllocs()
|
||||
|
||||
// Number of metrics to generate and send in each iteration.
|
||||
// Hard-coded to 1024 to avoid overflowing the queue in the metrics aggregator.
|
||||
numMetrics := 1024
|
||||
|
||||
// given
|
||||
registry := prometheus.NewRegistry()
|
||||
metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(
|
||||
slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}),
|
||||
registry,
|
||||
time.Hour,
|
||||
))
|
||||
metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, aggregateByLabels))
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
b.Cleanup(cancelFunc)
|
||||
|
|
|
@ -10,27 +10,20 @@ import (
|
|||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"tailscale.com/tailcfg"
|
||||
|
||||
"cdr.dev/slog"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
"github.com/coder/coder/v2/tailnet"
|
||||
)
|
||||
|
||||
const (
|
||||
templateNameLabel = "template_name"
|
||||
agentNameLabel = "agent_name"
|
||||
usernameLabel = "username"
|
||||
workspaceNameLabel = "workspace_name"
|
||||
)
|
||||
|
||||
// ActiveUsers tracks the number of users that have authenticated within the past hour.
|
||||
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
||||
if duration == 0 {
|
||||
|
@ -156,7 +149,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
|||
Subsystem: "agents",
|
||||
Name: "up",
|
||||
Help: "The number of active agents per workspace.",
|
||||
}, []string{usernameLabel, workspaceNameLabel, templateNameLabel, "template_version"}))
|
||||
}, []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelTemplateName, "template_version"}))
|
||||
err := registerer.Register(agentsGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -167,7 +160,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
|||
Subsystem: "agents",
|
||||
Name: "connections",
|
||||
Help: "Agent connections with statuses.",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "status", "lifecycle_state", "tailnet_node"}))
|
||||
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "status", "lifecycle_state", "tailnet_node"}))
|
||||
err = registerer.Register(agentsConnectionsGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -178,7 +171,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
|||
Subsystem: "agents",
|
||||
Name: "connection_latencies_seconds",
|
||||
Help: "Agent connection latencies in seconds.",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "derp_region", "preferred"}))
|
||||
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "derp_region", "preferred"}))
|
||||
err = registerer.Register(agentsConnectionLatenciesGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -189,7 +182,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
|||
Subsystem: "agents",
|
||||
Name: "apps",
|
||||
Help: "Agent applications with statuses.",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "app_name", "health"}))
|
||||
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "app_name", "health"}))
|
||||
err = registerer.Register(agentsAppsGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -335,11 +328,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
|||
}, nil
|
||||
}
|
||||
|
||||
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) {
|
||||
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
|
||||
if duration == 0 {
|
||||
duration = 1 * time.Minute
|
||||
}
|
||||
|
||||
if len(aggregateByLabels) == 0 {
|
||||
aggregateByLabels = agentmetrics.LabelAgentStats
|
||||
}
|
||||
|
||||
aggregateByLabels = filterAcceptableAgentLabels(aggregateByLabels)
|
||||
|
||||
metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||
Namespace: "coderd",
|
||||
Subsystem: "prometheusmetrics",
|
||||
|
@ -357,7 +356,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "tx_bytes",
|
||||
Help: "Agent Tx bytes",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsTxBytesGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -368,7 +367,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "rx_bytes",
|
||||
Help: "Agent Rx bytes",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsRxBytesGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -379,7 +378,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "connection_count",
|
||||
Help: "The number of established connections by agent",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsConnectionCountGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -390,7 +389,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "connection_median_latency_seconds",
|
||||
Help: "The median agent connection latency in seconds",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsConnectionMedianLatencyGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -401,7 +400,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "session_count_jetbrains",
|
||||
Help: "The number of session established by JetBrains",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsSessionCountJetBrainsGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -412,7 +411,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "session_count_reconnecting_pty",
|
||||
Help: "The number of session established by reconnecting PTY",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -423,7 +422,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "session_count_ssh",
|
||||
Help: "The number of session established by SSH",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsSessionCountSSHGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -434,7 +433,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
Subsystem: "agentstats",
|
||||
Name: "session_count_vscode",
|
||||
Help: "The number of session established by VSCode",
|
||||
}, []string{agentNameLabel, usernameLabel, workspaceNameLabel}))
|
||||
}, aggregateByLabels))
|
||||
err = registerer.Register(agentStatsSessionCountVSCodeGauge)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -466,16 +465,28 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
logger.Error(ctx, "can't get agent stats", slog.Error(err))
|
||||
} else {
|
||||
for _, agentStat := range stats {
|
||||
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
var labelValues []string
|
||||
for _, label := range aggregateByLabels {
|
||||
switch label {
|
||||
case agentmetrics.LabelUsername:
|
||||
labelValues = append(labelValues, agentStat.Username)
|
||||
case agentmetrics.LabelWorkspaceName:
|
||||
labelValues = append(labelValues, agentStat.WorkspaceName)
|
||||
case agentmetrics.LabelAgentName:
|
||||
labelValues = append(labelValues, agentStat.AgentName)
|
||||
}
|
||||
}
|
||||
|
||||
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...)
|
||||
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...)
|
||||
|
||||
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
|
||||
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...)
|
||||
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...)
|
||||
|
||||
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...)
|
||||
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...)
|
||||
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...)
|
||||
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...)
|
||||
}
|
||||
|
||||
if len(stats) > 0 {
|
||||
|
@ -504,3 +515,17 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
|
|||
<-done
|
||||
}, nil
|
||||
}
|
||||
|
||||
// filterAcceptableAgentLabels handles a slightly messy situation whereby `prometheus-aggregate-agent-stats-by` can control on
|
||||
// which labels agent stats are aggregated, but for these specific metrics in this file there is no `template` label value,
|
||||
// and therefore we have to exclude it from the list of acceptable labels.
|
||||
func filterAcceptableAgentLabels(labels []string) []string {
|
||||
out := make([]string, 0, len(labels))
|
||||
for _, label := range labels {
|
||||
if label != agentmetrics.LabelTemplateName {
|
||||
out = append(out, label)
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
package prometheusmetrics
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
)
|
||||
|
||||
func TestFilterAcceptableAgentLabels(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input []string
|
||||
expected []string
|
||||
}{
|
||||
{
|
||||
name: "template label is ignored",
|
||||
input: []string{agentmetrics.LabelTemplateName},
|
||||
expected: []string{},
|
||||
},
|
||||
{
|
||||
name: "all other labels are returned",
|
||||
input: agentmetrics.LabelAll,
|
||||
expected: []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
require.Equal(t, tc.expected, filterAcceptableAgentLabels(tc.input))
|
||||
})
|
||||
}
|
||||
}
|
|
@ -11,10 +11,6 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/batchstats"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
@ -24,10 +20,14 @@ import (
|
|||
"cdr.dev/slog"
|
||||
"cdr.dev/slog/sloggers/slogtest"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
"github.com/coder/coder/v2/coderd/batchstats"
|
||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbgen"
|
||||
"github.com/coder/coder/v2/coderd/database/dbmem"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
"github.com/coder/coder/v2/coderd/prometheusmetrics"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
"github.com/coder/coder/v2/codersdk/agentsdk"
|
||||
|
@ -451,7 +451,7 @@ func TestAgentStats(t *testing.T) {
|
|||
// and it doesn't depend on the real time.
|
||||
closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{
|
||||
IgnoreErrors: true,
|
||||
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond)
|
||||
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond, agentmetrics.LabelAll)
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(closeFunc)
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
@ -18,6 +19,7 @@ import (
|
|||
|
||||
"github.com/coder/coder/v2/buildinfo"
|
||||
"github.com/coder/coder/v2/cli/clibase"
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
|
||||
)
|
||||
|
||||
|
@ -259,6 +261,7 @@ type PrometheusConfig struct {
|
|||
Address clibase.HostPort `json:"address" typescript:",notnull"`
|
||||
CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"`
|
||||
CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"`
|
||||
AggregateAgentStatsBy clibase.StringArray `json:"aggregate_agent_stats_by" typescript:",notnull"`
|
||||
}
|
||||
|
||||
type PprofConfig struct {
|
||||
|
@ -942,6 +945,22 @@ when required by your organization's security policy.`,
|
|||
Group: &deploymentGroupIntrospectionPrometheus,
|
||||
YAML: "collect_agent_stats",
|
||||
},
|
||||
{
|
||||
Name: "Prometheus Aggregate Agent Stats By",
|
||||
Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %s.", strings.Join(agentmetrics.LabelAll, ", ")),
|
||||
Flag: "prometheus-aggregate-agent-stats-by",
|
||||
Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY",
|
||||
Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error {
|
||||
if value == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return agentmetrics.ValidateAggregationLabels(value.Value())
|
||||
}),
|
||||
Group: &deploymentGroupIntrospectionPrometheus,
|
||||
YAML: "aggregate_agent_stats_by",
|
||||
Default: strings.Join(agentmetrics.LabelAll, ","),
|
||||
},
|
||||
{
|
||||
Name: "Prometheus Collect Database Metrics",
|
||||
Description: "Collect database metrics (may increase charges for metrics storage).",
|
||||
|
|
|
@ -317,6 +317,7 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \
|
|||
"host": "string",
|
||||
"port": "string"
|
||||
},
|
||||
"aggregate_agent_stats_by": ["string"],
|
||||
"collect_agent_stats": true,
|
||||
"collect_db_metrics": true,
|
||||
"enable": true
|
||||
|
|
|
@ -2786,6 +2786,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
|
|||
"host": "string",
|
||||
"port": "string"
|
||||
},
|
||||
"aggregate_agent_stats_by": ["string"],
|
||||
"collect_agent_stats": true,
|
||||
"collect_db_metrics": true,
|
||||
"enable": true
|
||||
|
@ -3154,6 +3155,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
|
|||
"host": "string",
|
||||
"port": "string"
|
||||
},
|
||||
"aggregate_agent_stats_by": ["string"],
|
||||
"collect_agent_stats": true,
|
||||
"collect_db_metrics": true,
|
||||
"enable": true
|
||||
|
@ -4783,6 +4785,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
|
|||
"host": "string",
|
||||
"port": "string"
|
||||
},
|
||||
"aggregate_agent_stats_by": ["string"],
|
||||
"collect_agent_stats": true,
|
||||
"collect_db_metrics": true,
|
||||
"enable": true
|
||||
|
@ -4792,8 +4795,9 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
|
|||
### Properties
|
||||
|
||||
| Name | Type | Required | Restrictions | Description |
|
||||
| --------------------- | ------------------------------------ | -------- | ------------ | ----------- |
|
||||
| -------------------------- | ------------------------------------ | -------- | ------------ | ----------- |
|
||||
| `address` | [clibase.HostPort](#clibasehostport) | false | | |
|
||||
| `aggregate_agent_stats_by` | array of string | false | | |
|
||||
| `collect_agent_stats` | boolean | false | | |
|
||||
| `collect_db_metrics` | boolean | false | | |
|
||||
| `enable` | boolean | false | | |
|
||||
|
|
|
@ -742,6 +742,17 @@ URL of a PostgreSQL database. If empty, PostgreSQL binaries will be downloaded f
|
|||
|
||||
The bind address to serve prometheus metrics.
|
||||
|
||||
### --prometheus-aggregate-agent-stats-by
|
||||
|
||||
| | |
|
||||
| ----------- | -------------------------------------------------------------- |
|
||||
| Type | <code>string-array</code> |
|
||||
| Environment | <code>$CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY</code> |
|
||||
| YAML | <code>introspection.prometheus.aggregate_agent_stats_by</code> |
|
||||
| Default | <code>agent_name,template_name,username,workspace_name</code> |
|
||||
|
||||
When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are agent_name, template_name, username, workspace_name.
|
||||
|
||||
### --prometheus-collect-agent-stats
|
||||
|
||||
| | |
|
||||
|
|
|
@ -124,6 +124,11 @@ INTROSPECTION / PROMETHEUS OPTIONS:
|
|||
--prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112)
|
||||
The bind address to serve prometheus metrics.
|
||||
|
||||
--prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name)
|
||||
When collecting agent stats, aggregate metrics by a given set of
|
||||
comma-separated labels to reduce cardinality. Accepted values are
|
||||
agent_name, template_name, username, workspace_name.
|
||||
|
||||
--prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS
|
||||
Collect agent stats (may increase charges for metrics storage).
|
||||
|
||||
|
|
|
@ -925,6 +925,7 @@ export interface PrometheusConfig {
|
|||
readonly address: string;
|
||||
readonly collect_agent_stats: boolean;
|
||||
readonly collect_db_metrics: boolean;
|
||||
readonly aggregate_agent_stats_by: string[];
|
||||
}
|
||||
|
||||
// From codersdk/deployment.go
|
||||
|
|
Loading…
Reference in New Issue