From c7f52b73bb95d1afc5b1178eb54e88ba09930254 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 5 Feb 2024 23:57:18 -0600 Subject: [PATCH] feat(coderd): add prometheus metrics to servertailnet (#11988) --- coderd/coderd.go | 6 +- coderd/database/pubsub/pubsub_test.go | 79 +++++++-------------------- coderd/tailnet.go | 54 +++++++++++++++++- coderd/tailnet_test.go | 38 +++++++++++++ testutil/prometheus.go | 50 +++++++++++++++++ 5 files changed, 165 insertions(+), 62 deletions(-) create mode 100644 testutil/prometheus.go diff --git a/coderd/coderd.go b/coderd/coderd.go index 9d640e4b01..94864971de 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -472,7 +472,7 @@ func New(options *Options) *API { api.Auditor.Store(&options.Auditor) api.TailnetCoordinator.Store(&options.TailnetCoordinator) - api.agentProvider, err = NewServerTailnet(api.ctx, + stn, err := NewServerTailnet(api.ctx, options.Logger, options.DERPServer, api.DERPMap, @@ -485,6 +485,10 @@ func New(options *Options) *API { if err != nil { panic("failed to setup server tailnet: " + err.Error()) } + api.agentProvider = stn + if options.DeploymentValues.Prometheus.Enable { + options.PrometheusRegistry.MustRegister(stn) + } api.TailnetClientService, err = tailnet.NewClientService( api.Logger.Named("tailnetclient"), &api.TailnetCoordinator, diff --git a/coderd/database/pubsub/pubsub_test.go b/coderd/database/pubsub/pubsub_test.go index e4012ad8ed..9b2a495aeb 100644 --- a/coderd/database/pubsub/pubsub_test.go +++ b/coderd/database/pubsub/pubsub_test.go @@ -6,7 +6,6 @@ import ( "testing" "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) { metrics, err := registry.Gather() require.NoError(t, err) - require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events")) - require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers")) + require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events")) + require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers")) event := "test" data := "testing" @@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) { require.Eventually(t, func() bool { metrics, err = registry.Gather() assert.NoError(t, err) - return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && - gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") && - gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && - counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") && - counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") && - counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && - counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") && - counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total") + return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && + testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") && + testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total") }, testutil.WaitShort, testutil.IntervalFast) colossalData := make([]byte, 7600) @@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) { require.Eventually(t, func() bool { metrics, err = registry.Gather() assert.NoError(t, err) - return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && - gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") && - gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && - counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") && - counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") && - counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && - counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") && - counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") && - counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total") + return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") && + testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") && + testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") && + testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") && + testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") && + testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total") }, testutil.WaitShort, testutil.IntervalFast) } - -func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { - t.Helper() - for _, family := range metrics { - if family.GetName() != name { - continue - } - ms := family.GetMetric() - for _, m := range ms { - require.Equal(t, len(label), len(m.GetLabel())) - for i, lv := range label { - if lv != m.GetLabel()[i].GetValue() { - continue - } - } - return value == m.GetGauge().GetValue() - } - } - return false -} - -func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { - t.Helper() - for _, family := range metrics { - if family.GetName() != name { - continue - } - ms := family.GetMetric() - for _, m := range ms { - require.Equal(t, len(label), len(m.GetLabel())) - for i, lv := range label { - if lv != m.GetLabel()[i].GetValue() { - continue - } - } - return value == m.GetCounter().GetValue() - } - } - return false -} diff --git a/coderd/tailnet.go b/coderd/tailnet.go index fed86ab5ae..74b821deb8 100644 --- a/coderd/tailnet.go +++ b/coderd/tailnet.go @@ -14,6 +14,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "go.opentelemetry.io/otel/trace" "golang.org/x/xerrors" "tailscale.com/derp" @@ -97,6 +98,18 @@ func NewServerTailnet( agentConnectionTimes: map[uuid.UUID]time.Time{}, agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{}, transport: tailnetTransport.Clone(), + connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coder", + Subsystem: "servertailnet", + Name: "open_connections", + Help: "Total number of TCP connections currently open to workspace agents.", + }, []string{"network"}), + totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coder", + Subsystem: "servertailnet", + Name: "connections_total", + Help: "Total number of TCP connections made to workspace agents.", + }, []string{"network"}), } tn.transport.DialContext = tn.dialContext // These options are mostly just picked at random, and they can likely be @@ -170,6 +183,16 @@ func NewServerTailnet( return tn, nil } +func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) { + s.connsPerAgent.Describe(descs) + s.totalConns.Describe(descs) +} + +func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) { + s.connsPerAgent.Collect(metrics) + s.totalConns.Collect(metrics) +} + func (s *ServerTailnet) expireOldAgents() { const ( tick = 5 * time.Minute @@ -304,6 +327,9 @@ type ServerTailnet struct { agentTickets map[uuid.UUID]map[uuid.UUID]struct{} transport *http.Transport + + connsPerAgent *prometheus.GaugeVec + totalConns *prometheus.CounterVec } func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy { @@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) ( return nil, xerrors.Errorf("no agent id attached") } - return s.DialAgentNetConn(ctx, agentID, network, addr) + nc, err := s.DialAgentNetConn(ctx, agentID, network, addr) + if err != nil { + return nil, err + } + + s.connsPerAgent.WithLabelValues("tcp").Inc() + s.totalConns.WithLabelValues("tcp").Inc() + return &instrumentedConn{ + Conn: nc, + agentID: agentID, + connsPerAgent: s.connsPerAgent, + }, nil } func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error { @@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error { <-s.derpMapUpdaterClosed return nil } + +type instrumentedConn struct { + net.Conn + + agentID uuid.UUID + closeOnce sync.Once + connsPerAgent *prometheus.GaugeVec +} + +func (c *instrumentedConn) Close() error { + c.closeOnce.Do(func() { + c.connsPerAgent.WithLabelValues("tcp").Dec() + }) + return c.Conn.Close() +} diff --git a/coderd/tailnet_test.go b/coderd/tailnet_test.go index cffe818424..73ccba701b 100644 --- a/coderd/tailnet_test.go +++ b/coderd/tailnet_test.go @@ -13,6 +13,7 @@ import ( "testing" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -79,6 +80,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) { assert.Equal(t, http.StatusOK, res.StatusCode) }) + t.Run("Metrics", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + agents, serverTailnet := setupServerTailnetAgent(t, 1) + a := agents[0] + + registry := prometheus.NewRegistry() + require.NoError(t, registry.Register(serverTailnet)) + + u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort)) + require.NoError(t, err) + + rp := serverTailnet.ReverseProxy(u, u, a.id) + + rw := httptest.NewRecorder() + req := httptest.NewRequest( + http.MethodGet, + u.String(), + nil, + ).WithContext(ctx) + + rp.ServeHTTP(rw, req) + res := rw.Result() + defer res.Body.Close() + + assert.Equal(t, http.StatusOK, res.StatusCode) + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_connections_total", "tcp") && + testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_connections", "tcp") + }, testutil.WaitShort, testutil.IntervalFast) + }) + t.Run("HostRewrite", func(t *testing.T) { t.Parallel() diff --git a/testutil/prometheus.go b/testutil/prometheus.go new file mode 100644 index 0000000000..3d4879c14c --- /dev/null +++ b/testutil/prometheus.go @@ -0,0 +1,50 @@ +package testutil + +import ( + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" +) + +func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { + t.Helper() + for _, family := range metrics { + if family.GetName() != name { + continue + } + ms := family.GetMetric() + metricsLoop: + for _, m := range ms { + require.Equal(t, len(label), len(m.GetLabel())) + for i, lv := range label { + if lv != m.GetLabel()[i].GetValue() { + continue metricsLoop + } + } + return value == m.GetGauge().GetValue() + } + } + return false +} + +func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { + t.Helper() + for _, family := range metrics { + if family.GetName() != name { + continue + } + ms := family.GetMetric() + metricsLoop: + for _, m := range ms { + require.Equal(t, len(label), len(m.GetLabel())) + for i, lv := range label { + if lv != m.GetLabel()[i].GetValue() { + continue metricsLoop + } + } + return value == m.GetCounter().GetValue() + } + } + return false +}