feat(coderd): add prometheus metrics to servertailnet (#11988)

This commit is contained in:
Colin Adler 2024-02-05 23:57:18 -06:00 committed by GitHub
parent c84a637116
commit c7f52b73bb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 165 additions and 62 deletions

View File

@ -472,7 +472,7 @@ func New(options *Options) *API {
api.Auditor.Store(&options.Auditor)
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
api.agentProvider, err = NewServerTailnet(api.ctx,
stn, err := NewServerTailnet(api.ctx,
options.Logger,
options.DERPServer,
api.DERPMap,
@ -485,6 +485,10 @@ func New(options *Options) *API {
if err != nil {
panic("failed to setup server tailnet: " + err.Error())
}
api.agentProvider = stn
if options.DeploymentValues.Prometheus.Enable {
options.PrometheusRegistry.MustRegister(stn)
}
api.TailnetClientService, err = tailnet.NewClientService(
api.Logger.Named("tailnetclient"),
&api.TailnetCoordinator,

View File

@ -6,7 +6,6 @@ import (
"testing"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) {
metrics, err := registry.Gather()
require.NoError(t, err)
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
event := "test"
data := "testing"
@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
assert.NoError(t, err)
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
}, testutil.WaitShort, testutil.IntervalFast)
colossalData := make([]byte, 7600)
@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
assert.NoError(t, err)
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
}, testutil.WaitShort, testutil.IntervalFast)
}
// gaugeHasValue reports whether the metric family called name contains a
// metric whose label values equal label (compared positionally) and whose
// gauge value is exactly value.
//
// Only the first metric with matching label values is examined. The test is
// failed via require if a metric in the family carries a different number of
// labels than were supplied. It returns false when no family or metric
// matches.
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, family := range metrics {
		if family.GetName() != name {
			continue
		}
	metricsLoop:
		for _, m := range family.GetMetric() {
			require.Equal(t, len(label), len(m.GetLabel()))
			for i, lv := range label {
				if lv != m.GetLabel()[i].GetValue() {
					// A bare continue would only advance the label loop and
					// let a mismatched metric be compared below; skip the
					// whole metric instead.
					continue metricsLoop
				}
			}
			return value == m.GetGauge().GetValue()
		}
	}
	return false
}
// counterHasValue reports whether the metric family called name contains a
// metric whose label values equal label (compared positionally) and whose
// counter value is exactly value.
//
// Only the first metric with matching label values is examined. The test is
// failed via require if a metric in the family carries a different number of
// labels than were supplied. It returns false when no family or metric
// matches.
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, family := range metrics {
		if family.GetName() != name {
			continue
		}
	metricsLoop:
		for _, m := range family.GetMetric() {
			require.Equal(t, len(label), len(m.GetLabel()))
			for i, lv := range label {
				if lv != m.GetLabel()[i].GetValue() {
					// A bare continue would only advance the label loop and
					// let a mismatched metric be compared below; skip the
					// whole metric instead.
					continue metricsLoop
				}
			}
			return value == m.GetCounter().GetValue()
		}
	}
	return false
}

View File

@ -14,6 +14,7 @@ import (
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"go.opentelemetry.io/otel/trace"
"golang.org/x/xerrors"
"tailscale.com/derp"
@ -97,6 +98,18 @@ func NewServerTailnet(
agentConnectionTimes: map[uuid.UUID]time.Time{},
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
transport: tailnetTransport.Clone(),
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coder",
Subsystem: "servertailnet",
Name: "open_connections",
Help: "Total number of TCP connections currently open to workspace agents.",
}, []string{"network"}),
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coder",
Subsystem: "servertailnet",
Name: "connections_total",
Help: "Total number of TCP connections made to workspace agents.",
}, []string{"network"}),
}
tn.transport.DialContext = tn.dialContext
// These options are mostly just picked at random, and they can likely be
@ -170,6 +183,16 @@ func NewServerTailnet(
return tn, nil
}
// Describe sends the descriptors of the open-connections gauge followed by
// those of the total-connections counter to descs. Together with Collect it
// lets a ServerTailnet be registered with a Prometheus registry.
func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) {
	collectors := [...]prometheus.Collector{s.connsPerAgent, s.totalConns}
	for _, c := range collectors {
		c.Describe(descs)
	}
}
// Collect sends the current samples of the open-connections gauge followed by
// those of the total-connections counter to metrics.
func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
	collectors := [...]prometheus.Collector{s.connsPerAgent, s.totalConns}
	for _, c := range collectors {
		c.Collect(metrics)
	}
}
func (s *ServerTailnet) expireOldAgents() {
const (
tick = 5 * time.Minute
@ -304,6 +327,9 @@ type ServerTailnet struct {
agentTickets map[uuid.UUID]map[uuid.UUID]struct{}
transport *http.Transport
connsPerAgent *prometheus.GaugeVec
totalConns *prometheus.CounterVec
}
func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) (
return nil, xerrors.Errorf("no agent id attached")
}
return s.DialAgentNetConn(ctx, agentID, network, addr)
nc, err := s.DialAgentNetConn(ctx, agentID, network, addr)
if err != nil {
return nil, err
}
s.connsPerAgent.WithLabelValues("tcp").Inc()
s.totalConns.WithLabelValues("tcp").Inc()
return &instrumentedConn{
Conn: nc,
agentID: agentID,
connsPerAgent: s.connsPerAgent,
}, nil
}
func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error {
@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error {
<-s.derpMapUpdaterClosed
return nil
}
// instrumentedConn wraps a net.Conn so that closing it is reflected in the
// open-connections gauge.
type instrumentedConn struct {
	net.Conn

	// agentID is set when the connection is created; the methods shown here
	// do not read it.
	agentID uuid.UUID
	// closeOnce ensures the gauge is decremented at most once, however many
	// times Close is called.
	closeOnce sync.Once
	// connsPerAgent is the gauge decremented on the first Close.
	connsPerAgent *prometheus.GaugeVec
}

// Close decrements the "tcp" open-connections gauge the first time it is
// called, then closes the wrapped connection and returns its error. The
// wrapped connection's Close is invoked on every call.
func (c *instrumentedConn) Close() error {
	c.closeOnce.Do(c.untrack)
	return c.Conn.Close()
}

// untrack removes this connection from the "tcp" open-connections gauge.
func (c *instrumentedConn) untrack() {
	gauge := c.connsPerAgent.WithLabelValues("tcp")
	gauge.Dec()
}

View File

@ -13,6 +13,7 @@ import (
"testing"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@ -79,6 +80,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
assert.Equal(t, http.StatusOK, res.StatusCode)
})
// Metrics proxies a single request through the ServerTailnet and checks that
// its Prometheus collectors report it.
t.Run("Metrics", func(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
agents, serverTailnet := setupServerTailnetAgent(t, 1)
a := agents[0]
// Use a fresh registry so that only the ServerTailnet's metrics are gathered.
registry := prometheus.NewRegistry()
require.NoError(t, registry.Register(serverTailnet))
u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort))
require.NoError(t, err)
rp := serverTailnet.ReverseProxy(u, u, a.id)
rw := httptest.NewRecorder()
req := httptest.NewRequest(
http.MethodGet,
u.String(),
nil,
).WithContext(ctx)
rp.ServeHTTP(rw, req)
res := rw.Result()
defer res.Body.Close()
assert.Equal(t, http.StatusOK, res.StatusCode)
// Re-gather until both the connections counter and the open-connections
// gauge read 1 for the "tcp" label, or fail once testutil.WaitShort elapses.
require.Eventually(t, func() bool {
metrics, err := registry.Gather()
assert.NoError(t, err)
return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_connections_total", "tcp") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_connections", "tcp")
}, testutil.WaitShort, testutil.IntervalFast)
})
t.Run("HostRewrite", func(t *testing.T) {
t.Parallel()

50
testutil/prometheus.go Normal file
View File

@ -0,0 +1,50 @@
package testutil
import (
"testing"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
)
// PromGaugeHasValue reports whether the metric family called name contains a
// metric whose label values equal label (compared positionally) and whose
// gauge value is exactly value.
//
// Only the first metric with matching label values is examined. The test is
// failed via require if a metric in the family has a different number of
// labels than were supplied. It returns false when no family or metric
// matches.
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() == name {
			for _, metric := range fam.GetMetric() {
				require.Equal(t, len(label), len(metric.GetLabel()))
				matched := true
				for idx := range label {
					if metric.GetLabel()[idx].GetValue() != label[idx] {
						matched = false
						break
					}
				}
				if matched {
					return metric.GetGauge().GetValue() == value
				}
			}
		}
	}
	return false
}
// PromCounterHasValue reports whether the metric family called name contains
// a metric whose label values equal label (compared positionally) and whose
// counter value is exactly value.
//
// Only the first metric with matching label values is examined. The test is
// failed via require if a metric in the family has a different number of
// labels than were supplied. It returns false when no family or metric
// matches.
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() == name {
			for _, metric := range fam.GetMetric() {
				require.Equal(t, len(label), len(metric.GetLabel()))
				matched := true
				for idx := range label {
					if metric.GetLabel()[idx].GetValue() != label[idx] {
						matched = false
						break
					}
				}
				if matched {
					return metric.GetCounter().GetValue() == value
				}
			}
		}
	}
	return false
}