mirror of https://github.com/coder/coder.git
feat(coderd): add prometheus metrics to servertailnet (#11988)
This commit is contained in:
parent
c84a637116
commit
c7f52b73bb
|
@ -472,7 +472,7 @@ func New(options *Options) *API {
|
||||||
|
|
||||||
api.Auditor.Store(&options.Auditor)
|
api.Auditor.Store(&options.Auditor)
|
||||||
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
|
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
|
||||||
api.agentProvider, err = NewServerTailnet(api.ctx,
|
stn, err := NewServerTailnet(api.ctx,
|
||||||
options.Logger,
|
options.Logger,
|
||||||
options.DERPServer,
|
options.DERPServer,
|
||||||
api.DERPMap,
|
api.DERPMap,
|
||||||
|
@ -485,6 +485,10 @@ func New(options *Options) *API {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic("failed to setup server tailnet: " + err.Error())
|
panic("failed to setup server tailnet: " + err.Error())
|
||||||
}
|
}
|
||||||
|
api.agentProvider = stn
|
||||||
|
if options.DeploymentValues.Prometheus.Enable {
|
||||||
|
options.PrometheusRegistry.MustRegister(stn)
|
||||||
|
}
|
||||||
api.TailnetClientService, err = tailnet.NewClientService(
|
api.TailnetClientService, err = tailnet.NewClientService(
|
||||||
api.Logger.Named("tailnetclient"),
|
api.Logger.Named("tailnetclient"),
|
||||||
&api.TailnetCoordinator,
|
&api.TailnetCoordinator,
|
||||||
|
|
|
@ -6,7 +6,6 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
dto "github.com/prometheus/client_model/go"
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) {
|
||||||
|
|
||||||
metrics, err := registry.Gather()
|
metrics, err := registry.Gather()
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
|
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
|
||||||
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
|
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
|
||||||
|
|
||||||
event := "test"
|
event := "test"
|
||||||
data := "testing"
|
data := "testing"
|
||||||
|
@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
|
||||||
require.Eventually(t, func() bool {
|
require.Eventually(t, func() bool {
|
||||||
metrics, err = registry.Gather()
|
metrics, err = registry.Gather()
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
||||||
gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
|
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
|
||||||
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
||||||
counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
|
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
|
||||||
counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
|
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
|
||||||
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
||||||
counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
|
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
|
||||||
counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
|
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
|
||||||
}, testutil.WaitShort, testutil.IntervalFast)
|
}, testutil.WaitShort, testutil.IntervalFast)
|
||||||
|
|
||||||
colossalData := make([]byte, 7600)
|
colossalData := make([]byte, 7600)
|
||||||
|
@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
|
||||||
require.Eventually(t, func() bool {
|
require.Eventually(t, func() bool {
|
||||||
metrics, err = registry.Gather()
|
metrics, err = registry.Gather()
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
||||||
gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
|
testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
|
||||||
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
||||||
counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
|
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
|
||||||
counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
|
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
|
||||||
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
||||||
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
|
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
|
||||||
counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
|
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
|
||||||
counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
|
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
|
||||||
}, testutil.WaitShort, testutil.IntervalFast)
|
}, testutil.WaitShort, testutil.IntervalFast)
|
||||||
}
|
}
|
||||||
|
|
||||||
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
|
||||||
t.Helper()
|
|
||||||
for _, family := range metrics {
|
|
||||||
if family.GetName() != name {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ms := family.GetMetric()
|
|
||||||
for _, m := range ms {
|
|
||||||
require.Equal(t, len(label), len(m.GetLabel()))
|
|
||||||
for i, lv := range label {
|
|
||||||
if lv != m.GetLabel()[i].GetValue() {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return value == m.GetGauge().GetValue()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
|
||||||
t.Helper()
|
|
||||||
for _, family := range metrics {
|
|
||||||
if family.GetName() != name {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ms := family.GetMetric()
|
|
||||||
for _, m := range ms {
|
|
||||||
require.Equal(t, len(label), len(m.GetLabel()))
|
|
||||||
for i, lv := range label {
|
|
||||||
if lv != m.GetLabel()[i].GetValue() {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return value == m.GetCounter().GetValue()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
|
@ -14,6 +14,7 @@ import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"go.opentelemetry.io/otel/trace"
|
"go.opentelemetry.io/otel/trace"
|
||||||
"golang.org/x/xerrors"
|
"golang.org/x/xerrors"
|
||||||
"tailscale.com/derp"
|
"tailscale.com/derp"
|
||||||
|
@ -97,6 +98,18 @@ func NewServerTailnet(
|
||||||
agentConnectionTimes: map[uuid.UUID]time.Time{},
|
agentConnectionTimes: map[uuid.UUID]time.Time{},
|
||||||
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
|
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
|
||||||
transport: tailnetTransport.Clone(),
|
transport: tailnetTransport.Clone(),
|
||||||
|
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coder",
|
||||||
|
Subsystem: "servertailnet",
|
||||||
|
Name: "open_connections",
|
||||||
|
Help: "Total number of TCP connections currently open to workspace agents.",
|
||||||
|
}, []string{"network"}),
|
||||||
|
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: "coder",
|
||||||
|
Subsystem: "servertailnet",
|
||||||
|
Name: "connections_total",
|
||||||
|
Help: "Total number of TCP connections made to workspace agents.",
|
||||||
|
}, []string{"network"}),
|
||||||
}
|
}
|
||||||
tn.transport.DialContext = tn.dialContext
|
tn.transport.DialContext = tn.dialContext
|
||||||
// These options are mostly just picked at random, and they can likely be
|
// These options are mostly just picked at random, and they can likely be
|
||||||
|
@ -170,6 +183,16 @@ func NewServerTailnet(
|
||||||
return tn, nil
|
return tn, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) {
|
||||||
|
s.connsPerAgent.Describe(descs)
|
||||||
|
s.totalConns.Describe(descs)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
|
||||||
|
s.connsPerAgent.Collect(metrics)
|
||||||
|
s.totalConns.Collect(metrics)
|
||||||
|
}
|
||||||
|
|
||||||
func (s *ServerTailnet) expireOldAgents() {
|
func (s *ServerTailnet) expireOldAgents() {
|
||||||
const (
|
const (
|
||||||
tick = 5 * time.Minute
|
tick = 5 * time.Minute
|
||||||
|
@ -304,6 +327,9 @@ type ServerTailnet struct {
|
||||||
agentTickets map[uuid.UUID]map[uuid.UUID]struct{}
|
agentTickets map[uuid.UUID]map[uuid.UUID]struct{}
|
||||||
|
|
||||||
transport *http.Transport
|
transport *http.Transport
|
||||||
|
|
||||||
|
connsPerAgent *prometheus.GaugeVec
|
||||||
|
totalConns *prometheus.CounterVec
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
|
func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
|
||||||
|
@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) (
|
||||||
return nil, xerrors.Errorf("no agent id attached")
|
return nil, xerrors.Errorf("no agent id attached")
|
||||||
}
|
}
|
||||||
|
|
||||||
return s.DialAgentNetConn(ctx, agentID, network, addr)
|
nc, err := s.DialAgentNetConn(ctx, agentID, network, addr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
s.connsPerAgent.WithLabelValues("tcp").Inc()
|
||||||
|
s.totalConns.WithLabelValues("tcp").Inc()
|
||||||
|
return &instrumentedConn{
|
||||||
|
Conn: nc,
|
||||||
|
agentID: agentID,
|
||||||
|
connsPerAgent: s.connsPerAgent,
|
||||||
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error {
|
func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error {
|
||||||
|
@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error {
|
||||||
<-s.derpMapUpdaterClosed
|
<-s.derpMapUpdaterClosed
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type instrumentedConn struct {
|
||||||
|
net.Conn
|
||||||
|
|
||||||
|
agentID uuid.UUID
|
||||||
|
closeOnce sync.Once
|
||||||
|
connsPerAgent *prometheus.GaugeVec
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *instrumentedConn) Close() error {
|
||||||
|
c.closeOnce.Do(func() {
|
||||||
|
c.connsPerAgent.WithLabelValues("tcp").Dec()
|
||||||
|
})
|
||||||
|
return c.Conn.Close()
|
||||||
|
}
|
||||||
|
|
|
@ -13,6 +13,7 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/spf13/afero"
|
"github.com/spf13/afero"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
@ -79,6 +80,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
|
||||||
assert.Equal(t, http.StatusOK, res.StatusCode)
|
assert.Equal(t, http.StatusOK, res.StatusCode)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("Metrics", func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
agents, serverTailnet := setupServerTailnetAgent(t, 1)
|
||||||
|
a := agents[0]
|
||||||
|
|
||||||
|
registry := prometheus.NewRegistry()
|
||||||
|
require.NoError(t, registry.Register(serverTailnet))
|
||||||
|
|
||||||
|
u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
rp := serverTailnet.ReverseProxy(u, u, a.id)
|
||||||
|
|
||||||
|
rw := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(
|
||||||
|
http.MethodGet,
|
||||||
|
u.String(),
|
||||||
|
nil,
|
||||||
|
).WithContext(ctx)
|
||||||
|
|
||||||
|
rp.ServeHTTP(rw, req)
|
||||||
|
res := rw.Result()
|
||||||
|
defer res.Body.Close()
|
||||||
|
|
||||||
|
assert.Equal(t, http.StatusOK, res.StatusCode)
|
||||||
|
require.Eventually(t, func() bool {
|
||||||
|
metrics, err := registry.Gather()
|
||||||
|
assert.NoError(t, err)
|
||||||
|
return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_connections_total", "tcp") &&
|
||||||
|
testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_connections", "tcp")
|
||||||
|
}, testutil.WaitShort, testutil.IntervalFast)
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("HostRewrite", func(t *testing.T) {
|
t.Run("HostRewrite", func(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
package testutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
dto "github.com/prometheus/client_model/go"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
||||||
|
t.Helper()
|
||||||
|
for _, family := range metrics {
|
||||||
|
if family.GetName() != name {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ms := family.GetMetric()
|
||||||
|
metricsLoop:
|
||||||
|
for _, m := range ms {
|
||||||
|
require.Equal(t, len(label), len(m.GetLabel()))
|
||||||
|
for i, lv := range label {
|
||||||
|
if lv != m.GetLabel()[i].GetValue() {
|
||||||
|
continue metricsLoop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return value == m.GetGauge().GetValue()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
||||||
|
t.Helper()
|
||||||
|
for _, family := range metrics {
|
||||||
|
if family.GetName() != name {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ms := family.GetMetric()
|
||||||
|
metricsLoop:
|
||||||
|
for _, m := range ms {
|
||||||
|
require.Equal(t, len(label), len(m.GetLabel()))
|
||||||
|
for i, lv := range label {
|
||||||
|
if lv != m.GetLabel()[i].GetValue() {
|
||||||
|
continue metricsLoop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return value == m.GetCounter().GetValue()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
Loading…
Reference in New Issue