mirror of https://github.com/coder/coder.git
feat(coderd): add prometheus metrics to servertailnet (#11988)
This commit is contained in:
parent
c84a637116
commit
c7f52b73bb
|
@ -472,7 +472,7 @@ func New(options *Options) *API {
|
|||
|
||||
api.Auditor.Store(&options.Auditor)
|
||||
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
|
||||
api.agentProvider, err = NewServerTailnet(api.ctx,
|
||||
stn, err := NewServerTailnet(api.ctx,
|
||||
options.Logger,
|
||||
options.DERPServer,
|
||||
api.DERPMap,
|
||||
|
@ -485,6 +485,10 @@ func New(options *Options) *API {
|
|||
if err != nil {
|
||||
panic("failed to setup server tailnet: " + err.Error())
|
||||
}
|
||||
api.agentProvider = stn
|
||||
if options.DeploymentValues.Prometheus.Enable {
|
||||
options.PrometheusRegistry.MustRegister(stn)
|
||||
}
|
||||
api.TailnetClientService, err = tailnet.NewClientService(
|
||||
api.Logger.Named("tailnetclient"),
|
||||
&api.TailnetCoordinator,
|
||||
|
|
|
@ -6,7 +6,6 @@ import (
|
|||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
|
@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) {
|
|||
|
||||
metrics, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
|
||||
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
|
||||
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
|
||||
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
|
||||
|
||||
event := "test"
|
||||
data := "testing"
|
||||
|
@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
|
|||
require.Eventually(t, func() bool {
|
||||
metrics, err = registry.Gather()
|
||||
assert.NoError(t, err)
|
||||
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
||||
gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
|
||||
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
||||
counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
|
||||
counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
|
||||
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
||||
counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
|
||||
counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
|
||||
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
||||
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
|
||||
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
|
||||
}, testutil.WaitShort, testutil.IntervalFast)
|
||||
|
||||
colossalData := make([]byte, 7600)
|
||||
|
@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
|
|||
require.Eventually(t, func() bool {
|
||||
metrics, err = registry.Gather()
|
||||
assert.NoError(t, err)
|
||||
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
||||
gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
|
||||
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
||||
counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
|
||||
counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
|
||||
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
||||
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
|
||||
counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
|
||||
counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
|
||||
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
|
||||
testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
|
||||
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
|
||||
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
|
||||
}, testutil.WaitShort, testutil.IntervalFast)
|
||||
}
|
||||
|
||||
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
||||
t.Helper()
|
||||
for _, family := range metrics {
|
||||
if family.GetName() != name {
|
||||
continue
|
||||
}
|
||||
ms := family.GetMetric()
|
||||
for _, m := range ms {
|
||||
require.Equal(t, len(label), len(m.GetLabel()))
|
||||
for i, lv := range label {
|
||||
if lv != m.GetLabel()[i].GetValue() {
|
||||
continue
|
||||
}
|
||||
}
|
||||
return value == m.GetGauge().GetValue()
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
||||
t.Helper()
|
||||
for _, family := range metrics {
|
||||
if family.GetName() != name {
|
||||
continue
|
||||
}
|
||||
ms := family.GetMetric()
|
||||
for _, m := range ms {
|
||||
require.Equal(t, len(label), len(m.GetLabel()))
|
||||
for i, lv := range label {
|
||||
if lv != m.GetLabel()[i].GetValue() {
|
||||
continue
|
||||
}
|
||||
}
|
||||
return value == m.GetCounter().GetValue()
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
"golang.org/x/xerrors"
|
||||
"tailscale.com/derp"
|
||||
|
@ -97,6 +98,18 @@ func NewServerTailnet(
|
|||
agentConnectionTimes: map[uuid.UUID]time.Time{},
|
||||
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
|
||||
transport: tailnetTransport.Clone(),
|
||||
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "coder",
|
||||
Subsystem: "servertailnet",
|
||||
Name: "open_connections",
|
||||
Help: "Total number of TCP connections currently open to workspace agents.",
|
||||
}, []string{"network"}),
|
||||
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "coder",
|
||||
Subsystem: "servertailnet",
|
||||
Name: "connections_total",
|
||||
Help: "Total number of TCP connections made to workspace agents.",
|
||||
}, []string{"network"}),
|
||||
}
|
||||
tn.transport.DialContext = tn.dialContext
|
||||
// These options are mostly just picked at random, and they can likely be
|
||||
|
@ -170,6 +183,16 @@ func NewServerTailnet(
|
|||
return tn, nil
|
||||
}
|
||||
|
||||
func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) {
|
||||
s.connsPerAgent.Describe(descs)
|
||||
s.totalConns.Describe(descs)
|
||||
}
|
||||
|
||||
func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
|
||||
s.connsPerAgent.Collect(metrics)
|
||||
s.totalConns.Collect(metrics)
|
||||
}
|
||||
|
||||
func (s *ServerTailnet) expireOldAgents() {
|
||||
const (
|
||||
tick = 5 * time.Minute
|
||||
|
@ -304,6 +327,9 @@ type ServerTailnet struct {
|
|||
agentTickets map[uuid.UUID]map[uuid.UUID]struct{}
|
||||
|
||||
transport *http.Transport
|
||||
|
||||
connsPerAgent *prometheus.GaugeVec
|
||||
totalConns *prometheus.CounterVec
|
||||
}
|
||||
|
||||
func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
|
||||
|
@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) (
|
|||
return nil, xerrors.Errorf("no agent id attached")
|
||||
}
|
||||
|
||||
return s.DialAgentNetConn(ctx, agentID, network, addr)
|
||||
nc, err := s.DialAgentNetConn(ctx, agentID, network, addr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s.connsPerAgent.WithLabelValues("tcp").Inc()
|
||||
s.totalConns.WithLabelValues("tcp").Inc()
|
||||
return &instrumentedConn{
|
||||
Conn: nc,
|
||||
agentID: agentID,
|
||||
connsPerAgent: s.connsPerAgent,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error {
|
||||
|
@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error {
|
|||
<-s.derpMapUpdaterClosed
|
||||
return nil
|
||||
}
|
||||
|
||||
type instrumentedConn struct {
|
||||
net.Conn
|
||||
|
||||
agentID uuid.UUID
|
||||
closeOnce sync.Once
|
||||
connsPerAgent *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func (c *instrumentedConn) Close() error {
|
||||
c.closeOnce.Do(func() {
|
||||
c.connsPerAgent.WithLabelValues("tcp").Dec()
|
||||
})
|
||||
return c.Conn.Close()
|
||||
}
|
||||
|
|
|
@ -13,6 +13,7 @@ import (
|
|||
"testing"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/spf13/afero"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
@ -79,6 +80,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
|
|||
assert.Equal(t, http.StatusOK, res.StatusCode)
|
||||
})
|
||||
|
||||
t.Run("Metrics", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
|
||||
defer cancel()
|
||||
|
||||
agents, serverTailnet := setupServerTailnetAgent(t, 1)
|
||||
a := agents[0]
|
||||
|
||||
registry := prometheus.NewRegistry()
|
||||
require.NoError(t, registry.Register(serverTailnet))
|
||||
|
||||
u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort))
|
||||
require.NoError(t, err)
|
||||
|
||||
rp := serverTailnet.ReverseProxy(u, u, a.id)
|
||||
|
||||
rw := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(
|
||||
http.MethodGet,
|
||||
u.String(),
|
||||
nil,
|
||||
).WithContext(ctx)
|
||||
|
||||
rp.ServeHTTP(rw, req)
|
||||
res := rw.Result()
|
||||
defer res.Body.Close()
|
||||
|
||||
assert.Equal(t, http.StatusOK, res.StatusCode)
|
||||
require.Eventually(t, func() bool {
|
||||
metrics, err := registry.Gather()
|
||||
assert.NoError(t, err)
|
||||
return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_connections_total", "tcp") &&
|
||||
testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_connections", "tcp")
|
||||
}, testutil.WaitShort, testutil.IntervalFast)
|
||||
})
|
||||
|
||||
t.Run("HostRewrite", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
package testutil
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
||||
t.Helper()
|
||||
for _, family := range metrics {
|
||||
if family.GetName() != name {
|
||||
continue
|
||||
}
|
||||
ms := family.GetMetric()
|
||||
metricsLoop:
|
||||
for _, m := range ms {
|
||||
require.Equal(t, len(label), len(m.GetLabel()))
|
||||
for i, lv := range label {
|
||||
if lv != m.GetLabel()[i].GetValue() {
|
||||
continue metricsLoop
|
||||
}
|
||||
}
|
||||
return value == m.GetGauge().GetValue()
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
|
||||
t.Helper()
|
||||
for _, family := range metrics {
|
||||
if family.GetName() != name {
|
||||
continue
|
||||
}
|
||||
ms := family.GetMetric()
|
||||
metricsLoop:
|
||||
for _, m := range ms {
|
||||
require.Equal(t, len(label), len(m.GetLabel()))
|
||||
for i, lv := range label {
|
||||
if lv != m.GetLabel()[i].GetValue() {
|
||||
continue metricsLoop
|
||||
}
|
||||
}
|
||||
return value == m.GetCounter().GetValue()
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
Loading…
Reference in New Issue