mirror of https://github.com/coder/coder.git
Merge 28a96dec4a into 93d8812284
This commit is contained in: commit 394858863a
@@ -0,0 +1,66 @@
+package pubsub
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/google/uuid"
+	"golang.org/x/xerrors"
+)
+
+// LatencyMeasurer is used to measure the send & receive latencies of the underlying Pubsub implementation. We use these
+// measurements to export metrics which can indicate when a Pubsub implementation's queue is overloaded and/or full.
+type LatencyMeasurer struct {
+	// Create unique pubsub channel names so that multiple replicas do not clash when performing latency measurements,
+	// and only create one UUID per Pubsub impl (and not per request) to limit the number of notification channels that need
+	// to be maintained by the Pubsub impl.
+	channelIDs map[Pubsub]uuid.UUID
+}
+
+func NewLatencyMeasurer() *LatencyMeasurer {
+	return &LatencyMeasurer{
+		channelIDs: make(map[Pubsub]uuid.UUID),
+	}
+}
+
+// Measure takes a given Pubsub implementation, publishes a message & immediately receives it, and returns the observed latency.
+func (lm *LatencyMeasurer) Measure(ctx context.Context, p Pubsub) (send float64, recv float64, err error) {
+	var (
+		start time.Time
+		res   = make(chan float64, 1)
+	)
+
+	cancel, err := p.Subscribe(lm.latencyChannelName(p), func(ctx context.Context, _ []byte) {
+		res <- time.Since(start).Seconds()
+	})
+	if err != nil {
+		return -1, -1, xerrors.Errorf("failed to subscribe: %w", err)
+	}
+	defer cancel()
+
+	start = time.Now()
+	err = p.Publish(lm.latencyChannelName(p), []byte{})
+	if err != nil {
+		return -1, -1, xerrors.Errorf("failed to publish: %w", err)
+	}
+
+	send = time.Since(start).Seconds()
+
+	select {
+	case <-ctx.Done():
+		return send, -1, ctx.Err()
+	case val := <-res:
+		return send, val, nil
+	}
+}
+
+func (lm *LatencyMeasurer) latencyChannelName(p Pubsub) string {
+	cid, found := lm.channelIDs[p]
+	if !found {
+		cid = uuid.New()
+		lm.channelIDs[p] = cid
+	}
+
+	return fmt.Sprintf("latency-measure:%s", cid.String())
+}
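For orientation, here is a minimal sketch of driving this new API end to end. The import path and the NewInMemory constructor are assumptions for illustration only; any Pubsub implementation can be measured the same way:

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	// Import path assumed for this sketch.
	"github.com/coder/coder/v2/coderd/database/pubsub"
)

func main() {
	// Bound the round-trip so a wedged Pubsub cannot block the caller forever.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// NewInMemory is assumed here for brevity; the PostgreSQL-backed Pubsub
	// returned by New can be measured identically.
	ps := pubsub.NewInMemory()
	defer ps.Close()

	send, recv, err := pubsub.NewLatencyMeasurer().Measure(ctx, ps)
	if err != nil {
		log.Fatalf("measure latency: %v", err)
	}
	fmt.Printf("send=%.6fs recv=%.6fs\n", send, recv)
}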
@@ -12,6 +12,7 @@ import (
 	"github.com/google/uuid"
 	"github.com/lib/pq"
 	"github.com/prometheus/client_golang/prometheus"
+	"go.uber.org/atomic"
 	"golang.org/x/xerrors"

 	"cdr.dev/slog"
@@ -205,6 +206,9 @@ type PGPubsub struct {
 	receivedBytesTotal  prometheus.Counter
 	disconnectionsTotal prometheus.Counter
 	connected           prometheus.Gauge
+
+	latencyMeasurer   *LatencyMeasurer
+	latencyErrCounter atomic.Float64
 }

 // BufferSize is the maximum number of unhandled messages we will buffer
@@ -478,6 +482,25 @@ var (
 	)
 )

+// additional metrics collected out-of-band
+var (
+	pubsubSendLatencyDesc = prometheus.NewDesc(
+		"coder_pubsub_send_latency_seconds",
+		"The time taken to send a message into a pubsub event channel",
+		nil, nil,
+	)
+	pubsubRecvLatencyDesc = prometheus.NewDesc(
+		"coder_pubsub_receive_latency_seconds",
+		"The time taken to receive a message from a pubsub event channel",
+		nil, nil,
+	)
+	pubsubLatencyMeasureErrDesc = prometheus.NewDesc(
+		"coder_pubsub_latency_measure_errs_total",
+		"The number of pubsub latency measurement failures",
+		nil, nil,
+	)
+)
+
 // We'll track messages as size "normal" and "colossal", where the
 // latter are messages larger than 7600 bytes, or 95% of the postgres
 // notify limit. If we see a lot of colossal packets that's an indication that
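When scraped, the two latency series surface as gauges; the error series is a counter that Collect only emits once a measurement has failed. An illustrative exposition-format sample (values invented):

# TYPE coder_pubsub_send_latency_seconds gauge
coder_pubsub_send_latency_seconds 0.000213
# TYPE coder_pubsub_receive_latency_seconds gauge
coder_pubsub_receive_latency_seconds 0.000987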
@@ -504,6 +527,11 @@ func (p *PGPubsub) Describe(descs chan<- *prometheus.Desc) {
 	// implicit metrics
 	descs <- currentSubscribersDesc
 	descs <- currentEventsDesc
+
+	// additional metrics
+	descs <- pubsubSendLatencyDesc
+	descs <- pubsubRecvLatencyDesc
+	descs <- pubsubLatencyMeasureErrDesc
 }

 // Collect implements, along with Describe, the prometheus.Collector interface
@@ -528,6 +556,19 @@ func (p *PGPubsub) Collect(metrics chan<- prometheus.Metric) {
 	p.qMu.Unlock()
 	metrics <- prometheus.MustNewConstMetric(currentSubscribersDesc, prometheus.GaugeValue, float64(subs))
 	metrics <- prometheus.MustNewConstMetric(currentEventsDesc, prometheus.GaugeValue, float64(events))
+
+	// additional metrics
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
+	defer cancel()
+
+	send, recv, err := p.latencyMeasurer.Measure(ctx, p)
+	if err != nil {
+		p.logger.Warn(context.Background(), "failed to measure latency", slog.Error(err))
+		metrics <- prometheus.MustNewConstMetric(pubsubLatencyMeasureErrDesc, prometheus.CounterValue, p.latencyErrCounter.Add(1))
+		return
+	}
+	metrics <- prometheus.MustNewConstMetric(pubsubSendLatencyDesc, prometheus.GaugeValue, send)
+	metrics <- prometheus.MustNewConstMetric(pubsubRecvLatencyDesc, prometheus.GaugeValue, recv)
 }

 // New creates a new Pubsub implementation using a PostgreSQL connection.
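Describe and Collect together satisfy the prometheus.Collector interface, so every scrape now performs one out-of-band latency probe. A minimal sketch of the wiring, assuming ps is the *PGPubsub returned by New:

// Sketch only: registering the pubsub means each registry.Gather() (i.e.
// each scrape) invokes ps.Collect, which publishes and receives one probe
// message to populate the latency gauges.
registry := prometheus.NewRegistry()
if err := registry.Register(ps); err != nil {
	logger.Error(context.Background(), "register pubsub collector", slog.Error(err))
}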
@@ -544,10 +585,11 @@ func New(startCtx context.Context, logger slog.Logger, database *sql.DB, connect
 // newWithoutListener creates a new PGPubsub without creating the pqListener.
 func newWithoutListener(logger slog.Logger, database *sql.DB) *PGPubsub {
 	return &PGPubsub{
-		logger:     logger,
-		listenDone: make(chan struct{}),
-		db:         database,
-		queues:     make(map[string]map[uuid.UUID]*msgQueue),
+		logger:          logger,
+		listenDone:      make(chan struct{}),
+		db:              database,
+		queues:          make(map[string]map[uuid.UUID]*msgQueue),
+		latencyMeasurer: NewLatencyMeasurer(),

 		publishesTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: "coder",
@@ -294,3 +294,56 @@ func TestPubsub_Disconnect(t *testing.T) {
 	}
 	require.True(t, gotDroppedErr)
 }
+
+func TestMeasureLatency(t *testing.T) {
+	t.Parallel()
+
+	newPubsub := func() (pubsub.Pubsub, func()) {
+		ctx, cancel := context.WithCancel(context.Background())
+		logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+		connectionURL, closePg, err := dbtestutil.Open()
+		require.NoError(t, err)
+		db, err := sql.Open("postgres", connectionURL)
+		require.NoError(t, err)
+		ps, err := pubsub.New(ctx, logger, db, connectionURL)
+		require.NoError(t, err)
+
+		return ps, func() {
+			_ = ps.Close()
+			_ = db.Close()
+			closePg()
+			cancel()
+		}
+	}
+
+	t.Run("MeasureLatency", func(t *testing.T) {
+		t.Parallel()
+
+		ps, done := newPubsub()
+		defer done()
+
+		ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitSuperLong)
+		defer cancel()
+
+		send, recv, err := pubsub.NewLatencyMeasurer().Measure(ctx, ps)
+		require.NoError(t, err)
+		require.Greater(t, send, 0.0)
+		require.Greater(t, recv, 0.0)
+	})
+
+	t.Run("MeasureLatencyRecvTimeout", func(t *testing.T) {
+		t.Parallel()
+
+		ps, done := newPubsub()
+		defer done()
+
+		// nolint:gocritic // need a very short timeout here to trigger error
+		ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond)
+		defer cancel()
+
+		send, recv, err := pubsub.NewLatencyMeasurer().Measure(ctx, ps)
+		require.ErrorContains(t, err, context.DeadlineExceeded.Error())
+		require.Greater(t, send, 0.0)
+		require.EqualValues(t, recv, -1)
+	})
+}
@@ -39,7 +39,11 @@ func TestPGPubsub_Metrics(t *testing.T) {
 	err = registry.Register(uut)
 	require.NoError(t, err)

+	// each Gather measures pubsub latency by publishing a message & subscribing to it
+	var gatherCount float64
+
 	metrics, err := registry.Gather()
+	gatherCount++
 	require.NoError(t, err)
 	require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
 	require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
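The comment above is the key to the reworked assertions: every Gather now publishes and subscribes once for the probe, so fixed expected counter values would drift as soon as the test gathers again. A hypothetical helper (not part of this diff) illustrates the bookkeeping pattern:

package metricstest // hypothetical package for this sketch

import (
	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// gatherAndCount wraps Registry.Gather so each scrape in a test also
// advances the expected probe count in lockstep.
func gatherAndCount(registry *prometheus.Registry, gatherCount *float64) ([]*dto.MetricFamily, error) {
	metrics, err := registry.Gather()
	*gatherCount++
	return metrics, err
}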
@@ -60,18 +64,23 @@ func TestPGPubsub_Metrics(t *testing.T) {

 	require.Eventually(t, func() bool {
 		metrics, err = registry.Gather()
+		gatherCount++
 		assert.NoError(t, err)
 		return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
 			testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
 			testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
-			testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
-			testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
-			testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
-			testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
-			testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
+			testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_publishes_total", "true") &&
+			testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_subscribes_total", "true") &&
+			testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_messages_total", "normal") &&
+			testutil.PromCounterHasValue(t, metrics, float64(len(data)), "coder_pubsub_received_bytes_total") &&
+			testutil.PromCounterHasValue(t, metrics, float64(len(data)), "coder_pubsub_published_bytes_total") &&
+			testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_send_latency_seconds") &&
+			testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_receive_latency_seconds") &&
+			!testutil.PromCounterGathered(t, metrics, "coder_pubsub_latency_measure_errs_total")
 	}, testutil.WaitShort, testutil.IntervalFast)

-	colossalData := make([]byte, 7600)
+	colossalSize := 7600
+	colossalData := make([]byte, colossalSize)
 	for i := range colossalData {
 		colossalData[i] = 'q'
 	}
@@ -90,15 +99,19 @@ func TestPGPubsub_Metrics(t *testing.T) {

 	require.Eventually(t, func() bool {
 		metrics, err = registry.Gather()
+		gatherCount++
 		assert.NoError(t, err)
 		return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
 			testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
 			testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
-			testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
-			testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
-			testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
+			testutil.PromCounterHasValue(t, metrics, 1+gatherCount, "coder_pubsub_publishes_total", "true") &&
+			testutil.PromCounterHasValue(t, metrics, 1+gatherCount, "coder_pubsub_subscribes_total", "true") &&
+			testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_messages_total", "normal") &&
 			testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
-			testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
-			testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
+			testutil.PromCounterHasValue(t, metrics, float64(colossalSize+len(data)), "coder_pubsub_received_bytes_total") &&
+			testutil.PromCounterHasValue(t, metrics, float64(colossalSize+len(data)), "coder_pubsub_published_bytes_total") &&
+			testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_send_latency_seconds") &&
+			testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_receive_latency_seconds") &&
+			!testutil.PromCounterGathered(t, metrics, "coder_pubsub_latency_measure_errs_total")
 	}, testutil.WaitShort, testutil.IntervalFast)
 }
@@ -7,29 +7,58 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
+type kind string
+
+const (
+	counterKind kind = "counter"
+	gaugeKind   kind = "gauge"
+)
+
+func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, labels ...string) bool {
 	t.Helper()
-	for _, family := range metrics {
-		if family.GetName() != name {
-			continue
-		}
-		ms := family.GetMetric()
-	metricsLoop:
-		for _, m := range ms {
-			require.Equal(t, len(label), len(m.GetLabel()))
-			for i, lv := range label {
-				if lv != m.GetLabel()[i].GetValue() {
-					continue metricsLoop
-				}
-			}
-			return value == m.GetGauge().GetValue()
-		}
-	}
-	return false
+	return value == getValue(t, metrics, gaugeKind, name, labels...)
 }

-func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
+func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, labels ...string) bool {
+	t.Helper()
+	return value == getValue(t, metrics, counterKind, name, labels...)
+}
+
+func PromGaugeAssertion(t testing.TB, metrics []*dto.MetricFamily, assert func(in float64) bool, name string, labels ...string) bool {
+	t.Helper()
+	return assert(getValue(t, metrics, gaugeKind, name, labels...))
+}
+
+func PromCounterAssertion(t testing.TB, metrics []*dto.MetricFamily, assert func(in float64) bool, name string, labels ...string) bool {
+	t.Helper()
+	return assert(getValue(t, metrics, counterKind, name, labels...))
+}
+
+func PromCounterGathered(t testing.TB, metrics []*dto.MetricFamily, name string, labels ...string) bool {
+	return getMetric(t, metrics, name, labels...) != nil
+}
+
+func PromGaugeGathered(t testing.TB, metrics []*dto.MetricFamily, name string, labels ...string) bool {
+	return getMetric(t, metrics, name, labels...) != nil
+}
+
+func getValue(t testing.TB, metrics []*dto.MetricFamily, kind kind, name string, labels ...string) float64 {
+	m := getMetric(t, metrics, name, labels...)
+	if m == nil {
+		return -1
+	}
+
+	switch kind {
+	case counterKind:
+		return m.GetCounter().GetValue()
+	case gaugeKind:
+		return m.GetGauge().GetValue()
+	default:
+		return -1
+	}
+}
+
+func getMetric(t testing.TB, metrics []*dto.MetricFamily, name string, labels ...string) *dto.Metric {
 	for _, family := range metrics {
 		if family.GetName() != name {
 			continue
@@ -37,14 +66,16 @@ func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float6
 		ms := family.GetMetric()
 	metricsLoop:
 		for _, m := range ms {
-			require.Equal(t, len(label), len(m.GetLabel()))
-			for i, lv := range label {
+			require.Equal(t, len(labels), len(m.GetLabel()))
+			for i, lv := range labels {
 				if lv != m.GetLabel()[i].GetValue() {
 					continue metricsLoop
 				}
 			}
-			return value == m.GetCounter().GetValue()
+
+			return m
 		}
 	}
-	return false
+
+	return nil
 }
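A short usage sketch for the reworked helpers, assuming metrics came from a registry.Gather() call as in the tests above:

// Assert a counter's exact value, check a gauge with a predicate, and
// verify an error counter was never emitted at all.
ok := testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
	testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_send_latency_seconds") &&
	!testutil.PromCounterGathered(t, metrics, "coder_pubsub_latency_measure_errs_total")
require.True(t, ok)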