This commit is contained in:
Danny Kopping 2024-05-02 09:48:13 +00:00 committed by GitHub
commit 394858863a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 243 additions and 38 deletions

View File

@ -0,0 +1,66 @@
package pubsub
import (
"context"
"fmt"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
)
// LatencyMeasurer is used to measure the send & receive latencies of the underlying Pubsub implementation. We use these
// measurements to export metrics which can indicate when a Pubsub implementation's queue is overloaded and/or full.
type LatencyMeasurer struct {
// Create unique pubsub channel names so that multiple replicas do not clash when performing latency measurements,
// and only create one UUID per Pubsub impl (and not request) to limit the number of notification channels that need
// to be maintained by the Pubsub impl.
channelIDs map[Pubsub]uuid.UUID
}
func NewLatencyMeasurer() *LatencyMeasurer {
return &LatencyMeasurer{
channelIDs: make(map[Pubsub]uuid.UUID),
}
}
// Measure takes a given Pubsub implementation, publishes a message & immediately receives it, and returns the observed latency.
func (lm *LatencyMeasurer) Measure(ctx context.Context, p Pubsub) (send float64, recv float64, err error) {
var (
start time.Time
res = make(chan float64, 1)
)
cancel, err := p.Subscribe(lm.latencyChannelName(p), func(ctx context.Context, _ []byte) {
res <- time.Since(start).Seconds()
})
if err != nil {
return -1, -1, xerrors.Errorf("failed to subscribe: %w", err)
}
defer cancel()
start = time.Now()
err = p.Publish(lm.latencyChannelName(p), []byte{})
if err != nil {
return -1, -1, xerrors.Errorf("failed to publish: %w", err)
}
send = time.Since(start).Seconds()
select {
case <-ctx.Done():
return send, -1, ctx.Err()
case val := <-res:
return send, val, nil
}
}
func (lm *LatencyMeasurer) latencyChannelName(p Pubsub) string {
cid, found := lm.channelIDs[p]
if !found {
cid = uuid.New()
lm.channelIDs[p] = cid
}
return fmt.Sprintf("latency-measure:%s", cid.String())
}

View File

@ -12,6 +12,7 @@ import (
"github.com/google/uuid"
"github.com/lib/pq"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"
"golang.org/x/xerrors"
"cdr.dev/slog"
@ -205,6 +206,9 @@ type PGPubsub struct {
receivedBytesTotal prometheus.Counter
disconnectionsTotal prometheus.Counter
connected prometheus.Gauge
latencyMeasurer *LatencyMeasurer
latencyErrCounter atomic.Float64
}
// BufferSize is the maximum number of unhandled messages we will buffer
@ -478,6 +482,25 @@ var (
)
)
// additional metrics collected out-of-band
var (
pubsubSendLatencyDesc = prometheus.NewDesc(
"coder_pubsub_send_latency_seconds",
"The time taken to send a message into a pubsub event channel",
nil, nil,
)
pubsubRecvLatencyDesc = prometheus.NewDesc(
"coder_pubsub_receive_latency_seconds",
"The time taken to receive a message from a pubsub event channel",
nil, nil,
)
pubsubLatencyMeasureErrDesc = prometheus.NewDesc(
"coder_pubsub_latency_measure_errs_total",
"The number of pubsub latency measurement failures",
nil, nil,
)
)
// We'll track messages as size "normal" and "colossal", where the
// latter are messages larger than 7600 bytes, or 95% of the postgres
// notify limit. If we see a lot of colossal packets that's an indication that
@ -504,6 +527,11 @@ func (p *PGPubsub) Describe(descs chan<- *prometheus.Desc) {
// implicit metrics
descs <- currentSubscribersDesc
descs <- currentEventsDesc
// additional metrics
descs <- pubsubSendLatencyDesc
descs <- pubsubRecvLatencyDesc
descs <- pubsubLatencyMeasureErrDesc
}
// Collect implements, along with Describe, the prometheus.Collector interface
@ -528,6 +556,19 @@ func (p *PGPubsub) Collect(metrics chan<- prometheus.Metric) {
p.qMu.Unlock()
metrics <- prometheus.MustNewConstMetric(currentSubscribersDesc, prometheus.GaugeValue, float64(subs))
metrics <- prometheus.MustNewConstMetric(currentEventsDesc, prometheus.GaugeValue, float64(events))
// additional metrics
ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
defer cancel()
send, recv, err := p.latencyMeasurer.Measure(ctx, p)
if err != nil {
p.logger.Warn(context.Background(), "failed to measure latency", slog.Error(err))
metrics <- prometheus.MustNewConstMetric(pubsubLatencyMeasureErrDesc, prometheus.CounterValue, p.latencyErrCounter.Add(1))
return
}
metrics <- prometheus.MustNewConstMetric(pubsubSendLatencyDesc, prometheus.GaugeValue, send)
metrics <- prometheus.MustNewConstMetric(pubsubRecvLatencyDesc, prometheus.GaugeValue, recv)
}
// New creates a new Pubsub implementation using a PostgreSQL connection.
@ -544,10 +585,11 @@ func New(startCtx context.Context, logger slog.Logger, database *sql.DB, connect
// newWithoutListener creates a new PGPubsub without creating the pqListener.
func newWithoutListener(logger slog.Logger, database *sql.DB) *PGPubsub {
return &PGPubsub{
logger: logger,
listenDone: make(chan struct{}),
db: database,
queues: make(map[string]map[uuid.UUID]*msgQueue),
logger: logger,
listenDone: make(chan struct{}),
db: database,
queues: make(map[string]map[uuid.UUID]*msgQueue),
latencyMeasurer: NewLatencyMeasurer(),
publishesTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coder",

View File

@ -294,3 +294,56 @@ func TestPubsub_Disconnect(t *testing.T) {
}
require.True(t, gotDroppedErr)
}
func TestMeasureLatency(t *testing.T) {
t.Parallel()
newPubsub := func() (pubsub.Pubsub, func()) {
ctx, cancel := context.WithCancel(context.Background())
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
connectionURL, closePg, err := dbtestutil.Open()
require.NoError(t, err)
db, err := sql.Open("postgres", connectionURL)
require.NoError(t, err)
ps, err := pubsub.New(ctx, logger, db, connectionURL)
require.NoError(t, err)
return ps, func() {
_ = ps.Close()
_ = db.Close()
closePg()
cancel()
}
}
t.Run("MeasureLatency", func(t *testing.T) {
t.Parallel()
ps, done := newPubsub()
defer done()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitSuperLong)
defer cancel()
send, recv, err := pubsub.NewLatencyMeasurer().Measure(ctx, ps)
require.NoError(t, err)
require.Greater(t, send, 0.0)
require.Greater(t, recv, 0.0)
})
t.Run("MeasureLatencyRecvTimeout", func(t *testing.T) {
t.Parallel()
ps, done := newPubsub()
defer done()
// nolint:gocritic // need a very short timeout here to trigger error
ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond)
defer cancel()
send, recv, err := pubsub.NewLatencyMeasurer().Measure(ctx, ps)
require.ErrorContains(t, err, context.DeadlineExceeded.Error())
require.Greater(t, send, 0.0)
require.EqualValues(t, recv, -1)
})
}

View File

@ -39,7 +39,11 @@ func TestPGPubsub_Metrics(t *testing.T) {
err = registry.Register(uut)
require.NoError(t, err)
// each Gather measures pubsub latency by publishing a message & subscribing to it
var gatherCount float64
metrics, err := registry.Gather()
gatherCount++
require.NoError(t, err)
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
@ -60,18 +64,23 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
gatherCount++
assert.NoError(t, err)
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, float64(len(data)), "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, float64(len(data)), "coder_pubsub_published_bytes_total") &&
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_send_latency_seconds") &&
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_receive_latency_seconds") &&
!testutil.PromCounterGathered(t, metrics, "coder_pubsub_latency_measure_errs_total")
}, testutil.WaitShort, testutil.IntervalFast)
colossalData := make([]byte, 7600)
colossalSize := 7600
colossalData := make([]byte, colossalSize)
for i := range colossalData {
colossalData[i] = 'q'
}
@ -90,15 +99,19 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
gatherCount++
assert.NoError(t, err)
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 1+gatherCount, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1+gatherCount, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
testutil.PromCounterHasValue(t, metrics, float64(colossalSize+len(data)), "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, float64(colossalSize+len(data)), "coder_pubsub_published_bytes_total") &&
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_send_latency_seconds") &&
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_pubsub_receive_latency_seconds") &&
!testutil.PromCounterGathered(t, metrics, "coder_pubsub_latency_measure_errs_total")
}, testutil.WaitShort, testutil.IntervalFast)
}

View File

@ -7,29 +7,58 @@ import (
"github.com/stretchr/testify/require"
)
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
type kind string
const (
counterKind kind = "counter"
gaugeKind kind = "gauge"
)
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, labels ...string) bool {
t.Helper()
for _, family := range metrics {
if family.GetName() != name {
continue
}
ms := family.GetMetric()
metricsLoop:
for _, m := range ms {
require.Equal(t, len(label), len(m.GetLabel()))
for i, lv := range label {
if lv != m.GetLabel()[i].GetValue() {
continue metricsLoop
}
}
return value == m.GetGauge().GetValue()
}
}
return false
return value == getValue(t, metrics, gaugeKind, name, labels...)
}
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, labels ...string) bool {
t.Helper()
return value == getValue(t, metrics, counterKind, name, labels...)
}
func PromGaugeAssertion(t testing.TB, metrics []*dto.MetricFamily, assert func(in float64) bool, name string, labels ...string) bool {
t.Helper()
return assert(getValue(t, metrics, gaugeKind, name, labels...))
}
func PromCounterAssertion(t testing.TB, metrics []*dto.MetricFamily, assert func(in float64) bool, name string, labels ...string) bool {
t.Helper()
return assert(getValue(t, metrics, counterKind, name, labels...))
}
func PromCounterGathered(t testing.TB, metrics []*dto.MetricFamily, name string, labels ...string) bool {
return getMetric(t, metrics, name, labels...) != nil
}
func PromGaugeGathered(t testing.TB, metrics []*dto.MetricFamily, name string, labels ...string) bool {
return getMetric(t, metrics, name, labels...) != nil
}
func getValue(t testing.TB, metrics []*dto.MetricFamily, kind kind, name string, labels ...string) float64 {
m := getMetric(t, metrics, name, labels...)
if m == nil {
return -1
}
switch kind {
case counterKind:
return m.GetCounter().GetValue()
case gaugeKind:
return m.GetGauge().GetValue()
default:
return -1
}
}
func getMetric(t testing.TB, metrics []*dto.MetricFamily, name string, labels ...string) *dto.Metric {
for _, family := range metrics {
if family.GetName() != name {
continue
@ -37,14 +66,16 @@ func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float6
ms := family.GetMetric()
metricsLoop:
for _, m := range ms {
require.Equal(t, len(label), len(m.GetLabel()))
for i, lv := range label {
require.Equal(t, len(labels), len(m.GetLabel()))
for i, lv := range labels {
if lv != m.GetLabel()[i].GetValue() {
continue metricsLoop
}
}
return value == m.GetCounter().GetValue()
return m
}
}
return false
return nil
}