mirror of https://github.com/coder/coder.git
fix(enterprise): mark nodes from unhealthy coordinators as lost
Instead of removing the mappings of unhealthy coordinators entirely, mark them as lost instead. This prevents peers from disappearing from other peers if a coordinator misses a heartbeat.
This commit is contained in:
parent
3ff9cef498
commit
a8ac20594a
|
@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
|
|||
ok := m.coordinator == h.self
|
||||
if !ok {
|
||||
_, ok = h.coordinators[m.coordinator]
|
||||
if !ok {
|
||||
// If a mapping exists to a coordinator lost to heartbeats,
|
||||
// still add the mapping as LOST. If a coordinator misses
|
||||
// heartbeats but a client is still connected to it, this may be
|
||||
// the only mapping available for it. Newer mappings will take
|
||||
// precedence.
|
||||
m.kind = proto.CoordinateResponse_PeerUpdate_LOST
|
||||
}
|
||||
}
|
||||
if ok {
|
||||
out = append(out, m)
|
||||
}
|
||||
|
||||
out = append(out, m)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
"golang.org/x/xerrors"
|
||||
|
@ -33,9 +34,9 @@ import (
|
|||
// make update-golden-files
|
||||
var UpdateGoldenFiles = flag.Bool("update", false, "update .golden files")
|
||||
|
||||
// TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
|
||||
// TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
|
||||
// cleanup.
|
||||
func TestHeartbeat_Cleanup(t *testing.T) {
|
||||
func TestHeartbeats_Cleanup(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
|
@ -78,6 +79,41 @@ func TestHeartbeat_Cleanup(t *testing.T) {
|
|||
close(waitForCleanup)
|
||||
}
|
||||
|
||||
func TestHeartbeats_LostCoordinator_MarkLost(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mStore := dbmock.NewMockStore(ctrl)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
|
||||
defer cancel()
|
||||
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
|
||||
|
||||
uut := &heartbeats{
|
||||
ctx: ctx,
|
||||
logger: logger,
|
||||
store: mStore,
|
||||
cleanupPeriod: time.Millisecond,
|
||||
coordinators: map[uuid.UUID]time.Time{
|
||||
uuid.New(): time.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
mpngs := []mapping{{
|
||||
peer: uuid.New(),
|
||||
coordinator: uuid.New(),
|
||||
updatedAt: time.Now(),
|
||||
node: &proto.Node{},
|
||||
kind: proto.CoordinateResponse_PeerUpdate_NODE,
|
||||
}}
|
||||
|
||||
// Filter should still return the mapping without a coordinator, but marked
|
||||
// as LOST.
|
||||
got := uut.filter(mpngs)
|
||||
require.Len(t, got, 1)
|
||||
assert.Equal(t, proto.CoordinateResponse_PeerUpdate_LOST, got[0].kind)
|
||||
}
|
||||
|
||||
// TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
|
||||
// that is, clean up peers and associated tunnels that have been lost for over 24 hours.
|
||||
func TestLostPeerCleanupQueries(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue