fix(enterprise): mark nodes from unhealthy coordinators as lost

Instead of removing the mappings of unhealthy coordinators entirely,
mark them as lost instead. This prevents peers from disappearing from
other peers if a coordinator misses a heartbeat.
This commit is contained in:
Colin Adler 2024-05-01 21:10:58 +00:00
parent 3ff9cef498
commit a8ac20594a
2 changed files with 48 additions and 5 deletions

View File

@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
ok := m.coordinator == h.self
if !ok {
_, ok = h.coordinators[m.coordinator]
if !ok {
// If a mapping exists to a coordinator lost to heartbeats,
// still add the mapping as LOST. If a coordinator misses
// heartbeats but a client is still connected to it, this may be
// the only mapping available for it. Newer mappings will take
// precedence.
m.kind = proto.CoordinateResponse_PeerUpdate_LOST
}
}
if ok {
out = append(out, m)
}
out = append(out, m)
}
return out
}

View File

@ -11,6 +11,7 @@ import (
"time"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
"golang.org/x/xerrors"
@ -33,9 +34,9 @@ import (
// make update-golden-files
var UpdateGoldenFiles = flag.Bool("update", false, "update .golden files")
// TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
// TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
// cleanup.
func TestHeartbeat_Cleanup(t *testing.T) {
func TestHeartbeats_Cleanup(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
@ -78,6 +79,41 @@ func TestHeartbeat_Cleanup(t *testing.T) {
close(waitForCleanup)
}
func TestHeartbeats_LostCoordinator_MarkLost(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mStore := dbmock.NewMockStore(ctrl)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
uut := &heartbeats{
ctx: ctx,
logger: logger,
store: mStore,
cleanupPeriod: time.Millisecond,
coordinators: map[uuid.UUID]time.Time{
uuid.New(): time.Now(),
},
}
mpngs := []mapping{{
peer: uuid.New(),
coordinator: uuid.New(),
updatedAt: time.Now(),
node: &proto.Node{},
kind: proto.CoordinateResponse_PeerUpdate_NODE,
}}
// Filter should still return the mapping without a coordinator, but marked
// as LOST.
got := uut.filter(mpngs)
require.Len(t, got, 1)
assert.Equal(t, proto.CoordinateResponse_PeerUpdate_LOST, got[0].kind)
}
// TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
// that is, clean up peers and associated tunnels that have been lost for over 24 hours.
func TestLostPeerCleanupQueries(t *testing.T) {