fix(enterprise): mark nodes from unhealthy coordinators as lost (#13123)

Instead of removing the mappings of unhealthy coordinators entirely,
mark them as lost. This prevents peers from disappearing from other
peers' view of the network when a coordinator misses a heartbeat.
Colin Adler 2024-05-03 14:07:29 -05:00 committed by GitHub
parent a3c23ed313
commit 205c43da99
3 changed files with 104 additions and 15 deletions
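In essence, the fix changes the heartbeats filter from dropping mappings of unknown coordinators to keeping them with a downgraded kind. Below is a minimal, runnable sketch of that behavior; the names (filterSketch, updateKind) and the simplified string IDs are illustrative stand-ins, not the real pgcoord definitions in the diff that follows.

package main

import "fmt"

type updateKind string

const (
	kindNode updateKind = "NODE"
	kindLost updateKind = "LOST"
)

type mapping struct {
	peer        string
	coordinator string
	kind        updateKind
}

// filterSketch mirrors the new behavior: mappings from the local or a
// healthy coordinator pass through unchanged, while mappings from a
// coordinator that missed its heartbeats are kept but marked LOST
// instead of being dropped.
func filterSketch(self string, healthy map[string]bool, mappings []mapping) []mapping {
	out := make([]mapping, 0, len(mappings))
	for _, m := range mappings {
		if m.coordinator != self && !healthy[m.coordinator] {
			m.kind = kindLost // previously: the mapping was skipped entirely
		}
		out = append(out, m)
	}
	return out
}

func main() {
	mappings := []mapping{
		{peer: "p1", coordinator: "c1", kind: kindNode},
		{peer: "p2", coordinator: "dead", kind: kindNode},
	}
	// p1 stays NODE; p2 is marked LOST rather than disappearing.
	fmt.Println(filterSketch("self", map[string]bool{"c1": true}, mappings))
}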

@@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
 		ok := m.coordinator == h.self
 		if !ok {
 			_, ok = h.coordinators[m.coordinator]
+			if !ok {
+				// If a mapping exists to a coordinator lost to heartbeats,
+				// still add the mapping as LOST. If a coordinator misses
+				// heartbeats but a client is still connected to it, this may be
+				// the only mapping available for it. Newer mappings will take
+				// precedence.
+				m.kind = proto.CoordinateResponse_PeerUpdate_LOST
+			}
 		}
-		if ok {
-			out = append(out, m)
-		}
+		out = append(out, m)
 	}
 	return out
 }

@@ -11,6 +11,7 @@ import (
 	"time"

 	"github.com/google/uuid"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 	"go.uber.org/mock/gomock"
 	"golang.org/x/xerrors"
@@ -33,9 +34,9 @@ import (
 // make update-golden-files
 var UpdateGoldenFiles = flag.Bool("update", false, "update .golden files")

-// TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
+// TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
 // cleanup.
-func TestHeartbeat_Cleanup(t *testing.T) {
+func TestHeartbeats_Cleanup(t *testing.T) {
 	t.Parallel()

 	ctrl := gomock.NewController(t)
@@ -78,6 +79,41 @@ func TestHeartbeat_Cleanup(t *testing.T) {
 	close(waitForCleanup)
 }

+func TestHeartbeats_LostCoordinator_MarkLost(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	mStore := dbmock.NewMockStore(ctrl)
+
+	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
+	defer cancel()
+	logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+
+	uut := &heartbeats{
+		ctx:           ctx,
+		logger:        logger,
+		store:         mStore,
+		cleanupPeriod: time.Millisecond,
+		coordinators: map[uuid.UUID]time.Time{
+			uuid.New(): time.Now(),
+		},
+	}
+
+	mpngs := []mapping{{
+		peer:        uuid.New(),
+		coordinator: uuid.New(),
+		updatedAt:   time.Now(),
+		node:        &proto.Node{},
+		kind:        proto.CoordinateResponse_PeerUpdate_NODE,
+	}}
+
+	// Filter should still return the mapping without a coordinator, but marked
+	// as LOST.
+	got := uut.filter(mpngs)
+	require.Len(t, got, 1)
+	assert.Equal(t, proto.CoordinateResponse_PeerUpdate_LOST, got[0].kind)
+}
+
 // TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
 // that is, clean up peers and associated tunnels that have been lost for over 24 hours.
 func TestLostPeerCleanupQueries(t *testing.T) {

@@ -415,6 +415,52 @@ func TestPGCoordinatorSingle_MissedHeartbeats(t *testing.T) {
 	assertEventuallyLost(ctx, t, store, client.id)
 }

+func TestPGCoordinatorSingle_MissedHeartbeats_NoDrop(t *testing.T) {
+	t.Parallel()
+	if !dbtestutil.WillUsePostgres() {
+		t.Skip("test only with postgres")
+	}
+	store, ps := dbtestutil.NewDB(t)
+	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitSuperLong)
+	defer cancel()
+	logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+	coordinator, err := tailnet.NewPGCoord(ctx, logger, ps, store)
+	require.NoError(t, err)
+	defer coordinator.Close()
+
+	agentID := uuid.New()
+
+	client := agpltest.NewPeer(ctx, t, coordinator, "client")
+	defer client.Close(ctx)
+	client.AddTunnel(agentID)
+	client.UpdateDERP(11)
+
+	// simulate a second coordinator via DB calls only --- our goal is to test
+	// broken heart-beating, so we can't use a real coordinator
+	fCoord2 := &fakeCoordinator{
+		ctx:   ctx,
+		t:     t,
+		store: store,
+		id:    uuid.New(),
+	}
+
+	// simulate a single heartbeat, the coordinator is healthy
+	fCoord2.heartbeat()
+
+	fCoord2.agentNode(agentID, &agpl.Node{PreferredDERP: 12})
+
+	// since it's healthy the client should get the new node.
+	client.AssertEventuallyHasDERP(agentID, 12)
+
+	// the heartbeat should then timeout and we'll get sent a LOST update, NOT a
+	// disconnect.
+	client.AssertEventuallyLost(agentID)
+
+	client.Close(ctx)
+
+	assertEventuallyLost(ctx, t, store, client.ID)
+}
+
 func TestPGCoordinatorSingle_SendsHeartbeats(t *testing.T) {
 	t.Parallel()
 	if !dbtestutil.WillUsePostgres() {
@@ -857,6 +903,16 @@ func newTestAgent(t *testing.T, coord agpl.CoordinatorV1, name string, id ...uui
 	return a
 }

+func newTestClient(t *testing.T, coord agpl.CoordinatorV1, agentID uuid.UUID, id ...uuid.UUID) *testConn {
+	c := newTestConn(id)
+	go func() {
+		err := coord.ServeClient(c.serverWS, c.id, agentID)
+		assert.NoError(t, err)
+		close(c.closeChan)
+	}()
+	return c
+}
+
 func (c *testConn) close() error {
 	return c.ws.Close()
 }
@@ -902,16 +958,6 @@ func (c *testConn) waitForClose(ctx context.Context, t *testing.T) {
 	}
 }

-func newTestClient(t *testing.T, coord agpl.CoordinatorV1, agentID uuid.UUID, id ...uuid.UUID) *testConn {
-	c := newTestConn(id)
-	go func() {
-		err := coord.ServeClient(c.serverWS, c.id, agentID)
-		assert.NoError(t, err)
-		close(c.closeChan)
-	}()
-	return c
-}
-
 func assertEventuallyHasDERPs(ctx context.Context, t *testing.T, c *testConn, expected ...int) {
 	t.Helper()
 	for {