fix(enterprise): mark nodes from unhealthy coordinators as lost (#13123)

Instead of removing the mappings of unhealthy coordinators entirely,
mark them as lost. This prevents peers from disappearing from other
peers' view of the network when a coordinator misses a heartbeat.
Colin Adler 2024-05-03 14:07:29 -05:00 committed by GitHub
parent a3c23ed313
commit 205c43da99
3 changed files with 104 additions and 15 deletions
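In essence, the fix changes the heartbeats filter from dropping mappings of unknown coordinators to keeping them with a downgraded kind. Below is a minimal, runnable sketch of that behavior; the names (filterSketch, updateKind) and the simplified string IDs are illustrative stand-ins, not the real pgcoord definitions in the diff that follows.

package main

import "fmt"

type updateKind string

const (
	kindNode updateKind = "NODE"
	kindLost updateKind = "LOST"
)

type mapping struct {
	peer        string
	coordinator string
	kind        updateKind
}

// filterSketch mirrors the new behavior: mappings from the local or a
// healthy coordinator pass through unchanged, while mappings from a
// coordinator that missed its heartbeats are kept but marked LOST
// instead of being dropped.
func filterSketch(self string, healthy map[string]bool, mappings []mapping) []mapping {
	out := make([]mapping, 0, len(mappings))
	for _, m := range mappings {
		if m.coordinator != self && !healthy[m.coordinator] {
			m.kind = kindLost // previously: the mapping was skipped entirely
		}
		out = append(out, m)
	}
	return out
}

func main() {
	mappings := []mapping{
		{peer: "p1", coordinator: "c1", kind: kindNode},
		{peer: "p2", coordinator: "dead", kind: kindNode},
	}
	// p1 stays NODE; p2 is marked LOST rather than disappearing.
	fmt.Println(filterSketch("self", map[string]bool{"c1": true}, mappings))
}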

@@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
 		ok := m.coordinator == h.self
 		if !ok {
 			_, ok = h.coordinators[m.coordinator]
+			if !ok {
+				// If a mapping exists to a coordinator lost to heartbeats,
+				// still add the mapping as LOST. If a coordinator misses
+				// heartbeats but a client is still connected to it, this may be
+				// the only mapping available for it. Newer mappings will take
+				// precedence.
+				m.kind = proto.CoordinateResponse_PeerUpdate_LOST
+			}
 		}
-		if ok {
-			out = append(out, m)
-		}
+		out = append(out, m)
 	}
 	return out
 }

@@ -11,6 +11,7 @@ import (
 	"time"

 	"github.com/google/uuid"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 	"go.uber.org/mock/gomock"
 	"golang.org/x/xerrors"
@@ -33,9 +34,9 @@ import (
 // make update-golden-files
 var UpdateGoldenFiles = flag.Bool("update", false, "update .golden files")

-// TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
+// TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
 // cleanup.
-func TestHeartbeat_Cleanup(t *testing.T) {
+func TestHeartbeats_Cleanup(t *testing.T) {
 	t.Parallel()

 	ctrl := gomock.NewController(t)
@@ -78,6 +79,41 @@ func TestHeartbeat_Cleanup(t *testing.T) {
 	close(waitForCleanup)
 }

+func TestHeartbeats_LostCoordinator_MarkLost(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	mStore := dbmock.NewMockStore(ctrl)
+
+	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
+	defer cancel()
+	logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+
+	uut := &heartbeats{
+		ctx:           ctx,
+		logger:        logger,
+		store:         mStore,
+		cleanupPeriod: time.Millisecond,
+		coordinators: map[uuid.UUID]time.Time{
+			uuid.New(): time.Now(),
+		},
+	}
+
+	mpngs := []mapping{{
+		peer:        uuid.New(),
+		coordinator: uuid.New(),
+		updatedAt:   time.Now(),
+		node:        &proto.Node{},
+		kind:        proto.CoordinateResponse_PeerUpdate_NODE,
+	}}
+
+	// Filter should still return the mapping without a coordinator, but marked
+	// as LOST.
+	got := uut.filter(mpngs)
+	require.Len(t, got, 1)
+	assert.Equal(t, proto.CoordinateResponse_PeerUpdate_LOST, got[0].kind)
+}
+
 // TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
 // that is, clean up peers and associated tunnels that have been lost for over 24 hours.
 func TestLostPeerCleanupQueries(t *testing.T) {

@@ -415,6 +415,52 @@ func TestPGCoordinatorSingle_MissedHeartbeats(t *testing.T) {
 	assertEventuallyLost(ctx, t, store, client.id)
 }

+func TestPGCoordinatorSingle_MissedHeartbeats_NoDrop(t *testing.T) {
+	t.Parallel()
+	if !dbtestutil.WillUsePostgres() {
+		t.Skip("test only with postgres")
+	}
+	store, ps := dbtestutil.NewDB(t)
+	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitSuperLong)
+	defer cancel()
+	logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+	coordinator, err := tailnet.NewPGCoord(ctx, logger, ps, store)
+	require.NoError(t, err)
+	defer coordinator.Close()
+
+	agentID := uuid.New()
+
+	client := agpltest.NewPeer(ctx, t, coordinator, "client")
+	defer client.Close(ctx)
+	client.AddTunnel(agentID)
+	client.UpdateDERP(11)
+
+	// simulate a second coordinator via DB calls only --- our goal is to test
+	// broken heart-beating, so we can't use a real coordinator
+	fCoord2 := &fakeCoordinator{
+		ctx:   ctx,
+		t:     t,
+		store: store,
+		id:    uuid.New(),
+	}
+
+	// simulate a single heartbeat, the coordinator is healthy
+	fCoord2.heartbeat()
+
+	fCoord2.agentNode(agentID, &agpl.Node{PreferredDERP: 12})
+
+	// since it's healthy the client should get the new node.
+	client.AssertEventuallyHasDERP(agentID, 12)
+
+	// the heartbeat should then timeout and we'll get sent a LOST update, NOT a
+	// disconnect.
+	client.AssertEventuallyLost(agentID)
+
+	client.Close(ctx)
+
+	assertEventuallyLost(ctx, t, store, client.ID)
+}
+
 func TestPGCoordinatorSingle_SendsHeartbeats(t *testing.T) {
 	t.Parallel()
 	if !dbtestutil.WillUsePostgres() {
@@ -857,6 +903,16 @@ func newTestAgent(t *testing.T, coord agpl.CoordinatorV1, name string, id ...uui
 	return a
 }

+func newTestClient(t *testing.T, coord agpl.CoordinatorV1, agentID uuid.UUID, id ...uuid.UUID) *testConn {
+	c := newTestConn(id)
+	go func() {
+		err := coord.ServeClient(c.serverWS, c.id, agentID)
+		assert.NoError(t, err)
+		close(c.closeChan)
+	}()
+	return c
+}
+
 func (c *testConn) close() error {
 	return c.ws.Close()
 }
@@ -902,16 +958,6 @@ func (c *testConn) waitForClose(ctx context.Context, t *testing.T) {
 	}
 }

-func newTestClient(t *testing.T, coord agpl.CoordinatorV1, agentID uuid.UUID, id ...uuid.UUID) *testConn {
-	c := newTestConn(id)
-	go func() {
-		err := coord.ServeClient(c.serverWS, c.id, agentID)
-		assert.NoError(t, err)
-		close(c.closeChan)
-	}()
-	return c
-}
-
 func assertEventuallyHasDERPs(ctx context.Context, t *testing.T, c *testConn, expected ...int) {
 	t.Helper()
 	for {