fix: set node callback each time we reinit the coordinator in servertailnet (#12140)

I think this will resolve #12136, but let's get a proper test at the system level before closing.

Before this change, we only register the node callback at start of day for the server tailnet.  If the coordinator changes, as we know happens when we are licensed for the PGCoordinator, we close the connection to the old coordinator and open a new one to the new coordinator.

The callback is designed to direct the updates to the new coordinator, but there is nothing that specifically triggers it to fire after we connect to the new coordinator.

If we have STUN, then periodic re-STUNs will generally get it to fire eventually, but without STUN we could go indefinitely without a callback.

This PR changes the servertailnet to re-register the callback each time we reconnect to the coordinator.  Registering a callback (even if it's the same callback) triggers an immediate call with our node information, so the new coordinator will have it.
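
To illustrate the mechanism the fix relies on, here is a minimal, self-contained Go sketch — not the actual tailnet implementation, which carries far more state — of a callback registration that fires immediately when node state already exists. All names here (node, conn, setNode, setNodeCallback) are illustrative stand-ins:

// sketch.go: why re-registering the (same) node callback re-sends the node.
package main

import (
	"fmt"
	"sync"
)

// node stands in for *tailnet.Node; fields elided.
type node struct{ id int64 }

type conn struct {
	mu       sync.Mutex
	node     *node
	callback func(*node)
}

// setNode simulates a node update (e.g. from a re-STUN): store the node
// and fire the registered callback, if any.
func (c *conn) setNode(n *node) {
	c.mu.Lock()
	c.node = n
	cb := c.callback
	c.mu.Unlock()
	if cb != nil {
		cb(n)
	}
}

// setNodeCallback registers cb and fires it immediately when a node is
// already known, so a freshly reconnected coordinator receives our node
// without waiting for the next update.
func (c *conn) setNodeCallback(cb func(*node)) {
	c.mu.Lock()
	c.callback = cb
	n := c.node
	c.mu.Unlock()
	if n != nil {
		cb(n)
	}
}

func main() {
	c := &conn{}
	c.setNode(&node{id: 1}) // no callback registered yet, nothing fires
	c.setNodeCallback(func(n *node) { fmt.Println("sent node", n.id) })    // fires immediately
	c.setNodeCallback(func(n *node) { fmt.Println("re-sent node", n.id) }) // fires again on re-registration
}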
Spike Curtis 2024-02-14 20:45:31 +04:00 committed by GitHub
parent 5a0d240bc3
commit 04991f425a
2 changed files with 19 additions and 22 deletions


@@ -136,28 +136,8 @@ func NewServerTailnet(
 		return nil, xerrors.Errorf("get initial multi agent: %w", err)
 	}
 	tn.agentConn.Store(&agentConn)
-	pn, err := tailnet.NodeToProto(conn.Node())
-	if err != nil {
-		tn.logger.Critical(context.Background(), "failed to convert node", slog.Error(err))
-	} else {
-		err = tn.getAgentConn().UpdateSelf(pn)
-		if err != nil {
-			tn.logger.Warn(context.Background(), "server tailnet update self", slog.Error(err))
-		}
-	}
-	conn.SetNodeCallback(func(node *tailnet.Node) {
-		pn, err := tailnet.NodeToProto(node)
-		if err != nil {
-			tn.logger.Critical(context.Background(), "failed to convert node", slog.Error(err))
-			return
-		}
-		err = tn.getAgentConn().UpdateSelf(pn)
-		if err != nil {
-			tn.logger.Warn(context.Background(), "broadcast server node to agents", slog.Error(err))
-		}
-	})
+	// registering the callback also triggers send of the initial node
+	tn.coordinatee.SetNodeCallback(tn.nodeCallback)

 	// This is set to allow local DERP traffic to be proxied through memory
 	// instead of needing to hit the external access URL. Don't use the ctx
@@ -183,6 +163,18 @@ func NewServerTailnet(
 	return tn, nil
 }

+func (s *ServerTailnet) nodeCallback(node *tailnet.Node) {
+	pn, err := tailnet.NodeToProto(node)
+	if err != nil {
+		s.logger.Critical(context.Background(), "failed to convert node", slog.Error(err))
+		return
+	}
+	err = s.getAgentConn().UpdateSelf(pn)
+	if err != nil {
+		s.logger.Warn(context.Background(), "broadcast server node to agents", slog.Error(err))
+	}
+}
+
 func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) {
 	s.connsPerAgent.Describe(descs)
 	s.totalConns.Describe(descs)
@@ -285,6 +277,9 @@ func (s *ServerTailnet) reinitCoordinator() {
 			continue
 		}
 		s.agentConn.Store(&agentConn)
+		// reset the Node callback, which triggers the conn to send the node immediately, and also
+		// register for updates
+		s.coordinatee.SetNodeCallback(s.nodeCallback)

 		// Resubscribe to all of the agents we're tracking.
 		for agentID := range s.agentConnectionTimes {


@@ -48,6 +48,7 @@ func TestServerTailnet_Reconnect(t *testing.T) {
 		agentConnectionTimes: make(map[uuid.UUID]time.Time),
 	}
 	// reinit the Coordinator once, to load mMultiAgent0
+	mCoord.EXPECT().SetNodeCallback(gomock.Any()).Times(1)
 	uut.reinitCoordinator()
 	mMultiAgent0.EXPECT().NextUpdate(gomock.Any()).
@@ -57,6 +58,7 @@ func TestServerTailnet_Reconnect(t *testing.T) {
 		Times(1).
 		Return(true) // this triggers reconnect
 	setLost := mCoord.EXPECT().SetAllPeersLost().Times(1).After(closed0)
+	mCoord.EXPECT().SetNodeCallback(gomock.Any()).Times(1).After(closed0)
 	mMultiAgent1.EXPECT().NextUpdate(gomock.Any()).
 		Times(1).
 		After(setLost).