From 04991f425a43de561e63fa8246dd041f989f867f Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Wed, 14 Feb 2024 20:45:31 +0400 Subject: [PATCH] fix: set node callback each time we reinit the coordinator in servertailnet (#12140) I think this will resolve #12136 but lets get a proper test at the system level before closing. Before this change, we only register the node callback at start of day for the server tailnet. If the coordinator changes, like we know happens when we are licensed for the PGCoordinator, we close the connection to the old coord, and open a new one to the new coord. The callback is designed to direct the updates to the new coordinator, but there is nothing that specifically triggers it to fire after we connect to the new coordinator. If we have STUN, then period re-STUNs will generally get it to fire eventually, but without STUN it we could go indefinitely without a callback. This PR changes the servertailnet to re-register the callback each time we reconnect to the coordinator. Registering a callback (even if it's the same callback) triggers an immediate call with our node information, so the new coordinator will have it. --- coderd/tailnet.go | 39 ++++++++++++++------------------- coderd/tailnet_internal_test.go | 2 ++ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/coderd/tailnet.go b/coderd/tailnet.go index 74b821deb8..0247096d71 100644 --- a/coderd/tailnet.go +++ b/coderd/tailnet.go @@ -136,28 +136,8 @@ func NewServerTailnet( return nil, xerrors.Errorf("get initial multi agent: %w", err) } tn.agentConn.Store(&agentConn) - - pn, err := tailnet.NodeToProto(conn.Node()) - if err != nil { - tn.logger.Critical(context.Background(), "failed to convert node", slog.Error(err)) - } else { - err = tn.getAgentConn().UpdateSelf(pn) - if err != nil { - tn.logger.Warn(context.Background(), "server tailnet update self", slog.Error(err)) - } - } - - conn.SetNodeCallback(func(node *tailnet.Node) { - pn, err := tailnet.NodeToProto(node) - if err != nil { - tn.logger.Critical(context.Background(), "failed to convert node", slog.Error(err)) - return - } - err = tn.getAgentConn().UpdateSelf(pn) - if err != nil { - tn.logger.Warn(context.Background(), "broadcast server node to agents", slog.Error(err)) - } - }) + // registering the callback also triggers send of the initial node + tn.coordinatee.SetNodeCallback(tn.nodeCallback) // This is set to allow local DERP traffic to be proxied through memory // instead of needing to hit the external access URL. Don't use the ctx @@ -183,6 +163,18 @@ func NewServerTailnet( return tn, nil } +func (s *ServerTailnet) nodeCallback(node *tailnet.Node) { + pn, err := tailnet.NodeToProto(node) + if err != nil { + s.logger.Critical(context.Background(), "failed to convert node", slog.Error(err)) + return + } + err = s.getAgentConn().UpdateSelf(pn) + if err != nil { + s.logger.Warn(context.Background(), "broadcast server node to agents", slog.Error(err)) + } +} + func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) { s.connsPerAgent.Describe(descs) s.totalConns.Describe(descs) @@ -285,6 +277,9 @@ func (s *ServerTailnet) reinitCoordinator() { continue } s.agentConn.Store(&agentConn) + // reset the Node callback, which triggers the conn to send the node immediately, and also + // register for updates + s.coordinatee.SetNodeCallback(s.nodeCallback) // Resubscribe to all of the agents we're tracking. for agentID := range s.agentConnectionTimes { diff --git a/coderd/tailnet_internal_test.go b/coderd/tailnet_internal_test.go index f09ac1d28b..f8750dcbe9 100644 --- a/coderd/tailnet_internal_test.go +++ b/coderd/tailnet_internal_test.go @@ -48,6 +48,7 @@ func TestServerTailnet_Reconnect(t *testing.T) { agentConnectionTimes: make(map[uuid.UUID]time.Time), } // reinit the Coordinator once, to load mMultiAgent0 + mCoord.EXPECT().SetNodeCallback(gomock.Any()).Times(1) uut.reinitCoordinator() mMultiAgent0.EXPECT().NextUpdate(gomock.Any()). @@ -57,6 +58,7 @@ func TestServerTailnet_Reconnect(t *testing.T) { Times(1). Return(true) // this triggers reconnect setLost := mCoord.EXPECT().SetAllPeersLost().Times(1).After(closed0) + mCoord.EXPECT().SetNodeCallback(gomock.Any()).Times(1).After(closed0) mMultiAgent1.EXPECT().NextUpdate(gomock.Any()). Times(1). After(setLost).