diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 9cd2494ce3..041245825e 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -11,8 +11,10 @@ import ( "net/http" "net/netip" "net/url" + "runtime/pprof" "strconv" "strings" + "sync" "time" "github.com/google/uuid" @@ -291,11 +293,12 @@ func (api *API) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { }) return } - go httpapi.Heartbeat(ctx, conn) - _, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary) + ctx, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary) defer wsNetConn.Close() // Also closes conn. + go httpapi.Heartbeat(ctx, conn) + agentConn, release, err := api.workspaceAgentCache.Acquire(r, workspaceAgent.ID) if err != nil { _ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial workspace agent: %s", err)) @@ -606,11 +609,40 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request }) return } - go httpapi.Heartbeat(ctx, conn) ctx, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary) defer wsNetConn.Close() + // We use a custom heartbeat routine here instead of `httpapi.Heartbeat` + // because we want to log the agent's last ping time. + var lastPing time.Time + var pingMu sync.Mutex + go pprof.Do(ctx, pprof.Labels("agent", workspaceAgent.ID.String()), func(ctx context.Context) { + // TODO(mafredri): Is this too frequent? Use separate ping disconnect timeout? + t := time.NewTicker(api.AgentConnectionUpdateFrequency) + defer t.Stop() + + for { + select { + case <-t.C: + case <-ctx.Done(): + return + } + + // We don't need a context that times out here because the ping will + // eventually go through. If the context times out, then other + // websocket read operations will receive an error, obfuscating the + // actual problem. + err := conn.Ping(ctx) + if err != nil { + return + } + pingMu.Lock() + lastPing = time.Now() + pingMu.Unlock() + } + }) + firstConnectedAt := workspaceAgent.FirstConnectedAt if !firstConnectedAt.Valid { firstConnectedAt = sql.NullTime{ @@ -654,9 +686,12 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request ctx, cancel := context.WithTimeout(dbauthz.AsSystemRestricted(api.ctx), api.AgentInactiveDisconnectTimeout) defer cancel() - disconnectedAt = sql.NullTime{ - Time: database.Now(), - Valid: true, + // Only update timestamp if the disconnect is new. + if !disconnectedAt.Valid { + disconnectedAt = sql.NullTime{ + Time: database.Now(), + Valid: true, + } } err := updateConnectionTimes(ctx) if err != nil { @@ -711,15 +746,37 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request return case <-ticker.C: } - lastConnectedAt = sql.NullTime{ - Time: database.Now(), - Valid: true, + + pingMu.Lock() + lastPing := lastPing + pingMu.Unlock() + + var connectionStatusChanged bool + if time.Since(lastPing) > api.AgentInactiveDisconnectTimeout { + if !disconnectedAt.Valid { + connectionStatusChanged = true + disconnectedAt = sql.NullTime{ + Time: database.Now(), + Valid: true, + } + } + } else { + connectionStatusChanged = disconnectedAt.Valid + // TODO(mafredri): Should we update it here or allow lastConnectedAt to shadow it? + disconnectedAt = sql.NullTime{} + lastConnectedAt = sql.NullTime{ + Time: database.Now(), + Valid: true, + } } err = updateConnectionTimes(ctx) if err != nil { _ = conn.Close(websocket.StatusGoingAway, err.Error()) return } + if connectionStatusChanged { + api.publishWorkspaceUpdate(ctx, build.WorkspaceID) + } err := ensureLatestBuild() if err != nil { // Disconnect agents that are no longer valid.