From d2ae16dd224eccac0881e201cf130ba7e9d632c7 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 23 Jan 2023 14:05:29 -0600 Subject: [PATCH] fix: routinely ping agent websocket to ensure liveness (#5824) --- agent/agent.go | 3 +++ cli/agent.go | 1 + codersdk/workspaceagents.go | 36 +++++++++++++++++++++++++++++++ provisionerd/provisionerd_test.go | 6 ++---- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 47d9c394a8..95ef8d713e 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -430,6 +430,9 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_ // runCoordinator runs a coordinator and returns whether a reconnect // should occur. func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + coordinator, err := a.client.ListenWorkspaceAgent(ctx) if err != nil { return err diff --git a/cli/agent.go b/cli/agent.go index 95744aa340..2edc472bd6 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -83,6 +83,7 @@ func workspaceAgent() *cobra.Command { slog.F("version", version), ) client := codersdk.New(coderURL) + client.Logger = logger // Set a reasonable timeout so requests can't hang forever! client.HTTPClient.Timeout = 10 * time.Second diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 93ac907ab4..5f033bddb3 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -340,6 +340,42 @@ func (c *Client) ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) { return nil, readBodyAsError(res) } + // Ping once every 30 seconds to ensure that the websocket is alive. If we + // don't get a response within 30s we kill the websocket and reconnect. + // See: https://github.com/coder/coder/pull/5824 + go func() { + tick := 30 * time.Second + ticker := time.NewTicker(tick) + defer ticker.Stop() + defer func() { + c.Logger.Debug(ctx, "coordinate pinger exited") + }() + for { + select { + case <-ctx.Done(): + return + case start := <-ticker.C: + ctx, cancel := context.WithTimeout(ctx, tick) + + err := conn.Ping(ctx) + if err != nil { + c.Logger.Error(ctx, "workspace agent coordinate ping", slog.Error(err)) + + err := conn.Close(websocket.StatusGoingAway, "Ping failed") + if err != nil { + c.Logger.Error(ctx, "close workspace agent coordinate websocket", slog.Error(err)) + } + + cancel() + return + } + + c.Logger.Debug(ctx, "got coordinate pong", slog.F("took", time.Since(start))) + cancel() + } + } + }() + return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil } diff --git a/provisionerd/provisionerd_test.go b/provisionerd/provisionerd_test.go index d65dceb581..44884800fa 100644 --- a/provisionerd/provisionerd_test.go +++ b/provisionerd/provisionerd_test.go @@ -12,9 +12,6 @@ import ( "testing" "time" - "github.com/coder/coder/provisionerd/runner" - "github.com/coder/coder/testutil" - "github.com/hashicorp/yamux" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -26,11 +23,12 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/provisionerd" "github.com/coder/coder/provisionerd/proto" + "github.com/coder/coder/provisionerd/runner" "github.com/coder/coder/provisionersdk" sdkproto "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/testutil" ) func TestMain(m *testing.M) {