diff --git a/coderd/healthcheck/accessurl.go b/coderd/healthcheck/accessurl.go index 6f3b0fdc07..cfcc0ac006 100644 --- a/coderd/healthcheck/accessurl.go +++ b/coderd/healthcheck/accessurl.go @@ -7,8 +7,6 @@ import ( "net/url" "time" - "golang.org/x/xerrors" - "github.com/coder/coder/v2/coderd/healthcheck/health" "github.com/coder/coder/v2/coderd/util/ptr" ) @@ -44,7 +42,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions) r.Dismissed = opts.Dismissed if opts.AccessURL == nil { - r.Error = ptr.Ref("access URL is nil") + r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLNotSet, "Access URL not set")) r.Severity = health.SeverityError return } @@ -56,21 +54,21 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions) accessURL, err := opts.AccessURL.Parse("/healthz") if err != nil { - r.Error = convertError(xerrors.Errorf("parse healthz endpoint: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLInvalid, "parse healthz endpoint: %s", err)) r.Severity = health.SeverityError return } req, err := http.NewRequestWithContext(ctx, "GET", accessURL.String(), nil) if err != nil { - r.Error = convertError(xerrors.Errorf("create healthz request: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "create healthz request: %s", err)) r.Severity = health.SeverityError return } res, err := opts.Client.Do(req) if err != nil { - r.Error = convertError(xerrors.Errorf("get healthz endpoint: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "get healthz endpoint: %s", err)) r.Severity = health.SeverityError return } @@ -78,7 +76,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions) body, err := io.ReadAll(res.Body) if err != nil { - r.Error = convertError(xerrors.Errorf("read healthz response: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "read healthz response: %s", err)) r.Severity = health.SeverityError return } @@ -88,6 +86,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions) r.StatusCode = res.StatusCode if res.StatusCode != http.StatusOK { r.Severity = health.SeverityWarning + r.Warnings = append(r.Warnings, health.Messagef(health.CodeAccessURLNotOK, "/healthz did not return 200 OK")) } r.HealthzResponse = string(body) } diff --git a/coderd/healthcheck/accessurl_test.go b/coderd/healthcheck/accessurl_test.go index 9e368cc679..788fd41481 100644 --- a/coderd/healthcheck/accessurl_test.go +++ b/coderd/healthcheck/accessurl_test.go @@ -11,7 +11,6 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/xerrors" - "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/healthcheck" "github.com/coder/coder/v2/coderd/healthcheck/health" ) @@ -25,12 +24,17 @@ func TestAccessURL(t *testing.T) { var ( ctx, cancel = context.WithCancel(context.Background()) report healthcheck.AccessURLReport - client = coderdtest.New(t, nil) + resp = []byte("OK") + srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write(resp) + })) ) defer cancel() report.Run(ctx, &healthcheck.AccessURLReportOptions{ - AccessURL: client.URL, + Client: srv.Client(), + AccessURL: mustURL(t, srv.URL), }) assert.True(t, report.Healthy) @@ -41,35 +45,27 @@ func TestAccessURL(t *testing.T) { assert.Nil(t, report.Error) }) - t.Run("404", func(t *testing.T) { + t.Run("NotSet", func(t *testing.T) { t.Parallel() var ( ctx, cancel = context.WithCancel(context.Background()) report healthcheck.AccessURLReport - resp = []byte("NOT OK") - srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(http.StatusNotFound) - w.Write(resp) - })) ) defer cancel() - defer srv.Close() - - u, err := url.Parse(srv.URL) - require.NoError(t, err) report.Run(ctx, &healthcheck.AccessURLReportOptions{ - Client: srv.Client(), - AccessURL: u, + Client: nil, // defaults to http.DefaultClient + AccessURL: nil, }) assert.False(t, report.Healthy) - assert.True(t, report.Reachable) - assert.Equal(t, health.SeverityWarning, report.Severity) - assert.Equal(t, http.StatusNotFound, report.StatusCode) - assert.Equal(t, string(resp), report.HealthzResponse) - assert.Nil(t, report.Error) + assert.False(t, report.Reachable) + assert.Equal(t, health.SeverityError, report.Severity) + assert.Equal(t, 0, report.StatusCode) + assert.Equal(t, "", report.HealthzResponse) + require.NotNil(t, report.Error) + assert.Contains(t, *report.Error, health.CodeAccessURLNotSet) }) t.Run("ClientErr", func(t *testing.T) { @@ -81,7 +77,7 @@ func TestAccessURL(t *testing.T) { resp = []byte("OK") srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) - w.Write(resp) + _, _ = w.Write(resp) })) client = srv.Client() ) @@ -93,12 +89,9 @@ func TestAccessURL(t *testing.T) { return nil, expErr }) - u, err := url.Parse(srv.URL) - require.NoError(t, err) - report.Run(ctx, &healthcheck.AccessURLReportOptions{ Client: client, - AccessURL: u, + AccessURL: mustURL(t, srv.URL), }) assert.False(t, report.Healthy) @@ -108,6 +101,38 @@ func TestAccessURL(t *testing.T) { assert.Equal(t, "", report.HealthzResponse) require.NotNil(t, report.Error) assert.Contains(t, *report.Error, expErr.Error()) + assert.Contains(t, *report.Error, health.CodeAccessURLFetch) + }) + + t.Run("404", func(t *testing.T) { + t.Parallel() + + var ( + ctx, cancel = context.WithCancel(context.Background()) + report healthcheck.AccessURLReport + resp = []byte("NOT OK") + srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write(resp) + })) + ) + defer cancel() + defer srv.Close() + + report.Run(ctx, &healthcheck.AccessURLReportOptions{ + Client: srv.Client(), + AccessURL: mustURL(t, srv.URL), + }) + + assert.False(t, report.Healthy) + assert.True(t, report.Reachable) + assert.Equal(t, health.SeverityWarning, report.Severity) + assert.Equal(t, http.StatusNotFound, report.StatusCode) + assert.Equal(t, string(resp), report.HealthzResponse) + assert.Nil(t, report.Error) + if assert.NotEmpty(t, report.Warnings) { + assert.Contains(t, report.Warnings[0], health.CodeAccessURLNotOK) + } }) t.Run("DismissedError", func(t *testing.T) { @@ -133,3 +158,10 @@ type roundTripFunc func(r *http.Request) (*http.Response, error) func (rt roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) { return rt(r) } + +func mustURL(t testing.TB, s string) *url.URL { + t.Helper() + u, err := url.Parse(s) + require.NoError(t, err) + return u +} diff --git a/coderd/healthcheck/database.go b/coderd/healthcheck/database.go index 3df3fcd972..aa2896f65c 100644 --- a/coderd/healthcheck/database.go +++ b/coderd/healthcheck/database.go @@ -4,11 +4,11 @@ import ( "context" "time" - "golang.org/x/exp/slices" - "golang.org/x/xerrors" - "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/healthcheck/health" + "github.com/coder/coder/v2/coderd/util/ptr" + + "golang.org/x/exp/slices" ) const ( @@ -55,8 +55,9 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) { for i := 0; i < pingCount; i++ { pong, err := opts.DB.Ping(ctx) if err != nil { - r.Error = convertError(xerrors.Errorf("ping: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeDatabasePingFailed, "ping database: %s", err)) r.Severity = health.SeverityError + return } pings = append(pings, pong) @@ -69,6 +70,7 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) { r.LatencyMS = latency.Milliseconds() if r.LatencyMS >= r.ThresholdMS { r.Severity = health.SeverityWarning + r.Warnings = append(r.Warnings, health.Messagef(health.CodeDatabasePingSlow, "median database ping above threshold")) } r.Healthy = true r.Reachable = true diff --git a/coderd/healthcheck/database_test.go b/coderd/healthcheck/database_test.go index 8ac5bbe38c..afa518f270 100644 --- a/coderd/healthcheck/database_test.go +++ b/coderd/healthcheck/database_test.go @@ -65,6 +65,7 @@ func TestDatabase(t *testing.T) { require.NotNil(t, report.Error) assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS) assert.Contains(t, *report.Error, err.Error()) + assert.Contains(t, *report.Error, health.CodeDatabasePingFailed) }) t.Run("DismissedError", func(t *testing.T) { @@ -85,6 +86,7 @@ func TestDatabase(t *testing.T) { assert.Equal(t, health.SeverityError, report.Severity) assert.True(t, report.Dismissed) require.NotNil(t, report.Error) + assert.Contains(t, *report.Error, health.CodeDatabasePingFailed) }) t.Run("Median", func(t *testing.T) { @@ -112,6 +114,7 @@ func TestDatabase(t *testing.T) { assert.EqualValues(t, 1, report.LatencyMS) assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS) assert.Nil(t, report.Error) + assert.Empty(t, report.Warnings) }) t.Run("Threshold", func(t *testing.T) { @@ -139,5 +142,8 @@ func TestDatabase(t *testing.T) { assert.EqualValues(t, 1000, report.LatencyMS) assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS) assert.Nil(t, report.Error) + if assert.NotEmpty(t, report.Warnings) { + assert.Contains(t, report.Warnings[0], health.CodeDatabasePingSlow) + } }) } diff --git a/coderd/healthcheck/derphealth/derp.go b/coderd/healthcheck/derphealth/derp.go index 3f9f78b319..9051cc6e52 100644 --- a/coderd/healthcheck/derphealth/derp.go +++ b/coderd/healthcheck/derphealth/derp.go @@ -136,9 +136,7 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) { r.Healthy = false } - for _, w := range regionReport.Warnings { - r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", regionReport.Region.RegionName, w)) - } + r.Warnings = append(r.Warnings, regionReport.Warnings...) mu.Unlock() }() } @@ -202,9 +200,7 @@ func (r *RegionReport) Run(ctx context.Context) { unhealthyNodes++ } - for _, w := range nodeReport.Warnings { - r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", nodeReport.Node.Name, w)) - } + r.Warnings = append(r.Warnings, nodeReport.Warnings...) r.mu.Unlock() }() } @@ -228,7 +224,7 @@ func (r *RegionReport) Run(ctx context.Context) { } else if unhealthyNodes == 1 { // r.Healthy = true (by default) r.Severity = health.SeverityWarning - r.Warnings = append(r.Warnings, oneNodeUnhealthy) + r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPOneNodeUnhealthy, oneNodeUnhealthy)) } else if unhealthyNodes > 1 { r.Healthy = false @@ -292,7 +288,7 @@ func (r *NodeReport) Run(ctx context.Context) { } if r.UsesWebsocket { - r.Warnings = append(r.Warnings, warningNodeUsesWebsocket) + r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPNodeUsesWebsocket, warningNodeUsesWebsocket)) r.Severity = health.SeverityWarning } } diff --git a/coderd/healthcheck/derphealth/derp_test.go b/coderd/healthcheck/derphealth/derp_test.go index cf307637ac..8a2bf99e76 100644 --- a/coderd/healthcheck/derphealth/derp_test.go +++ b/coderd/healthcheck/derphealth/derp_test.go @@ -129,6 +129,9 @@ func TestDERP(t *testing.T) { assert.True(t, report.Healthy) assert.Equal(t, health.SeverityWarning, report.Severity) assert.True(t, report.Dismissed) + if assert.NotEmpty(t, report.Warnings) { + assert.Contains(t, report.Warnings[0], health.CodeDERPOneNodeUnhealthy) + } for _, region := range report.Regions { assert.True(t, region.Healthy) assert.True(t, region.NodeReports[0].Healthy) @@ -232,7 +235,9 @@ func TestDERP(t *testing.T) { assert.True(t, report.Healthy) assert.Equal(t, health.SeverityWarning, report.Severity) - assert.NotEmpty(t, report.Warnings) + if assert.NotEmpty(t, report.Warnings) { + assert.Contains(t, report.Warnings[0], health.CodeDERPNodeUsesWebsocket) + } for _, region := range report.Regions { assert.True(t, region.Healthy) assert.Equal(t, health.SeverityWarning, region.Severity) diff --git a/coderd/healthcheck/health/model.go b/coderd/healthcheck/health/model.go index 461c9c8f3c..27ff9eae1f 100644 --- a/coderd/healthcheck/health/model.go +++ b/coderd/healthcheck/health/model.go @@ -1,9 +1,37 @@ package health +import ( + "fmt" + "strings" +) + const ( SeverityOK Severity = "ok" SeverityWarning Severity = "warning" SeverityError Severity = "error" + + // CodeUnknown is a catch-all health code when something unexpected goes wrong (for example, a panic). + CodeUnknown Code = "EUNKNOWN" + + CodeProxyUpdate Code = "EWP01" + CodeProxyFetch Code = "EWP02" + CodeProxyVersionMismatch Code = "EWP03" + CodeProxyUnhealthy Code = "EWP04" + + CodeDatabasePingFailed Code = "EDB01" + CodeDatabasePingSlow Code = "EDB02" + + CodeWebsocketDial Code = "EWS01" + CodeWebsocketEcho Code = "EWS02" + CodeWebsocketMsg Code = "EWS03" + + CodeAccessURLNotSet Code = "EACS01" + CodeAccessURLInvalid Code = "EACS02" + CodeAccessURLFetch Code = "EACS03" + CodeAccessURLNotOK Code = "EACS04" + + CodeDERPNodeUsesWebsocket Code = `EDERP01` + CodeDERPOneNodeUnhealthy Code = `EDERP02` ) // @typescript-generate Severity @@ -18,3 +46,17 @@ var severityRank = map[Severity]int{ func (s Severity) Value() int { return severityRank[s] } + +// Code is a stable identifier used to link to documentation. +// @typescript-generate Code +type Code string + +// Messagef is a convenience function for formatting a healthcheck error message. +func Messagef(code Code, msg string, args ...any) string { + var sb strings.Builder + _, _ = sb.WriteString(string(code)) + _, _ = sb.WriteRune(':') + _, _ = sb.WriteRune(' ') + _, _ = sb.WriteString(fmt.Sprintf(msg, args...)) + return sb.String() +} diff --git a/coderd/healthcheck/healthcheck.go b/coderd/healthcheck/healthcheck.go index 9ecb9b9d13..f1d399b889 100644 --- a/coderd/healthcheck/healthcheck.go +++ b/coderd/healthcheck/healthcheck.go @@ -2,7 +2,6 @@ package healthcheck import ( "context" - "fmt" "sync" "time" @@ -104,7 +103,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report { defer wg.Done() defer func() { if err := recover(); err != nil { - report.DERP.Error = ptr.Ref(fmt.Sprint(err)) + report.DERP.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "derp report panic: %s", err)) } }() @@ -116,7 +115,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report { defer wg.Done() defer func() { if err := recover(); err != nil { - report.AccessURL.Error = ptr.Ref(fmt.Sprint(err)) + report.AccessURL.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "access url report panic: %s", err)) } }() @@ -128,7 +127,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report { defer wg.Done() defer func() { if err := recover(); err != nil { - report.Websocket.Error = ptr.Ref(fmt.Sprint(err)) + report.Websocket.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "websocket report panic: %s", err)) } }() @@ -140,7 +139,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report { defer wg.Done() defer func() { if err := recover(); err != nil { - report.Database.Error = ptr.Ref(fmt.Sprint(err)) + report.Database.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "database report panic: %s", err)) } }() @@ -152,7 +151,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report { defer wg.Done() defer func() { if err := recover(); err != nil { - report.WorkspaceProxy.Error = ptr.Ref(fmt.Sprint(err)) + report.WorkspaceProxy.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "proxy report panic: %s", err)) } }() diff --git a/coderd/healthcheck/websocket.go b/coderd/healthcheck/websocket.go index 2a4792c874..372a322bc8 100644 --- a/coderd/healthcheck/websocket.go +++ b/coderd/healthcheck/websocket.go @@ -13,6 +13,7 @@ import ( "nhooyr.io/websocket" "github.com/coder/coder/v2/coderd/healthcheck/health" + "github.com/coder/coder/v2/coderd/util/ptr" ) // @typescript-generate WebsocketReport @@ -75,6 +76,7 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions) } if err != nil { r.Error = convertError(xerrors.Errorf("websocket dial: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketDial, "websocket dial: %s", err)) r.Severity = health.SeverityError return } @@ -84,26 +86,26 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions) msg := strconv.Itoa(i) err := c.Write(ctx, websocket.MessageText, []byte(msg)) if err != nil { - r.Error = convertError(xerrors.Errorf("write message: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "write message: %s", err)) r.Severity = health.SeverityError return } ty, got, err := c.Read(ctx) if err != nil { - r.Error = convertError(xerrors.Errorf("read message: %w", err)) + r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "read message: %s", err)) r.Severity = health.SeverityError return } if ty != websocket.MessageText { - r.Error = convertError(xerrors.Errorf("received incorrect message type: %v", ty)) + r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message type: %v", ty)) r.Severity = health.SeverityError return } if string(got) != msg { - r.Error = convertError(xerrors.Errorf("received incorrect message: wanted %q, got %q", msg, string(got))) + r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message: wanted %q, got %q", msg, string(got))) r.Severity = health.SeverityError return } diff --git a/coderd/healthcheck/websocket_test.go b/coderd/healthcheck/websocket_test.go index 1beb96ea06..dd2a42dffb 100644 --- a/coderd/healthcheck/websocket_test.go +++ b/coderd/healthcheck/websocket_test.go @@ -63,7 +63,9 @@ func TestWebsocket(t *testing.T) { APIKey: "test", }) - require.NotNil(t, wsReport.Error) + if assert.NotNil(t, wsReport.Error) { + assert.Contains(t, *wsReport.Error, health.CodeWebsocketDial) + } require.Equal(t, health.SeverityError, wsReport.Severity) assert.Equal(t, wsReport.Body, "test error") assert.Equal(t, wsReport.Code, http.StatusBadRequest) diff --git a/coderd/healthcheck/workspaceproxy.go b/coderd/healthcheck/workspaceproxy.go index 8ab8e86dd4..bfb1b892d9 100644 --- a/coderd/healthcheck/workspaceproxy.go +++ b/coderd/healthcheck/workspaceproxy.go @@ -6,12 +6,12 @@ import ( "sort" "strings" - "golang.org/x/xerrors" - "github.com/coder/coder/v2/buildinfo" "github.com/coder/coder/v2/coderd/healthcheck/health" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/codersdk" + + "golang.org/x/xerrors" ) // @typescript-generate WorkspaceProxyReport @@ -64,7 +64,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo // If this fails, just mark it as a warning. It is still updated in the background. if err := opts.WorkspaceProxiesFetchUpdater.Update(ctx); err != nil { r.Severity = health.SeverityWarning - r.Warnings = append(r.Warnings, xerrors.Errorf("update proxy health: %w", err).Error()) + r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUpdate, "update proxy health: %s", err)) return } @@ -72,7 +72,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo if err != nil { r.Healthy = false r.Severity = health.SeverityError - r.Error = ptr.Ref(err.Error()) + r.Error = ptr.Ref(health.Messagef(health.CodeProxyFetch, "fetch workspace proxies: %s", err)) return } @@ -99,11 +99,13 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo r.Severity = calculateSeverity(total, healthy) r.Healthy = r.Severity.Value() < health.SeverityError.Value() - switch r.Severity { - case health.SeverityWarning, health.SeverityOK: - r.Warnings = append(r.Warnings, errs...) - case health.SeverityError: - r.appendError(errs...) + for _, err := range errs { + switch r.Severity { + case health.SeverityWarning, health.SeverityOK: + r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUnhealthy, err)) + case health.SeverityError: + r.appendError(health.Messagef(health.CodeProxyUnhealthy, err)) + } } // Versions _must_ match. Perform this check last. This will clobber any other severity. @@ -111,7 +113,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo if vErr := checkVersion(proxy, opts.CurrentVersion); vErr != nil { r.Healthy = false r.Severity = health.SeverityError - r.appendError(fmt.Sprintf("%s: %s", proxy.Name, vErr.Error())) + r.appendError(health.Messagef(health.CodeProxyVersionMismatch, vErr.Error())) } } } diff --git a/coderd/healthcheck/workspaceproxy_test.go b/coderd/healthcheck/workspaceproxy_test.go index 3cd560ad38..98a75eb237 100644 --- a/coderd/healthcheck/workspaceproxy_test.go +++ b/coderd/healthcheck/workspaceproxy_test.go @@ -2,6 +2,7 @@ package healthcheck_test import ( "context" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -26,6 +27,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth func(context.Context) error expectedHealthy bool expectedError string + expectedWarning string expectedSeverity health.Severity }{ { @@ -53,6 +55,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth: fakeUpdateProxyHealth(nil), expectedHealthy: false, expectedSeverity: health.SeverityError, + expectedError: string(health.CodeProxyUnhealthy), }, { name: "Enabled/OneUnreachable", @@ -80,7 +83,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth: fakeUpdateProxyHealth(nil), expectedHealthy: false, expectedSeverity: health.SeverityError, - expectedError: "connect: connection refused", + expectedError: string(health.CodeProxyUnhealthy), }, { name: "Enabled/AllHealthy", @@ -103,6 +106,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth: fakeUpdateProxyHealth(nil), expectedHealthy: true, expectedSeverity: health.SeverityWarning, + expectedWarning: string(health.CodeProxyUnhealthy), }, { name: "Enabled/AllUnhealthy", @@ -113,6 +117,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth: fakeUpdateProxyHealth(nil), expectedHealthy: false, expectedSeverity: health.SeverityError, + expectedError: string(health.CodeProxyUnhealthy), }, { name: "Enabled/OneOutOfDate", @@ -150,7 +155,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth: fakeUpdateProxyHealth(nil), expectedHealthy: false, expectedSeverity: health.SeverityError, - expectedError: assert.AnError.Error(), + expectedError: string(health.CodeProxyFetch), }, { name: "Enabled/ErrUpdateProxyHealth", @@ -158,6 +163,7 @@ func TestWorkspaceProxies(t *testing.T) { updateProxyHealth: fakeUpdateProxyHealth(assert.AnError), expectedHealthy: true, expectedSeverity: health.SeverityWarning, + expectedWarning: string(health.CodeProxyUpdate), }, } { tt := tt @@ -179,13 +185,22 @@ func TestWorkspaceProxies(t *testing.T) { assert.Equal(t, tt.expectedHealthy, rpt.Healthy) assert.Equal(t, tt.expectedSeverity, rpt.Severity) - if tt.expectedError != "" { - assert.NotNil(t, rpt.Error) + if tt.expectedError != "" && assert.NotNil(t, rpt.Error) { assert.Contains(t, *rpt.Error, tt.expectedError) } else { - if !assert.Nil(t, rpt.Error) { - assert.Empty(t, *rpt.Error) + assert.Nil(t, rpt.Error) + } + if tt.expectedWarning != "" && assert.NotEmpty(t, rpt.Warnings) { + var found bool + for _, w := range rpt.Warnings { + if strings.Contains(w, tt.expectedWarning) { + found = true + break + } } + assert.True(t, found, "expected warning %s not found in %v", tt.expectedWarning, rpt.Warnings) + } else { + assert.Empty(t, rpt.Warnings) } }) } @@ -221,13 +236,24 @@ func (u *fakeWorkspaceProxyFetchUpdater) Update(ctx context.Context) error { return u.updateFunc(ctx) } +//nolint:revive // yes, this is a control flag, and that is OK in a unit test. func fakeWorkspaceProxy(name string, healthy bool, version string) codersdk.WorkspaceProxy { + var status codersdk.WorkspaceProxyStatus + if !healthy { + status = codersdk.WorkspaceProxyStatus{ + Status: codersdk.ProxyUnreachable, + Report: codersdk.ProxyHealthReport{ + Errors: []string{assert.AnError.Error()}, + }, + } + } return codersdk.WorkspaceProxy{ Region: codersdk.Region{ Name: name, Healthy: healthy, }, Version: version, + Status: status, } } diff --git a/docs/admin/healthcheck.md b/docs/admin/healthcheck.md new file mode 100644 index 0000000000..4c65fa0fca --- /dev/null +++ b/docs/admin/healthcheck.md @@ -0,0 +1,248 @@ +# Deployment Health + +Coder includes an operator-friendly deployment health page that provides a +number of details about the health of your Coder deployment. + +You can view it at `https://${CODER_URL}/health`, or you can alternatively view +the [JSON response directly](../api/debug.md#debug-info-deployment-health). + +The deployment health page is broken up into the following sections: + +## Access URL + +The Access URL section shows checks related to Coder's +[access URL](./configure.md#access-url). + +Coder will periodically send a GET request to `${CODER_ACCESS_URL}/healthz` and +validate that the response is `200 OK`. The expected response body is also the +string `OK`. + +If there is an issue, you may see one of the following errors reported: + +### EACS01: Access URL not set + +**Problem:** no access URL has been configured. + +**Solution:** configure an [access URL](./configure.md#access-url) for Coder. + +### EACS02: Access URL invalid + +**Problem:** `${CODER_ACCESS_URL}/healthz` is not a valid URL. + +**Solution:** Ensure that the access URL is a valid URL accepted by +[`url.Parse`](https://pkg.go.dev/net/url#Parse). Example: +`https://dev.coder.com/`. + +> [!TIP] You can check this [here](https://go.dev/play/p/CabcJZyTwt9). + +### EACS03: Failed to fetch `/healthz` + +**Problem:** Coder was unable to execute a GET request to +`${CODER_ACCESS_URL}/healthz`. + +This could be due to a number of reasons, including but not limited to: + +- DNS lookup failure +- A misconfigured firewall +- A misconfigured reverse proxy +- Invalid or expired SSL certificates + +**Solution:** Investigate and resolve the root cause of the connection issue. + +To troubleshoot further, you can log into the machine running Coder and attempt +to run the following command: + +```shell +curl -v ${CODER_ACCESS_URL}/healthz +# Expected output: +# * Trying XXX.XXX.XXX.XXX:443 +# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0) +# [...] +# OK +``` + +The output of this command should aid further diagnosis. + +### EACS04: /healthz did not return 200 OK + +**Problem:** Coder was able to execute a GET request to +`${CODER_ACCESS_URL}/healthz`, but the response code was not `200 OK` as +expected. + +This could mean, for instance, that: + +- The request did not actually hit your Coder instance (potentially an incorrect + DNS entry) +- The request hit your Coder instance, but on an unexpected path (potentially a + misconfigured reverse proxy) + +**Solution:** Inspect the `HealthzResponse` in the health check output. This +should give you a good indication of the root cause. + +## Database + +Coder continuously executes a short database query to validate that it can reach +its configured database, and also measures the median latency over 5 attempts. + +### EDB01: Database Ping Failed + +**Problem:** This error code is returned if any attempt to execute this database +query fails. + +**Solution:** Investigate the health of the database. + +### EDB02: Database Latency High + +**Problem:** This code is returned if the median latency is higher than the +[configured threshold](../cli/server.md#--health-check-threshold-database). This +may not be an error as such, but is an indication of a potential issue. + +**Solution:** Investigate the sizing of the configured database with regard to +Coder's current activity and usage. It may be necessary to increase the +resources allocated to Coder's database. Alternatively, you can raise the +configured threshold to a higher value (this will not address the root cause). + +> [!TIP] +> +> - You can enable +> [detailed database metrics](../cli/server.md#--prometheus-collect-db-metrics) +> in Coder's Prometheus endpoint. +> - If you have [tracing enabled](../cli/server.md#--trace), these traces may +> also contain useful information regarding Coder's database activity. + +## DERP + +Coder workspace agents may use +[DERP (Designated Encrypted Relay for Packets)](https://tailscale.com/blog/how-tailscale-works/#encrypted-tcp-relays-derp) +to communicate with Coder. This requires connectivity to a number of configured +[DERP servers](../cli/server.md#--derp-config-path) which are used to relay +traffic between Coder and workspace agents. Coder periodically queries the +health of its configured DERP servers and may return one or more of the +following: + +### EDERP01: DERP Node Uses Websocket + +**Problem:** When Coder attempts to establish a connection to one or more DERP +servers, it sends a specific `Upgrade: derp` HTTP header. Some load balancers +may block this header, in which case Coder will fall back to +`Upgrade: websocket`. + +This is not necessarily a fatal error, but a possible indication of a +misconfigured reverse HTTP proxy. Additionally, while workspace users should +still be able to reach their workspaces, connection performance may be degraded. + +> [!NOTE] This may also be shown if you have +> [forced websocket connections for DERP](../cli/server.md#--derp-force-websockets). + +**Solution:** ensure that any configured reverse proxy does not strip the +`Upgrade: derp` header. + +### EDERP02: One or more DERP nodes are unhealthy + +**Problem:** This is shown if Coder is unable to reach one or more configured +DERP servers. Clients will fall back to use the remaining DERP servers, but +performance may be impacted for clients closest to the unhealthy DERP server. + +**Solution:** Ensure that the DERP server is available and reachable over the +network on port 443, for example: + +```shell +curl -v "https://coder.company.com:443/derp" +# Expected output: +# * Trying XXX.XXX.XXX.XXX:443 +# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0) +# DERP requires connection upgrade +``` + +## Websocket + +Coder makes heavy use of [WebSockets](https://datatracker.ietf.org/doc/rfc6455/) +for long-lived connections: + +- Between users interacting with Coder's Web UI (for example, the built-in + terminal, or VSCode Web), +- Between workspace agents and `coderd`, +- Between Coder [workspace proxies](../admin/workspace-proxies.md) and `coderd`. + +Any issues causing failures to establish WebSocket connections will result in +**severe** impairment of functionality for users. To validate this +functionality, Coder will periodically attempt to establish a WebSocket +connection with itself using the configured [Access URL](#access-url), send a +message over the connection, and attempt to read back that same message. + +### EWS01: Failed to establish a WebSocket connection + +**Problem:** Coder was unable to establish a WebSocket connection over its own +Access URL. + +**Solution:** There are multiple possible causes of this problem: + +1. Ensure that Coder's configured Access URL can be reached from the server + running Coder, using standard troubleshooting tools like `curl`: + + ```shell + curl -v "https://coder.company.com:443/" + ``` + +2. Ensure that any reverse proxy that is sitting in front of Coder's configured + access URL is not stripping the HTTP header `Upgrade: websocket`. + +### EWS02: Failed to echo a WebSocket message + +**Problem:** Coder was able to establish a WebSocket connection, but was unable +to write a message. + +**Solution:** There are multiple possible causes of this problem: + +1. Validate that any reverse proxy servers in front of Coder's configured access + URL are not prematurely closing the connection. +2. Validate that the network link between Coder and the workspace proxy is + stable, e.g. by using `ping`. +3. Validate that any internal network infrastructure (for example, firewalls, + proxies, VPNs) do not interfere with WebSocket connections. + +## Workspace Proxy + +If you have configured [Workspace Proxies](../admin/workspace-proxies.md), Coder +will periodically query their availability and show their status here. + +### EWP01: Error Updating Workspace Proxy Health + +**Problem:** Coder was unable to query the connected workspace proxies for their +health status. + +**Solution:** This may be a transient issue. If it persists, it could signify a +connectivity issue. + +### EWP02: Error Fetching Workspace Proxies + +**Problem:** Coder was unable to fetch the stored workspace proxy health data +from the database. + +**Solution:** This may be a transient issue. If it persists, it could signify an +issue with Coder's configured database. + +### EWP03: Workspace Proxy Version Mismatch + +**Problem:** One or more workspace proxies are more than one major or minor +version out of date with the main deployment. It is important that workspace +proxies are updated at the same time as the main deployment to minimize the risk +of API incompatibility. + +**Solution:** Update the workspace proxy to match the currently running version +of Coder. + +### EWP04: One or more Workspace Proxies Unhealthy + +**Problem:** One or more workspace proxies are not reachable. + +**Solution:** Ensure that Coder can establish a connection to the configured +workspace proxies on port 443. + +## Unknown Error + +**Problem:** This error is shown when an unexpected error occurred evaluating +deployment health. It may resolve on its own. + +**Solution:** This may be a bug. +[File a GitHub issue](https://github.com/coder/coder/issues/new)! diff --git a/docs/images/icons/health.svg b/docs/images/icons/health.svg new file mode 100644 index 0000000000..9e961a9cb7 --- /dev/null +++ b/docs/images/icons/health.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/manifest.json b/docs/manifest.json index 82628cfc77..eb4276ca99 100644 --- a/docs/manifest.json +++ b/docs/manifest.json @@ -433,6 +433,12 @@ "path": "./admin/encryption.md", "icon_path": "./images/icons/lock.svg", "state": "enterprise" + }, + { + "title": "Deployment Health", + "description": "Learn how to monitor the health of your Coder deployment", + "path": "./admin/healthcheck.md", + "icon_path": "./images/icons/health.svg" } ] }, diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 9d930d27f1..21c70c6bb1 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -2203,6 +2203,43 @@ export const ClibaseValueSources: ClibaseValueSource[] = [ // The code below is generated from coderd/healthcheck/health. +// From health/model.go +export type HealthCode = + | "EACS01" + | "EACS02" + | "EACS03" + | "EACS04" + | "EDB01" + | "EDB02" + | "EDERP01" + | "EDERP02" + | "EUNKNOWN" + | "EWP01" + | "EWP02" + | "EWP03" + | "EWP04" + | "EWS01" + | "EWS02" + | "EWS03"; +export const HealthCodes: HealthCode[] = [ + "EACS01", + "EACS02", + "EACS03", + "EACS04", + "EDB01", + "EDB02", + "EDERP01", + "EDERP02", + "EUNKNOWN", + "EWP01", + "EWP02", + "EWP03", + "EWP04", + "EWS01", + "EWS02", + "EWS03", +]; + // From health/model.go export type HealthSeverity = "error" | "ok" | "warning"; export const HealthSeveritys: HealthSeverity[] = ["error", "ok", "warning"];