diff --git a/coderd/healthcheck/accessurl.go b/coderd/healthcheck/accessurl.go
index 6f3b0fdc07..cfcc0ac006 100644
--- a/coderd/healthcheck/accessurl.go
+++ b/coderd/healthcheck/accessurl.go
@@ -7,8 +7,6 @@ import (
"net/url"
"time"
- "golang.org/x/xerrors"
-
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
)
@@ -44,7 +42,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
r.Dismissed = opts.Dismissed
if opts.AccessURL == nil {
- r.Error = ptr.Ref("access URL is nil")
+ r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLNotSet, "Access URL not set"))
r.Severity = health.SeverityError
return
}
@@ -56,21 +54,21 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
accessURL, err := opts.AccessURL.Parse("/healthz")
if err != nil {
- r.Error = convertError(xerrors.Errorf("parse healthz endpoint: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLInvalid, "parse healthz endpoint: %s", err))
r.Severity = health.SeverityError
return
}
req, err := http.NewRequestWithContext(ctx, "GET", accessURL.String(), nil)
if err != nil {
- r.Error = convertError(xerrors.Errorf("create healthz request: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "create healthz request: %s", err))
r.Severity = health.SeverityError
return
}
res, err := opts.Client.Do(req)
if err != nil {
- r.Error = convertError(xerrors.Errorf("get healthz endpoint: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "get healthz endpoint: %s", err))
r.Severity = health.SeverityError
return
}
@@ -78,7 +76,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
body, err := io.ReadAll(res.Body)
if err != nil {
- r.Error = convertError(xerrors.Errorf("read healthz response: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "read healthz response: %s", err))
r.Severity = health.SeverityError
return
}
@@ -88,6 +86,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
r.StatusCode = res.StatusCode
if res.StatusCode != http.StatusOK {
r.Severity = health.SeverityWarning
+ r.Warnings = append(r.Warnings, health.Messagef(health.CodeAccessURLNotOK, "/healthz did not return 200 OK"))
}
r.HealthzResponse = string(body)
}
diff --git a/coderd/healthcheck/accessurl_test.go b/coderd/healthcheck/accessurl_test.go
index 9e368cc679..788fd41481 100644
--- a/coderd/healthcheck/accessurl_test.go
+++ b/coderd/healthcheck/accessurl_test.go
@@ -11,7 +11,6 @@ import (
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
- "github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/healthcheck"
"github.com/coder/coder/v2/coderd/healthcheck/health"
)
@@ -25,12 +24,17 @@ func TestAccessURL(t *testing.T) {
var (
ctx, cancel = context.WithCancel(context.Background())
report healthcheck.AccessURLReport
- client = coderdtest.New(t, nil)
+ resp = []byte("OK")
+ srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ _, _ = w.Write(resp)
+ }))
)
defer cancel()
+		// Shut the test server down when the subtest ends so its listener is released.
+		defer srv.Close()
report.Run(ctx, &healthcheck.AccessURLReportOptions{
- AccessURL: client.URL,
+ Client: srv.Client(),
+ AccessURL: mustURL(t, srv.URL),
})
assert.True(t, report.Healthy)
@@ -41,35 +45,27 @@ func TestAccessURL(t *testing.T) {
assert.Nil(t, report.Error)
})
- t.Run("404", func(t *testing.T) {
+ t.Run("NotSet", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithCancel(context.Background())
report healthcheck.AccessURLReport
- resp = []byte("NOT OK")
- srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(http.StatusNotFound)
- w.Write(resp)
- }))
)
defer cancel()
- defer srv.Close()
-
- u, err := url.Parse(srv.URL)
- require.NoError(t, err)
report.Run(ctx, &healthcheck.AccessURLReportOptions{
- Client: srv.Client(),
- AccessURL: u,
+ Client: nil, // defaults to http.DefaultClient
+ AccessURL: nil,
})
assert.False(t, report.Healthy)
- assert.True(t, report.Reachable)
- assert.Equal(t, health.SeverityWarning, report.Severity)
- assert.Equal(t, http.StatusNotFound, report.StatusCode)
- assert.Equal(t, string(resp), report.HealthzResponse)
- assert.Nil(t, report.Error)
+ assert.False(t, report.Reachable)
+ assert.Equal(t, health.SeverityError, report.Severity)
+ assert.Equal(t, 0, report.StatusCode)
+ assert.Equal(t, "", report.HealthzResponse)
+ require.NotNil(t, report.Error)
+ assert.Contains(t, *report.Error, health.CodeAccessURLNotSet)
})
t.Run("ClientErr", func(t *testing.T) {
@@ -81,7 +77,7 @@ func TestAccessURL(t *testing.T) {
resp = []byte("OK")
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
- w.Write(resp)
+ _, _ = w.Write(resp)
}))
client = srv.Client()
)
@@ -93,12 +89,9 @@ func TestAccessURL(t *testing.T) {
return nil, expErr
})
- u, err := url.Parse(srv.URL)
- require.NoError(t, err)
-
report.Run(ctx, &healthcheck.AccessURLReportOptions{
Client: client,
- AccessURL: u,
+ AccessURL: mustURL(t, srv.URL),
})
assert.False(t, report.Healthy)
@@ -108,6 +101,38 @@ func TestAccessURL(t *testing.T) {
assert.Equal(t, "", report.HealthzResponse)
require.NotNil(t, report.Error)
assert.Contains(t, *report.Error, expErr.Error())
+ assert.Contains(t, *report.Error, health.CodeAccessURLFetch)
+ })
+
+ t.Run("404", func(t *testing.T) {
+ t.Parallel()
+
+ var (
+ ctx, cancel = context.WithCancel(context.Background())
+ report healthcheck.AccessURLReport
+ resp = []byte("NOT OK")
+ srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusNotFound)
+ _, _ = w.Write(resp)
+ }))
+ )
+ defer cancel()
+ defer srv.Close()
+
+ report.Run(ctx, &healthcheck.AccessURLReportOptions{
+ Client: srv.Client(),
+ AccessURL: mustURL(t, srv.URL),
+ })
+
+ assert.False(t, report.Healthy)
+ assert.True(t, report.Reachable)
+ assert.Equal(t, health.SeverityWarning, report.Severity)
+ assert.Equal(t, http.StatusNotFound, report.StatusCode)
+ assert.Equal(t, string(resp), report.HealthzResponse)
+ assert.Nil(t, report.Error)
+ if assert.NotEmpty(t, report.Warnings) {
+ assert.Contains(t, report.Warnings[0], health.CodeAccessURLNotOK)
+ }
})
t.Run("DismissedError", func(t *testing.T) {
@@ -133,3 +158,10 @@ type roundTripFunc func(r *http.Request) (*http.Response, error)
func (rt roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
return rt(r)
}
+
+// mustURL parses s into a *url.URL. A parse error is reported through
+// require.NoError, so callers never receive a nil URL from a bad fixture.
+func mustURL(t testing.TB, s string) *url.URL {
+	t.Helper()
+	parsed, parseErr := url.Parse(s)
+	require.NoError(t, parseErr)
+	return parsed
+}
diff --git a/coderd/healthcheck/database.go b/coderd/healthcheck/database.go
index 3df3fcd972..aa2896f65c 100644
--- a/coderd/healthcheck/database.go
+++ b/coderd/healthcheck/database.go
@@ -4,11 +4,11 @@ import (
"context"
"time"
- "golang.org/x/exp/slices"
- "golang.org/x/xerrors"
-
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/healthcheck/health"
+ "github.com/coder/coder/v2/coderd/util/ptr"
+
+ "golang.org/x/exp/slices"
)
const (
@@ -55,8 +55,9 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
for i := 0; i < pingCount; i++ {
pong, err := opts.DB.Ping(ctx)
if err != nil {
- r.Error = convertError(xerrors.Errorf("ping: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeDatabasePingFailed, "ping database: %s", err))
r.Severity = health.SeverityError
+
return
}
pings = append(pings, pong)
@@ -69,6 +70,7 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
r.LatencyMS = latency.Milliseconds()
if r.LatencyMS >= r.ThresholdMS {
r.Severity = health.SeverityWarning
+ r.Warnings = append(r.Warnings, health.Messagef(health.CodeDatabasePingSlow, "median database ping above threshold"))
}
r.Healthy = true
r.Reachable = true
diff --git a/coderd/healthcheck/database_test.go b/coderd/healthcheck/database_test.go
index 8ac5bbe38c..afa518f270 100644
--- a/coderd/healthcheck/database_test.go
+++ b/coderd/healthcheck/database_test.go
@@ -65,6 +65,7 @@ func TestDatabase(t *testing.T) {
require.NotNil(t, report.Error)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Contains(t, *report.Error, err.Error())
+ assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
})
t.Run("DismissedError", func(t *testing.T) {
@@ -85,6 +86,7 @@ func TestDatabase(t *testing.T) {
assert.Equal(t, health.SeverityError, report.Severity)
assert.True(t, report.Dismissed)
require.NotNil(t, report.Error)
+ assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
})
t.Run("Median", func(t *testing.T) {
@@ -112,6 +114,7 @@ func TestDatabase(t *testing.T) {
assert.EqualValues(t, 1, report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
+ assert.Empty(t, report.Warnings)
})
t.Run("Threshold", func(t *testing.T) {
@@ -139,5 +142,8 @@ func TestDatabase(t *testing.T) {
assert.EqualValues(t, 1000, report.LatencyMS)
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
+ if assert.NotEmpty(t, report.Warnings) {
+ assert.Contains(t, report.Warnings[0], health.CodeDatabasePingSlow)
+ }
})
}
diff --git a/coderd/healthcheck/derphealth/derp.go b/coderd/healthcheck/derphealth/derp.go
index 3f9f78b319..9051cc6e52 100644
--- a/coderd/healthcheck/derphealth/derp.go
+++ b/coderd/healthcheck/derphealth/derp.go
@@ -136,9 +136,7 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
r.Healthy = false
}
- for _, w := range regionReport.Warnings {
- r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", regionReport.Region.RegionName, w))
- }
+ r.Warnings = append(r.Warnings, regionReport.Warnings...)
mu.Unlock()
}()
}
@@ -202,9 +200,7 @@ func (r *RegionReport) Run(ctx context.Context) {
unhealthyNodes++
}
- for _, w := range nodeReport.Warnings {
- r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", nodeReport.Node.Name, w))
- }
+ r.Warnings = append(r.Warnings, nodeReport.Warnings...)
r.mu.Unlock()
}()
}
@@ -228,7 +224,7 @@ func (r *RegionReport) Run(ctx context.Context) {
} else if unhealthyNodes == 1 {
// r.Healthy = true (by default)
r.Severity = health.SeverityWarning
- r.Warnings = append(r.Warnings, oneNodeUnhealthy)
+ r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPOneNodeUnhealthy, oneNodeUnhealthy))
} else if unhealthyNodes > 1 {
r.Healthy = false
@@ -292,7 +288,7 @@ func (r *NodeReport) Run(ctx context.Context) {
}
if r.UsesWebsocket {
- r.Warnings = append(r.Warnings, warningNodeUsesWebsocket)
+ r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPNodeUsesWebsocket, warningNodeUsesWebsocket))
r.Severity = health.SeverityWarning
}
}
diff --git a/coderd/healthcheck/derphealth/derp_test.go b/coderd/healthcheck/derphealth/derp_test.go
index cf307637ac..8a2bf99e76 100644
--- a/coderd/healthcheck/derphealth/derp_test.go
+++ b/coderd/healthcheck/derphealth/derp_test.go
@@ -129,6 +129,9 @@ func TestDERP(t *testing.T) {
assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
assert.True(t, report.Dismissed)
+ if assert.NotEmpty(t, report.Warnings) {
+ assert.Contains(t, report.Warnings[0], health.CodeDERPOneNodeUnhealthy)
+ }
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.True(t, region.NodeReports[0].Healthy)
@@ -232,7 +235,9 @@ func TestDERP(t *testing.T) {
assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
- assert.NotEmpty(t, report.Warnings)
+ if assert.NotEmpty(t, report.Warnings) {
+ assert.Contains(t, report.Warnings[0], health.CodeDERPNodeUsesWebsocket)
+ }
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.Equal(t, health.SeverityWarning, region.Severity)
diff --git a/coderd/healthcheck/health/model.go b/coderd/healthcheck/health/model.go
index 461c9c8f3c..27ff9eae1f 100644
--- a/coderd/healthcheck/health/model.go
+++ b/coderd/healthcheck/health/model.go
@@ -1,9 +1,37 @@
package health
+import (
+ "fmt"
+ "strings"
+)
+
const (
SeverityOK Severity = "ok"
SeverityWarning Severity = "warning"
SeverityError Severity = "error"
+
+ // CodeUnknown is a catch-all health code when something unexpected goes wrong (for example, a panic).
+ CodeUnknown Code = "EUNKNOWN"
+
+	// Workspace proxy codes.
+ CodeProxyUpdate Code = "EWP01"
+ CodeProxyFetch Code = "EWP02"
+ CodeProxyVersionMismatch Code = "EWP03"
+ CodeProxyUnhealthy Code = "EWP04"
+
+	// Database codes.
+ CodeDatabasePingFailed Code = "EDB01"
+ CodeDatabasePingSlow Code = "EDB02"
+
+	// Websocket codes.
+ CodeWebsocketDial Code = "EWS01"
+ CodeWebsocketEcho Code = "EWS02"
+ CodeWebsocketMsg Code = "EWS03"
+
+	// Access URL codes.
+ CodeAccessURLNotSet Code = "EACS01"
+ CodeAccessURLInvalid Code = "EACS02"
+ CodeAccessURLFetch Code = "EACS03"
+ CodeAccessURLNotOK Code = "EACS04"
+
+	// DERP codes.
+	CodeDERPNodeUsesWebsocket Code = "EDERP01"
+	CodeDERPOneNodeUnhealthy  Code = "EDERP02"
)
// @typescript-generate Severity
@@ -18,3 +46,17 @@ var severityRank = map[Severity]int{
func (s Severity) Value() int {
return severityRank[s]
}
+
+// Code is a stable identifier for a healthcheck finding, used to link to documentation.
+// @typescript-generate Code
+type Code string
+
+// Messagef renders a healthcheck message as "<code>: <text>", where <text> is
+// msg expanded with args according to fmt.Sprintf rules.
+func Messagef(code Code, msg string, args ...any) string {
+	var out strings.Builder
+	_, _ = out.WriteString(string(code))
+	_, _ = out.WriteString(": ")
+	_, _ = fmt.Fprintf(&out, msg, args...)
+	return out.String()
+}
diff --git a/coderd/healthcheck/healthcheck.go b/coderd/healthcheck/healthcheck.go
index 9ecb9b9d13..f1d399b889 100644
--- a/coderd/healthcheck/healthcheck.go
+++ b/coderd/healthcheck/healthcheck.go
@@ -2,7 +2,6 @@ package healthcheck
import (
"context"
- "fmt"
"sync"
"time"
@@ -104,7 +103,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
- report.DERP.Error = ptr.Ref(fmt.Sprint(err))
+				// The recovered value can be of any type, so it is formatted with %v.
+				report.DERP.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "derp report panic: %v", err))
}
}()
@@ -116,7 +115,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
- report.AccessURL.Error = ptr.Ref(fmt.Sprint(err))
+				// The recovered value can be of any type, so it is formatted with %v.
+				report.AccessURL.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "access url report panic: %v", err))
}
}()
@@ -128,7 +127,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
- report.Websocket.Error = ptr.Ref(fmt.Sprint(err))
+				// The recovered value can be of any type, so it is formatted with %v.
+				report.Websocket.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "websocket report panic: %v", err))
}
}()
@@ -140,7 +139,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
- report.Database.Error = ptr.Ref(fmt.Sprint(err))
+				// The recovered value can be of any type, so it is formatted with %v.
+				report.Database.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "database report panic: %v", err))
}
}()
@@ -152,7 +151,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
- report.WorkspaceProxy.Error = ptr.Ref(fmt.Sprint(err))
+				// The recovered value can be of any type, so it is formatted with %v.
+				report.WorkspaceProxy.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "proxy report panic: %v", err))
}
}()
diff --git a/coderd/healthcheck/websocket.go b/coderd/healthcheck/websocket.go
index 2a4792c874..372a322bc8 100644
--- a/coderd/healthcheck/websocket.go
+++ b/coderd/healthcheck/websocket.go
@@ -13,6 +13,7 @@ import (
"nhooyr.io/websocket"
"github.com/coder/coder/v2/coderd/healthcheck/health"
+ "github.com/coder/coder/v2/coderd/util/ptr"
)
// @typescript-generate WebsocketReport
@@ -75,6 +76,7 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions)
}
if err != nil {
r.Error = convertError(xerrors.Errorf("websocket dial: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketDial, "websocket dial: %s", err))
r.Severity = health.SeverityError
return
}
@@ -84,26 +86,26 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions)
msg := strconv.Itoa(i)
err := c.Write(ctx, websocket.MessageText, []byte(msg))
if err != nil {
- r.Error = convertError(xerrors.Errorf("write message: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "write message: %s", err))
r.Severity = health.SeverityError
return
}
ty, got, err := c.Read(ctx)
if err != nil {
- r.Error = convertError(xerrors.Errorf("read message: %w", err))
+ r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "read message: %s", err))
r.Severity = health.SeverityError
return
}
if ty != websocket.MessageText {
- r.Error = convertError(xerrors.Errorf("received incorrect message type: %v", ty))
+ r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message type: %v", ty))
r.Severity = health.SeverityError
return
}
if string(got) != msg {
- r.Error = convertError(xerrors.Errorf("received incorrect message: wanted %q, got %q", msg, string(got)))
+ r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message: wanted %q, got %q", msg, string(got)))
r.Severity = health.SeverityError
return
}
diff --git a/coderd/healthcheck/websocket_test.go b/coderd/healthcheck/websocket_test.go
index 1beb96ea06..dd2a42dffb 100644
--- a/coderd/healthcheck/websocket_test.go
+++ b/coderd/healthcheck/websocket_test.go
@@ -63,7 +63,9 @@ func TestWebsocket(t *testing.T) {
APIKey: "test",
})
- require.NotNil(t, wsReport.Error)
+ if assert.NotNil(t, wsReport.Error) {
+ assert.Contains(t, *wsReport.Error, health.CodeWebsocketDial)
+ }
require.Equal(t, health.SeverityError, wsReport.Severity)
assert.Equal(t, wsReport.Body, "test error")
assert.Equal(t, wsReport.Code, http.StatusBadRequest)
diff --git a/coderd/healthcheck/workspaceproxy.go b/coderd/healthcheck/workspaceproxy.go
index 8ab8e86dd4..bfb1b892d9 100644
--- a/coderd/healthcheck/workspaceproxy.go
+++ b/coderd/healthcheck/workspaceproxy.go
@@ -6,12 +6,12 @@ import (
"sort"
"strings"
- "golang.org/x/xerrors"
-
"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"
+
+ "golang.org/x/xerrors"
)
// @typescript-generate WorkspaceProxyReport
@@ -64,7 +64,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
// If this fails, just mark it as a warning. It is still updated in the background.
if err := opts.WorkspaceProxiesFetchUpdater.Update(ctx); err != nil {
r.Severity = health.SeverityWarning
- r.Warnings = append(r.Warnings, xerrors.Errorf("update proxy health: %w", err).Error())
+ r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUpdate, "update proxy health: %s", err))
return
}
@@ -72,7 +72,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
if err != nil {
r.Healthy = false
r.Severity = health.SeverityError
- r.Error = ptr.Ref(err.Error())
+ r.Error = ptr.Ref(health.Messagef(health.CodeProxyFetch, "fetch workspace proxies: %s", err))
return
}
@@ -99,11 +99,13 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
r.Severity = calculateSeverity(total, healthy)
r.Healthy = r.Severity.Value() < health.SeverityError.Value()
- switch r.Severity {
- case health.SeverityWarning, health.SeverityOK:
- r.Warnings = append(r.Warnings, errs...)
- case health.SeverityError:
- r.appendError(errs...)
+	// Proxy error strings are free-form text and may contain '%'. They are
+	// passed as an argument so they are never interpreted as a format string.
+	for _, err := range errs {
+		switch r.Severity {
+		case health.SeverityWarning, health.SeverityOK:
+			r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUnhealthy, "%s", err))
+		case health.SeverityError:
+			r.appendError(health.Messagef(health.CodeProxyUnhealthy, "%s", err))
+		}
}
// Versions _must_ match. Perform this check last. This will clobber any other severity.
@@ -111,7 +113,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
if vErr := checkVersion(proxy, opts.CurrentVersion); vErr != nil {
r.Healthy = false
r.Severity = health.SeverityError
- r.appendError(fmt.Sprintf("%s: %s", proxy.Name, vErr.Error()))
+			// vErr's text is dynamic, so it is passed as an argument and never as the format string.
+			r.appendError(health.Messagef(health.CodeProxyVersionMismatch, "%s", vErr))
}
}
}
diff --git a/coderd/healthcheck/workspaceproxy_test.go b/coderd/healthcheck/workspaceproxy_test.go
index 3cd560ad38..98a75eb237 100644
--- a/coderd/healthcheck/workspaceproxy_test.go
+++ b/coderd/healthcheck/workspaceproxy_test.go
@@ -2,6 +2,7 @@ package healthcheck_test
import (
"context"
+ "strings"
"testing"
"github.com/stretchr/testify/assert"
@@ -26,6 +27,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth func(context.Context) error
expectedHealthy bool
expectedError string
+ expectedWarning string
expectedSeverity health.Severity
}{
{
@@ -53,6 +55,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
+ expectedError: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/OneUnreachable",
@@ -80,7 +83,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
- expectedError: "connect: connection refused",
+ expectedError: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/AllHealthy",
@@ -103,6 +106,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: true,
expectedSeverity: health.SeverityWarning,
+ expectedWarning: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/AllUnhealthy",
@@ -113,6 +117,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
+ expectedError: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/OneOutOfDate",
@@ -150,7 +155,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
- expectedError: assert.AnError.Error(),
+ expectedError: string(health.CodeProxyFetch),
},
{
name: "Enabled/ErrUpdateProxyHealth",
@@ -158,6 +163,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(assert.AnError),
expectedHealthy: true,
expectedSeverity: health.SeverityWarning,
+ expectedWarning: string(health.CodeProxyUpdate),
},
} {
tt := tt
@@ -179,13 +185,22 @@ func TestWorkspaceProxies(t *testing.T) {
assert.Equal(t, tt.expectedHealthy, rpt.Healthy)
assert.Equal(t, tt.expectedSeverity, rpt.Severity)
- if tt.expectedError != "" {
- assert.NotNil(t, rpt.Error)
+ if tt.expectedError != "" && assert.NotNil(t, rpt.Error) {
assert.Contains(t, *rpt.Error, tt.expectedError)
} else {
- if !assert.Nil(t, rpt.Error) {
- assert.Empty(t, *rpt.Error)
+ assert.Nil(t, rpt.Error)
+ }
+ if tt.expectedWarning != "" && assert.NotEmpty(t, rpt.Warnings) {
+ var found bool
+ for _, w := range rpt.Warnings {
+ if strings.Contains(w, tt.expectedWarning) {
+ found = true
+ break
+ }
}
+ assert.True(t, found, "expected warning %s not found in %v", tt.expectedWarning, rpt.Warnings)
+ } else {
+ assert.Empty(t, rpt.Warnings)
}
})
}
@@ -221,13 +236,24 @@ func (u *fakeWorkspaceProxyFetchUpdater) Update(ctx context.Context) error {
return u.updateFunc(ctx)
}
+// fakeWorkspaceProxy returns a codersdk.WorkspaceProxy fixture with the given
+// name, health flag and version. When healthy is false, the proxy's status is
+// set to codersdk.ProxyUnreachable and its report carries assert.AnError's
+// message as its only error; otherwise the status is left at its zero value.
+//
+//nolint:revive // yes, this is a control flag, and that is OK in a unit test.
func fakeWorkspaceProxy(name string, healthy bool, version string) codersdk.WorkspaceProxy {
+ var status codersdk.WorkspaceProxyStatus
+ if !healthy {
+ status = codersdk.WorkspaceProxyStatus{
+ Status: codersdk.ProxyUnreachable,
+ Report: codersdk.ProxyHealthReport{
+ Errors: []string{assert.AnError.Error()},
+ },
+ }
+ }
return codersdk.WorkspaceProxy{
Region: codersdk.Region{
Name: name,
Healthy: healthy,
},
Version: version,
+ Status: status,
}
}
diff --git a/docs/admin/healthcheck.md b/docs/admin/healthcheck.md
new file mode 100644
index 0000000000..4c65fa0fca
--- /dev/null
+++ b/docs/admin/healthcheck.md
@@ -0,0 +1,248 @@
+# Deployment Health
+
+Coder includes an operator-friendly deployment health page that provides a
+number of details about the health of your Coder deployment.
+
+You can view it at `https://${CODER_URL}/health`, or you can alternatively view
+the [JSON response directly](../api/debug.md#debug-info-deployment-health).
+
+The deployment health page is broken up into the following sections:
+
+## Access URL
+
+The Access URL section shows checks related to Coder's
+[access URL](./configure.md#access-url).
+
+Coder will periodically send a GET request to `${CODER_ACCESS_URL}/healthz` and
+validate that the response is `200 OK`. The expected response body is also the
+string `OK`.
+
+If there is an issue, you may see one of the following errors reported:
+
+### EACS01: Access URL not set
+
+**Problem:** No access URL has been configured.
+
+**Solution:** Configure an [access URL](./configure.md#access-url) for Coder.
+
+### EACS02: Access URL invalid
+
+**Problem:** `${CODER_ACCESS_URL}/healthz` is not a valid URL.
+
+**Solution:** Ensure that the access URL is a valid URL accepted by
+[`url.Parse`](https://pkg.go.dev/net/url#Parse). Example:
+`https://dev.coder.com/`.
+
+> [!TIP] You can check this [here](https://go.dev/play/p/CabcJZyTwt9).
+
+### EACS03: Failed to fetch `/healthz`
+
+**Problem:** Coder was unable to execute a GET request to
+`${CODER_ACCESS_URL}/healthz`.
+
+This could be due to a number of reasons, including but not limited to:
+
+- DNS lookup failure
+- A misconfigured firewall
+- A misconfigured reverse proxy
+- Invalid or expired SSL certificates
+
+**Solution:** Investigate and resolve the root cause of the connection issue.
+
+To troubleshoot further, you can log into the machine running Coder and attempt
+to run the following command:
+
+```shell
+curl -v ${CODER_ACCESS_URL}/healthz
+# Expected output:
+# * Trying XXX.XXX.XXX.XXX:443
+# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0)
+# [...]
+# OK
+```
+
+The output of this command should aid further diagnosis.
+
+### EACS04: /healthz did not return 200 OK
+
+**Problem:** Coder was able to execute a GET request to
+`${CODER_ACCESS_URL}/healthz`, but the response code was not `200 OK` as
+expected.
+
+This could mean, for instance, that:
+
+- The request did not actually hit your Coder instance (potentially an incorrect
+ DNS entry)
+- The request hit your Coder instance, but on an unexpected path (potentially a
+ misconfigured reverse proxy)
+
+**Solution:** Inspect the `HealthzResponse` in the health check output. This
+should give you a good indication of the root cause.
+
+## Database
+
+Coder continuously executes a short database query to validate that it can reach
+its configured database, and also measures the median latency over 5 attempts.
+
+### EDB01: Database Ping Failed
+
+**Problem:** This error code is returned if any attempt to execute this database
+query fails.
+
+**Solution:** Investigate the health of the database.
+
+### EDB02: Database Latency High
+
+**Problem:** This code is returned if the median latency reaches or exceeds the
+[configured threshold](../cli/server.md#--health-check-threshold-database). This
+may not be an error as such, but is an indication of a potential issue.
+
+**Solution:** Investigate the sizing of the configured database with regard to
+Coder's current activity and usage. It may be necessary to increase the
+resources allocated to Coder's database. Alternatively, you can raise the
+configured threshold to a higher value (this will not address the root cause).
+
+> [!TIP]
+>
+> - You can enable
+> [detailed database metrics](../cli/server.md#--prometheus-collect-db-metrics)
+> in Coder's Prometheus endpoint.
+> - If you have [tracing enabled](../cli/server.md#--trace), these traces may
+> also contain useful information regarding Coder's database activity.
+
+## DERP
+
+Coder workspace agents may use
+[DERP (Designated Encrypted Relay for Packets)](https://tailscale.com/blog/how-tailscale-works/#encrypted-tcp-relays-derp)
+to communicate with Coder. This requires connectivity to a number of configured
+[DERP servers](../cli/server.md#--derp-config-path) which are used to relay
+traffic between Coder and workspace agents. Coder periodically queries the
+health of its configured DERP servers and may return one or more of the
+following:
+
+### EDERP01: DERP Node Uses Websocket
+
+**Problem:** When Coder attempts to establish a connection to one or more DERP
+servers, it sends a specific `Upgrade: derp` HTTP header. Some load balancers
+may block this header, in which case Coder will fall back to
+`Upgrade: websocket`.
+
+This is not necessarily a fatal error, but a possible indication of a
+misconfigured reverse HTTP proxy. Additionally, while workspace users should
+still be able to reach their workspaces, connection performance may be degraded.
+
+> [!NOTE] This may also be shown if you have
+> [forced websocket connections for DERP](../cli/server.md#--derp-force-websockets).
+
+**Solution:** Ensure that any configured reverse proxy does not strip the
+`Upgrade: derp` header.
+
+### EDERP02: One or more DERP nodes are unhealthy
+
+**Problem:** This is shown if Coder is unable to reach one or more configured
+DERP servers. Clients will fall back to use the remaining DERP servers, but
+performance may be impacted for clients closest to the unhealthy DERP server.
+
+**Solution:** Ensure that the DERP server is available and reachable over the
+network on port 443, for example:
+
+```shell
+curl -v "https://coder.company.com:443/derp"
+# Expected output:
+# * Trying XXX.XXX.XXX.XXX:443
+# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0)
+# DERP requires connection upgrade
+```
+
+## Websocket
+
+Coder makes heavy use of [WebSockets](https://datatracker.ietf.org/doc/rfc6455/)
+for long-lived connections:
+
+- Between users interacting with Coder's Web UI (for example, the built-in
+ terminal, or VSCode Web),
+- Between workspace agents and `coderd`,
+- Between Coder [workspace proxies](../admin/workspace-proxies.md) and `coderd`.
+
+Any issues causing failures to establish WebSocket connections will result in
+**severe** impairment of functionality for users. To validate this
+functionality, Coder will periodically attempt to establish a WebSocket
+connection with itself using the configured [Access URL](#access-url), send a
+message over the connection, and attempt to read back that same message.
+
+### EWS01: Failed to establish a WebSocket connection
+
+**Problem:** Coder was unable to establish a WebSocket connection over its own
+Access URL.
+
+**Solution:** There are multiple possible causes of this problem:
+
+1. Ensure that Coder's configured Access URL can be reached from the server
+ running Coder, using standard troubleshooting tools like `curl`:
+
+ ```shell
+ curl -v "https://coder.company.com:443/"
+ ```
+
+2. Ensure that any reverse proxy that is sitting in front of Coder's configured
+ access URL is not stripping the HTTP header `Upgrade: websocket`.
+
+### EWS02: Failed to echo a WebSocket message
+
+**Problem:** Coder was able to establish a WebSocket connection, but was unable
+to write a message or to read the echoed message back.
+
+**Solution:** There are multiple possible causes of this problem:
+
+1. Validate that any reverse proxy servers in front of Coder's configured access
+ URL are not prematurely closing the connection.
+2. Validate that the network link between Coder and the workspace proxy is
+ stable, e.g. by using `ping`.
+3. Validate that any internal network infrastructure (for example, firewalls,
+   proxies, VPNs) does not interfere with WebSocket connections.
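+
+### EWS03: Unexpected WebSocket message
+
+**Problem:** Coder was able to establish a WebSocket connection and exchange a
+message, but the message it read back was not a text message or did not match
+the message it sent.
+
+**Solution:** Validate that any reverse proxy or other network infrastructure
+in front of Coder's configured access URL does not modify WebSocket messages.
+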
+## Workspace Proxy
+
+If you have configured [Workspace Proxies](../admin/workspace-proxies.md), Coder
+will periodically query their availability and show their status here.
+
+### EWP01: Error Updating Workspace Proxy Health
+
+**Problem:** Coder was unable to query the connected workspace proxies for their
+health status.
+
+**Solution:** This may be a transient issue. If it persists, it could signify a
+connectivity issue.
+
+### EWP02: Error Fetching Workspace Proxies
+
+**Problem:** Coder was unable to fetch the stored workspace proxy health data
+from the database.
+
+**Solution:** This may be a transient issue. If it persists, it could signify an
+issue with Coder's configured database.
+
+### EWP03: Workspace Proxy Version Mismatch
+
+**Problem:** One or more workspace proxies are more than one major or minor
+version out of date with the main deployment. It is important that workspace
+proxies are updated at the same time as the main deployment to minimize the risk
+of API incompatibility.
+
+**Solution:** Update the workspace proxy to match the currently running version
+of Coder.
+
+### EWP04: One or more Workspace Proxies Unhealthy
+
+**Problem:** One or more workspace proxies are not reachable.
+
+**Solution:** Ensure that Coder can establish a connection to the configured
+workspace proxies on port 443.
+
+## Unknown Error
+
+**Problem:** This error is shown when an unexpected error occurred evaluating
+deployment health. It may resolve on its own.
+
+**Solution:** This may be a bug.
+[File a GitHub issue](https://github.com/coder/coder/issues/new)!
diff --git a/docs/images/icons/health.svg b/docs/images/icons/health.svg
new file mode 100644
index 0000000000..9e961a9cb7
--- /dev/null
+++ b/docs/images/icons/health.svg
@@ -0,0 +1,3 @@
+
diff --git a/docs/manifest.json b/docs/manifest.json
index 82628cfc77..eb4276ca99 100644
--- a/docs/manifest.json
+++ b/docs/manifest.json
@@ -433,6 +433,12 @@
"path": "./admin/encryption.md",
"icon_path": "./images/icons/lock.svg",
"state": "enterprise"
+ },
+ {
+ "title": "Deployment Health",
+ "description": "Learn how to monitor the health of your Coder deployment",
+ "path": "./admin/healthcheck.md",
+ "icon_path": "./images/icons/health.svg"
}
]
},
diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts
index 9d930d27f1..21c70c6bb1 100644
--- a/site/src/api/typesGenerated.ts
+++ b/site/src/api/typesGenerated.ts
@@ -2203,6 +2203,43 @@ export const ClibaseValueSources: ClibaseValueSource[] = [
// The code below is generated from coderd/healthcheck/health.
+// From health/model.go
+export type HealthCode =
+ | "EACS01"
+ | "EACS02"
+ | "EACS03"
+ | "EACS04"
+ | "EDB01"
+ | "EDB02"
+ | "EDERP01"
+ | "EDERP02"
+ | "EUNKNOWN"
+ | "EWP01"
+ | "EWP02"
+ | "EWP03"
+ | "EWP04"
+ | "EWS01"
+ | "EWS02"
+ | "EWS03";
+export const HealthCodes: HealthCode[] = [
+ "EACS01",
+ "EACS02",
+ "EACS03",
+ "EACS04",
+ "EDB01",
+ "EDB02",
+ "EDERP01",
+ "EDERP02",
+ "EUNKNOWN",
+ "EWP01",
+ "EWP02",
+ "EWP03",
+ "EWP04",
+ "EWS01",
+ "EWS02",
+ "EWS03",
+];
+
// From health/model.go
export type HealthSeverity = "error" | "ok" | "warning";
export const HealthSeveritys: HealthSeverity[] = ["error", "ok", "warning"];