feat(coderd/healthcheck): add access URL error codes and healthcheck doc (#10915)

Relates to #8965

- Added error codes for separate code paths in health checks
- Prefixed errors and warnings with error code prefixes
- Added a docs page with details on each code, cause and solution

Co-authored-by: Muhammad Atif Ali <atif@coder.com>
This commit is contained in:
Cian Johnston 2023-11-30 12:15:40 +00:00 committed by GitHub
parent 5b2f43619b
commit 4f9292859d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 479 additions and 72 deletions

View File

@ -7,8 +7,6 @@ import (
"net/url"
"time"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
)
@ -44,7 +42,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
r.Dismissed = opts.Dismissed
if opts.AccessURL == nil {
r.Error = ptr.Ref("access URL is nil")
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLNotSet, "Access URL not set"))
r.Severity = health.SeverityError
return
}
@ -56,21 +54,21 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
accessURL, err := opts.AccessURL.Parse("/healthz")
if err != nil {
r.Error = convertError(xerrors.Errorf("parse healthz endpoint: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLInvalid, "parse healthz endpoint: %s", err))
r.Severity = health.SeverityError
return
}
req, err := http.NewRequestWithContext(ctx, "GET", accessURL.String(), nil)
if err != nil {
r.Error = convertError(xerrors.Errorf("create healthz request: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "create healthz request: %s", err))
r.Severity = health.SeverityError
return
}
res, err := opts.Client.Do(req)
if err != nil {
r.Error = convertError(xerrors.Errorf("get healthz endpoint: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "get healthz endpoint: %s", err))
r.Severity = health.SeverityError
return
}
@ -78,7 +76,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
body, err := io.ReadAll(res.Body)
if err != nil {
r.Error = convertError(xerrors.Errorf("read healthz response: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "read healthz response: %s", err))
r.Severity = health.SeverityError
return
}
@ -88,6 +86,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
r.StatusCode = res.StatusCode
if res.StatusCode != http.StatusOK {
r.Severity = health.SeverityWarning
r.Warnings = append(r.Warnings, health.Messagef(health.CodeAccessURLNotOK, "/healthz did not return 200 OK"))
}
r.HealthzResponse = string(body)
}

View File

@ -11,7 +11,6 @@ import (
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/healthcheck"
"github.com/coder/coder/v2/coderd/healthcheck/health"
)
@ -25,12 +24,17 @@ func TestAccessURL(t *testing.T) {
var (
ctx, cancel = context.WithCancel(context.Background())
report healthcheck.AccessURLReport
client = coderdtest.New(t, nil)
resp = []byte("OK")
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write(resp)
}))
)
defer cancel()
report.Run(ctx, &healthcheck.AccessURLReportOptions{
AccessURL: client.URL,
Client: srv.Client(),
AccessURL: mustURL(t, srv.URL),
})
assert.True(t, report.Healthy)
@ -41,35 +45,27 @@ func TestAccessURL(t *testing.T) {
assert.Nil(t, report.Error)
})
t.Run("404", func(t *testing.T) {
t.Run("NotSet", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithCancel(context.Background())
report healthcheck.AccessURLReport
resp = []byte("NOT OK")
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
w.Write(resp)
}))
)
defer cancel()
defer srv.Close()
u, err := url.Parse(srv.URL)
require.NoError(t, err)
report.Run(ctx, &healthcheck.AccessURLReportOptions{
Client: srv.Client(),
AccessURL: u,
Client: nil, // defaults to http.DefaultClient
AccessURL: nil,
})
assert.False(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, health.SeverityWarning, report.Severity)
assert.Equal(t, http.StatusNotFound, report.StatusCode)
assert.Equal(t, string(resp), report.HealthzResponse)
assert.Nil(t, report.Error)
assert.False(t, report.Reachable)
assert.Equal(t, health.SeverityError, report.Severity)
assert.Equal(t, 0, report.StatusCode)
assert.Equal(t, "", report.HealthzResponse)
require.NotNil(t, report.Error)
assert.Contains(t, *report.Error, health.CodeAccessURLNotSet)
})
t.Run("ClientErr", func(t *testing.T) {
@ -81,7 +77,7 @@ func TestAccessURL(t *testing.T) {
resp = []byte("OK")
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write(resp)
_, _ = w.Write(resp)
}))
client = srv.Client()
)
@ -93,12 +89,9 @@ func TestAccessURL(t *testing.T) {
return nil, expErr
})
u, err := url.Parse(srv.URL)
require.NoError(t, err)
report.Run(ctx, &healthcheck.AccessURLReportOptions{
Client: client,
AccessURL: u,
AccessURL: mustURL(t, srv.URL),
})
assert.False(t, report.Healthy)
@ -108,6 +101,38 @@ func TestAccessURL(t *testing.T) {
assert.Equal(t, "", report.HealthzResponse)
require.NotNil(t, report.Error)
assert.Contains(t, *report.Error, expErr.Error())
assert.Contains(t, *report.Error, health.CodeAccessURLFetch)
})
t.Run("404", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithCancel(context.Background())
report healthcheck.AccessURLReport
resp = []byte("NOT OK")
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
_, _ = w.Write(resp)
}))
)
defer cancel()
defer srv.Close()
report.Run(ctx, &healthcheck.AccessURLReportOptions{
Client: srv.Client(),
AccessURL: mustURL(t, srv.URL),
})
assert.False(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, health.SeverityWarning, report.Severity)
assert.Equal(t, http.StatusNotFound, report.StatusCode)
assert.Equal(t, string(resp), report.HealthzResponse)
assert.Nil(t, report.Error)
if assert.NotEmpty(t, report.Warnings) {
assert.Contains(t, report.Warnings[0], health.CodeAccessURLNotOK)
}
})
t.Run("DismissedError", func(t *testing.T) {
@ -133,3 +158,10 @@ type roundTripFunc func(r *http.Request) (*http.Response, error)
// RoundTrip has the method signature required by http.RoundTripper. It calls
// the underlying function with the request and hands back its results as-is.
func (rt roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
	resp, err := rt(r)
	return resp, err
}
// mustURL parses s with url.Parse and returns the result. If parsing reports
// an error, require.NoError fails the calling test.
func mustURL(t testing.TB, s string) *url.URL {
	t.Helper()
	parsed, parseErr := url.Parse(s)
	require.NoError(t, parseErr)
	return parsed
}

View File

@ -4,11 +4,11 @@ import (
"context"
"time"
"golang.org/x/exp/slices"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
"golang.org/x/exp/slices"
)
const (
@ -55,8 +55,9 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
for i := 0; i < pingCount; i++ {
pong, err := opts.DB.Ping(ctx)
if err != nil {
r.Error = convertError(xerrors.Errorf("ping: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeDatabasePingFailed, "ping database: %s", err))
r.Severity = health.SeverityError
return
}
pings = append(pings, pong)
@ -69,6 +70,7 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
r.LatencyMS = latency.Milliseconds()
if r.LatencyMS >= r.ThresholdMS {
r.Severity = health.SeverityWarning
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDatabasePingSlow, "median database ping above threshold"))
}
r.Healthy = true
r.Reachable = true

View File

@ -65,6 +65,7 @@ func TestDatabase(t *testing.T) {
require.NotNil(t, report.Error)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Contains(t, *report.Error, err.Error())
assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
})
t.Run("DismissedError", func(t *testing.T) {
@ -85,6 +86,7 @@ func TestDatabase(t *testing.T) {
assert.Equal(t, health.SeverityError, report.Severity)
assert.True(t, report.Dismissed)
require.NotNil(t, report.Error)
assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
})
t.Run("Median", func(t *testing.T) {
@ -112,6 +114,7 @@ func TestDatabase(t *testing.T) {
assert.EqualValues(t, 1, report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
assert.Empty(t, report.Warnings)
})
t.Run("Threshold", func(t *testing.T) {
@ -139,5 +142,8 @@ func TestDatabase(t *testing.T) {
assert.EqualValues(t, 1000, report.LatencyMS)
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
if assert.NotEmpty(t, report.Warnings) {
assert.Contains(t, report.Warnings[0], health.CodeDatabasePingSlow)
}
})
}

View File

@ -136,9 +136,7 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
r.Healthy = false
}
for _, w := range regionReport.Warnings {
r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", regionReport.Region.RegionName, w))
}
r.Warnings = append(r.Warnings, regionReport.Warnings...)
mu.Unlock()
}()
}
@ -202,9 +200,7 @@ func (r *RegionReport) Run(ctx context.Context) {
unhealthyNodes++
}
for _, w := range nodeReport.Warnings {
r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", nodeReport.Node.Name, w))
}
r.Warnings = append(r.Warnings, nodeReport.Warnings...)
r.mu.Unlock()
}()
}
@ -228,7 +224,7 @@ func (r *RegionReport) Run(ctx context.Context) {
} else if unhealthyNodes == 1 {
// r.Healthy = true (by default)
r.Severity = health.SeverityWarning
r.Warnings = append(r.Warnings, oneNodeUnhealthy)
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPOneNodeUnhealthy, oneNodeUnhealthy))
} else if unhealthyNodes > 1 {
r.Healthy = false
@ -292,7 +288,7 @@ func (r *NodeReport) Run(ctx context.Context) {
}
if r.UsesWebsocket {
r.Warnings = append(r.Warnings, warningNodeUsesWebsocket)
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPNodeUsesWebsocket, warningNodeUsesWebsocket))
r.Severity = health.SeverityWarning
}
}

View File

@ -129,6 +129,9 @@ func TestDERP(t *testing.T) {
assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
assert.True(t, report.Dismissed)
if assert.NotEmpty(t, report.Warnings) {
assert.Contains(t, report.Warnings[0], health.CodeDERPOneNodeUnhealthy)
}
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.True(t, region.NodeReports[0].Healthy)
@ -232,7 +235,9 @@ func TestDERP(t *testing.T) {
assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
assert.NotEmpty(t, report.Warnings)
if assert.NotEmpty(t, report.Warnings) {
assert.Contains(t, report.Warnings[0], health.CodeDERPNodeUsesWebsocket)
}
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.Equal(t, health.SeverityWarning, region.Severity)

View File

@ -1,9 +1,37 @@
package health
import (
"fmt"
"strings"
)
const (
	// Severity levels that a health report can carry.
	SeverityOK      Severity = "ok"
	SeverityWarning Severity = "warning"
	SeverityError   Severity = "error"

	// CodeUnknown is a catch-all health code when something unexpected goes wrong (for example, a panic).
	CodeUnknown Code = "EUNKNOWN"

	// Workspace proxy health codes.
	CodeProxyUpdate          Code = "EWP01"
	CodeProxyFetch           Code = "EWP02"
	CodeProxyVersionMismatch Code = "EWP03"
	CodeProxyUnhealthy       Code = "EWP04"

	// Database health codes.
	CodeDatabasePingFailed Code = "EDB01"
	CodeDatabasePingSlow   Code = "EDB02"

	// Websocket health codes.
	CodeWebsocketDial Code = "EWS01"
	CodeWebsocketEcho Code = "EWS02"
	CodeWebsocketMsg  Code = "EWS03"

	// Access URL health codes.
	CodeAccessURLNotSet  Code = "EACS01"
	CodeAccessURLInvalid Code = "EACS02"
	CodeAccessURLFetch   Code = "EACS03"
	CodeAccessURLNotOK   Code = "EACS04"

	// DERP health codes.
	CodeDERPNodeUsesWebsocket Code = "EDERP01"
	CodeDERPOneNodeUnhealthy  Code = "EDERP02"
)
// @typescript-generate Severity
@ -18,3 +46,17 @@ var severityRank = map[Severity]int{
// Value returns the integer stored for s in severityRank. A severity that is
// not a key of that map yields 0, the map's zero value.
func (s Severity) Value() int {
	rank := severityRank[s]
	return rank
}
// Code is a stable identifier used to link to documentation.
// @typescript-generate Code
type Code string

// Messagef is a convenience function for formatting a healthcheck error
// message. The result always has the form "<code>: <message>".
//
// When args are supplied, msg is a fmt-style format string and is expanded
// with them. When no args are supplied, msg is written verbatim instead of
// being passed through fmt: callers frequently hand in pre-rendered text such
// as an error string, and any literal '%' in it (for example a URL-escaped
// "%20") would otherwise be rewritten into "%!x(MISSING)" noise.
func Messagef(code Code, msg string, args ...any) string {
	var sb strings.Builder
	_, _ = sb.WriteString(string(code))
	_, _ = sb.WriteString(": ")
	if len(args) == 0 {
		// Nothing to interpolate, so keep the caller's text byte-for-byte.
		_, _ = sb.WriteString(msg)
		return sb.String()
	}
	_, _ = fmt.Fprintf(&sb, msg, args...)
	return sb.String()
}

View File

@ -2,7 +2,6 @@ package healthcheck
import (
"context"
"fmt"
"sync"
"time"
@ -104,7 +103,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
report.DERP.Error = ptr.Ref(fmt.Sprint(err))
report.DERP.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "derp report panic: %s", err))
}
}()
@ -116,7 +115,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
report.AccessURL.Error = ptr.Ref(fmt.Sprint(err))
report.AccessURL.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "access url report panic: %s", err))
}
}()
@ -128,7 +127,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
report.Websocket.Error = ptr.Ref(fmt.Sprint(err))
report.Websocket.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "websocket report panic: %s", err))
}
}()
@ -140,7 +139,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
report.Database.Error = ptr.Ref(fmt.Sprint(err))
report.Database.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "database report panic: %s", err))
}
}()
@ -152,7 +151,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
report.WorkspaceProxy.Error = ptr.Ref(fmt.Sprint(err))
report.WorkspaceProxy.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "proxy report panic: %s", err))
}
}()

View File

@ -13,6 +13,7 @@ import (
"nhooyr.io/websocket"
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
)
// @typescript-generate WebsocketReport
@ -75,6 +76,7 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions)
}
if err != nil {
r.Error = convertError(xerrors.Errorf("websocket dial: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketDial, "websocket dial: %s", err))
r.Severity = health.SeverityError
return
}
@ -84,26 +86,26 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions)
msg := strconv.Itoa(i)
err := c.Write(ctx, websocket.MessageText, []byte(msg))
if err != nil {
r.Error = convertError(xerrors.Errorf("write message: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "write message: %s", err))
r.Severity = health.SeverityError
return
}
ty, got, err := c.Read(ctx)
if err != nil {
r.Error = convertError(xerrors.Errorf("read message: %w", err))
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "read message: %s", err))
r.Severity = health.SeverityError
return
}
if ty != websocket.MessageText {
r.Error = convertError(xerrors.Errorf("received incorrect message type: %v", ty))
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message type: %v", ty))
r.Severity = health.SeverityError
return
}
if string(got) != msg {
r.Error = convertError(xerrors.Errorf("received incorrect message: wanted %q, got %q", msg, string(got)))
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message: wanted %q, got %q", msg, string(got)))
r.Severity = health.SeverityError
return
}

View File

@ -63,7 +63,9 @@ func TestWebsocket(t *testing.T) {
APIKey: "test",
})
require.NotNil(t, wsReport.Error)
if assert.NotNil(t, wsReport.Error) {
assert.Contains(t, *wsReport.Error, health.CodeWebsocketDial)
}
require.Equal(t, health.SeverityError, wsReport.Severity)
assert.Equal(t, wsReport.Body, "test error")
assert.Equal(t, wsReport.Code, http.StatusBadRequest)

View File

@ -6,12 +6,12 @@ import (
"sort"
"strings"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"
"golang.org/x/xerrors"
)
// @typescript-generate WorkspaceProxyReport
@ -64,7 +64,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
// If this fails, just mark it as a warning. It is still updated in the background.
if err := opts.WorkspaceProxiesFetchUpdater.Update(ctx); err != nil {
r.Severity = health.SeverityWarning
r.Warnings = append(r.Warnings, xerrors.Errorf("update proxy health: %w", err).Error())
r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUpdate, "update proxy health: %s", err))
return
}
@ -72,7 +72,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
if err != nil {
r.Healthy = false
r.Severity = health.SeverityError
r.Error = ptr.Ref(err.Error())
r.Error = ptr.Ref(health.Messagef(health.CodeProxyFetch, "fetch workspace proxies: %s", err))
return
}
@ -99,11 +99,13 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
r.Severity = calculateSeverity(total, healthy)
r.Healthy = r.Severity.Value() < health.SeverityError.Value()
switch r.Severity {
case health.SeverityWarning, health.SeverityOK:
r.Warnings = append(r.Warnings, errs...)
case health.SeverityError:
r.appendError(errs...)
for _, err := range errs {
switch r.Severity {
case health.SeverityWarning, health.SeverityOK:
r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUnhealthy, err))
case health.SeverityError:
r.appendError(health.Messagef(health.CodeProxyUnhealthy, err))
}
}
// Versions _must_ match. Perform this check last. This will clobber any other severity.
@ -111,7 +113,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
if vErr := checkVersion(proxy, opts.CurrentVersion); vErr != nil {
r.Healthy = false
r.Severity = health.SeverityError
r.appendError(fmt.Sprintf("%s: %s", proxy.Name, vErr.Error()))
r.appendError(health.Messagef(health.CodeProxyVersionMismatch, vErr.Error()))
}
}
}

View File

@ -2,6 +2,7 @@ package healthcheck_test
import (
"context"
"strings"
"testing"
"github.com/stretchr/testify/assert"
@ -26,6 +27,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth func(context.Context) error
expectedHealthy bool
expectedError string
expectedWarning string
expectedSeverity health.Severity
}{
{
@ -53,6 +55,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
expectedError: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/OneUnreachable",
@ -80,7 +83,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
expectedError: "connect: connection refused",
expectedError: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/AllHealthy",
@ -103,6 +106,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: true,
expectedSeverity: health.SeverityWarning,
expectedWarning: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/AllUnhealthy",
@ -113,6 +117,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
expectedError: string(health.CodeProxyUnhealthy),
},
{
name: "Enabled/OneOutOfDate",
@ -150,7 +155,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(nil),
expectedHealthy: false,
expectedSeverity: health.SeverityError,
expectedError: assert.AnError.Error(),
expectedError: string(health.CodeProxyFetch),
},
{
name: "Enabled/ErrUpdateProxyHealth",
@ -158,6 +163,7 @@ func TestWorkspaceProxies(t *testing.T) {
updateProxyHealth: fakeUpdateProxyHealth(assert.AnError),
expectedHealthy: true,
expectedSeverity: health.SeverityWarning,
expectedWarning: string(health.CodeProxyUpdate),
},
} {
tt := tt
@ -179,13 +185,22 @@ func TestWorkspaceProxies(t *testing.T) {
assert.Equal(t, tt.expectedHealthy, rpt.Healthy)
assert.Equal(t, tt.expectedSeverity, rpt.Severity)
if tt.expectedError != "" {
assert.NotNil(t, rpt.Error)
if tt.expectedError != "" && assert.NotNil(t, rpt.Error) {
assert.Contains(t, *rpt.Error, tt.expectedError)
} else {
if !assert.Nil(t, rpt.Error) {
assert.Empty(t, *rpt.Error)
assert.Nil(t, rpt.Error)
}
if tt.expectedWarning != "" && assert.NotEmpty(t, rpt.Warnings) {
var found bool
for _, w := range rpt.Warnings {
if strings.Contains(w, tt.expectedWarning) {
found = true
break
}
}
assert.True(t, found, "expected warning %s not found in %v", tt.expectedWarning, rpt.Warnings)
} else {
assert.Empty(t, rpt.Warnings)
}
})
}
@ -221,13 +236,24 @@ func (u *fakeWorkspaceProxyFetchUpdater) Update(ctx context.Context) error {
return u.updateFunc(ctx)
}
// fakeWorkspaceProxy builds a codersdk.WorkspaceProxy with the given name,
// health flag and version. An unhealthy proxy additionally gets a
// ProxyUnreachable status whose report carries assert.AnError's message; a
// healthy proxy keeps the zero-value status.
//
//nolint:revive // yes, this is a control flag, and that is OK in a unit test.
func fakeWorkspaceProxy(name string, healthy bool, version string) codersdk.WorkspaceProxy {
	proxy := codersdk.WorkspaceProxy{
		Region: codersdk.Region{
			Name:    name,
			Healthy: healthy,
		},
		Version: version,
	}
	if healthy {
		return proxy
	}
	proxy.Status = codersdk.WorkspaceProxyStatus{
		Status: codersdk.ProxyUnreachable,
		Report: codersdk.ProxyHealthReport{
			Errors: []string{assert.AnError.Error()},
		},
	}
	return proxy
}

248
docs/admin/healthcheck.md Normal file
View File

@ -0,0 +1,248 @@
# Deployment Health
Coder includes an operator-friendly deployment health page that provides a
number of details about the health of your Coder deployment.
You can view it at `https://${CODER_URL}/health`, or you can alternatively view
the [JSON response directly](../api/debug.md#debug-info-deployment-health).
The deployment health page is broken up into the following sections:
## Access URL
The Access URL section shows checks related to Coder's
[access URL](./configure.md#access-url).
Coder will periodically send a GET request to `${CODER_ACCESS_URL}/healthz` and
validate that the response is `200 OK`. The expected response body is also the
string `OK`.
If there is an issue, you may see one of the following errors reported:
### <a name="EACS01">EACS01: Access URL not set</a>
**Problem:** No access URL has been configured.
**Solution:** Configure an [access URL](./configure.md#access-url) for Coder.
### <a name="EACS02">EACS02: Access URL invalid</a>
**Problem:** `${CODER_ACCESS_URL}/healthz` is not a valid URL.
**Solution:** Ensure that the access URL is a valid URL accepted by
[`url.Parse`](https://pkg.go.dev/net/url#Parse). Example:
`https://dev.coder.com/`.
> [!TIP] You can check this [here](https://go.dev/play/p/CabcJZyTwt9).
### <a name="EACS03">EACS03: Failed to fetch `/healthz`</a>
**Problem:** Coder was unable to execute a GET request to
`${CODER_ACCESS_URL}/healthz`.
This could be due to a number of reasons, including but not limited to:
- DNS lookup failure
- A misconfigured firewall
- A misconfigured reverse proxy
- Invalid or expired SSL certificates
**Solution:** Investigate and resolve the root cause of the connection issue.
To troubleshoot further, you can log into the machine running Coder and attempt
to run the following command:
```shell
curl -v ${CODER_ACCESS_URL}/healthz
# Expected output:
# * Trying XXX.XXX.XXX.XXX:443
# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0)
# [...]
# OK
```
The output of this command should aid further diagnosis.
### <a name="EACS04">EACS04: /healthz did not return 200 OK</a>
**Problem:** Coder was able to execute a GET request to
`${CODER_ACCESS_URL}/healthz`, but the response code was not `200 OK` as
expected.
This could mean, for instance, that:
- The request did not actually hit your Coder instance (potentially an incorrect
DNS entry)
- The request hit your Coder instance, but on an unexpected path (potentially a
misconfigured reverse proxy)
**Solution:** Inspect the `HealthzResponse` in the health check output. This
should give you a good indication of the root cause.
## Database
Coder continuously executes a short database query to validate that it can reach
its configured database, and also measures the median latency over 5 attempts.
### <a name="EDB01">EDB01: Database Ping Failed</a>
**Problem:** This error code is returned if any attempt to execute this database
query fails.
**Solution:** Investigate the health of the database.
### <a name="EDB02">EDB02: Database Latency High</a>
**Problem:** This code is returned if the median latency is equal to or higher
than the
[configured threshold](../cli/server.md#--health-check-threshold-database). This
may not be an error as such, but is an indication of a potential issue.
**Solution:** Investigate the sizing of the configured database with regard to
Coder's current activity and usage. It may be necessary to increase the
resources allocated to Coder's database. Alternatively, you can raise the
configured threshold to a higher value (this will not address the root cause).
> [!TIP]
>
> - You can enable
> [detailed database metrics](../cli/server.md#--prometheus-collect-db-metrics)
> in Coder's Prometheus endpoint.
> - If you have [tracing enabled](../cli/server.md#--trace), these traces may
> also contain useful information regarding Coder's database activity.
## DERP
Coder workspace agents may use
[DERP (Designated Encrypted Relay for Packets)](https://tailscale.com/blog/how-tailscale-works/#encrypted-tcp-relays-derp)
to communicate with Coder. This requires connectivity to a number of configured
[DERP servers](../cli/server.md#--derp-config-path) which are used to relay
traffic between Coder and workspace agents. Coder periodically queries the
health of its configured DERP servers and may return one or more of the
following:
### <a name="EDERP01">EDERP01: DERP Node Uses Websocket</a>
**Problem:** When Coder attempts to establish a connection to one or more DERP
servers, it sends a specific `Upgrade: derp` HTTP header. Some load balancers
may block this header, in which case Coder will fall back to
`Upgrade: websocket`.
This is not necessarily a fatal error, but a possible indication of a
misconfigured reverse HTTP proxy. Additionally, while workspace users should
still be able to reach their workspaces, connection performance may be degraded.
> [!NOTE] This may also be shown if you have
> [forced websocket connections for DERP](../cli/server.md#--derp-force-websockets).
**Solution:** ensure that any configured reverse proxy does not strip the
`Upgrade: derp` header.
### <a name="EDERP02">EDERP02: One or more DERP nodes are unhealthy</a>
**Problem:** This is shown if Coder is unable to reach one or more configured
DERP servers. Clients will fall back to use the remaining DERP servers, but
performance may be impacted for clients closest to the unhealthy DERP server.
**Solution:** Ensure that the DERP server is available and reachable over the
network on port 443, for example:
```shell
curl -v "https://coder.company.com:443/derp"
# Expected output:
# * Trying XXX.XXX.XXX.XXX:443
# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0)
# DERP requires connection upgrade
```
## Websocket
Coder makes heavy use of [WebSockets](https://datatracker.ietf.org/doc/rfc6455/)
for long-lived connections:
- Between users interacting with Coder's Web UI (for example, the built-in
terminal, or VSCode Web),
- Between workspace agents and `coderd`,
- Between Coder [workspace proxies](../admin/workspace-proxies.md) and `coderd`.
Any issues causing failures to establish WebSocket connections will result in
**severe** impairment of functionality for users. To validate this
functionality, Coder will periodically attempt to establish a WebSocket
connection with itself using the configured [Access URL](#access-url), send a
message over the connection, and attempt to read back that same message.
### <a name="EWS01">EWS01: Failed to establish a WebSocket connection</a>
**Problem:** Coder was unable to establish a WebSocket connection over its own
Access URL.
**Solution:** There are multiple possible causes of this problem:
1. Ensure that Coder's configured Access URL can be reached from the server
running Coder, using standard troubleshooting tools like `curl`:
```shell
curl -v "https://coder.company.com:443/"
```
2. Ensure that any reverse proxy that is sitting in front of Coder's configured
access URL is not stripping the HTTP header `Upgrade: websocket`.
### <a name="EWS02">EWS02: Failed to echo a WebSocket message</a>
**Problem:** Coder was able to establish a WebSocket connection, but was unable
to write a message or to read the echoed message back.
**Solution:** There are multiple possible causes of this problem:
1. Validate that any reverse proxy servers in front of Coder's configured access
URL are not prematurely closing the connection.
2. Validate that the network link between Coder and the workspace proxy is
stable, e.g. by using `ping`.
3. Validate that any internal network infrastructure (for example, firewalls,
proxies, VPNs) do not interfere with WebSocket connections.
## Workspace Proxy
If you have configured [Workspace Proxies](../admin/workspace-proxies.md), Coder
will periodically query their availability and show their status here.
### <a name="EWP01">EWP01: Error Updating Workspace Proxy Health</a>
**Problem:** Coder was unable to query the connected workspace proxies for their
health status.
**Solution:** This may be a transient issue. If it persists, it could signify a
connectivity issue.
### <a name="EWP02">EWP02: Error Fetching Workspace Proxies</a>
**Problem:** Coder was unable to fetch the stored workspace proxy health data
from the database.
**Solution:** This may be a transient issue. If it persists, it could signify an
issue with Coder's configured database.
### <a name="EWP03">EWP03: Workspace Proxy Version Mismatch</a>
**Problem:** One or more workspace proxies are more than one major or minor
version out of date with the main deployment. It is important that workspace
proxies are updated at the same time as the main deployment to minimize the risk
of API incompatibility.
**Solution:** Update the workspace proxy to match the currently running version
of Coder.
### <a name="EWP04">EWP04: One or more Workspace Proxies Unhealthy</a>
**Problem:** One or more workspace proxies are not reachable.
**Solution:** Ensure that Coder can establish a connection to the configured
workspace proxies on port 443.
## <a name="EUNKNOWN">Unknown Error</a>
**Problem:** This error is shown when an unexpected error occurred evaluating
deployment health. It may resolve on its own.
**Solution:** This may be a bug.
[File a GitHub issue](https://github.com/coder/coder/issues/new)!

View File

@ -0,0 +1,3 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32">
<path stroke="none" d="M10.5 13H8v-3h2.5V7.5h3V10H16v3h-2.5v2.5h-3V13zM12 2 4 6v6.09c0 5.05 3.41 9.76 8 10.91 4.59-1.15 8-5.86 8-10.91V5l-8-3z"/>
</svg>

After

Width:  |  Height:  |  Size: 215 B

View File

@ -433,6 +433,12 @@
"path": "./admin/encryption.md",
"icon_path": "./images/icons/lock.svg",
"state": "enterprise"
},
{
"title": "Deployment Health",
"description": "Learn how to monitor the health of your Coder deployment",
"path": "./admin/healthcheck.md",
"icon_path": "./images/icons/health.svg"
}
]
},

View File

@ -2203,6 +2203,43 @@ export const ClibaseValueSources: ClibaseValueSource[] = [
// The code below is generated from coderd/healthcheck/health.
// From health/model.go
export type HealthCode =
| "EACS01"
| "EACS02"
| "EACS03"
| "EACS04"
| "EDB01"
| "EDB02"
| "EDERP01"
| "EDERP02"
| "EUNKNOWN"
| "EWP01"
| "EWP02"
| "EWP03"
| "EWP04"
| "EWS01"
| "EWS02"
| "EWS03";
export const HealthCodes: HealthCode[] = [
"EACS01",
"EACS02",
"EACS03",
"EACS04",
"EDB01",
"EDB02",
"EDERP01",
"EDERP02",
"EUNKNOWN",
"EWP01",
"EWP02",
"EWP03",
"EWP04",
"EWS01",
"EWS02",
"EWS03",
];
// From health/model.go
export type HealthSeverity = "error" | "ok" | "warning";
export const HealthSeveritys: HealthSeverity[] = ["error", "ok", "warning"];