mirror of https://github.com/coder/coder.git
feat(coderd/healthcheck): add access URL error codes and healthcheck doc (#10915)
Relates to #8965 - Added error codes for separate code paths in health checks - Prefixed errors and warnings with error code prefixes - Added a docs page with details on each code, cause and solution Co-authored-by: Muhammad Atif Ali <atif@coder.com>
This commit is contained in:
parent
5b2f43619b
commit
4f9292859d
|
@ -7,8 +7,6 @@ import (
|
|||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
)
|
||||
|
@ -44,7 +42,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
|
|||
r.Dismissed = opts.Dismissed
|
||||
|
||||
if opts.AccessURL == nil {
|
||||
r.Error = ptr.Ref("access URL is nil")
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLNotSet, "Access URL not set"))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
@ -56,21 +54,21 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
|
|||
|
||||
accessURL, err := opts.AccessURL.Parse("/healthz")
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("parse healthz endpoint: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLInvalid, "parse healthz endpoint: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", accessURL.String(), nil)
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("create healthz request: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "create healthz request: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
||||
res, err := opts.Client.Do(req)
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("get healthz endpoint: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "get healthz endpoint: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
@ -78,7 +76,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
|
|||
|
||||
body, err := io.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("read healthz response: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "read healthz response: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
@ -88,6 +86,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
|
|||
r.StatusCode = res.StatusCode
|
||||
if res.StatusCode != http.StatusOK {
|
||||
r.Severity = health.SeverityWarning
|
||||
r.Warnings = append(r.Warnings, health.Messagef(health.CodeAccessURLNotOK, "/healthz did not return 200 OK"))
|
||||
}
|
||||
r.HealthzResponse = string(body)
|
||||
}
|
||||
|
|
|
@ -11,7 +11,6 @@ import (
|
|||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||
"github.com/coder/coder/v2/coderd/healthcheck"
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
||||
)
|
||||
|
@ -25,12 +24,17 @@ func TestAccessURL(t *testing.T) {
|
|||
var (
|
||||
ctx, cancel = context.WithCancel(context.Background())
|
||||
report healthcheck.AccessURLReport
|
||||
client = coderdtest.New(t, nil)
|
||||
resp = []byte("OK")
|
||||
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write(resp)
|
||||
}))
|
||||
)
|
||||
defer cancel()
|
||||
|
||||
report.Run(ctx, &healthcheck.AccessURLReportOptions{
|
||||
AccessURL: client.URL,
|
||||
Client: srv.Client(),
|
||||
AccessURL: mustURL(t, srv.URL),
|
||||
})
|
||||
|
||||
assert.True(t, report.Healthy)
|
||||
|
@ -41,35 +45,27 @@ func TestAccessURL(t *testing.T) {
|
|||
assert.Nil(t, report.Error)
|
||||
})
|
||||
|
||||
t.Run("404", func(t *testing.T) {
|
||||
t.Run("NotSet", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var (
|
||||
ctx, cancel = context.WithCancel(context.Background())
|
||||
report healthcheck.AccessURLReport
|
||||
resp = []byte("NOT OK")
|
||||
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
w.Write(resp)
|
||||
}))
|
||||
)
|
||||
defer cancel()
|
||||
defer srv.Close()
|
||||
|
||||
u, err := url.Parse(srv.URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
report.Run(ctx, &healthcheck.AccessURLReportOptions{
|
||||
Client: srv.Client(),
|
||||
AccessURL: u,
|
||||
Client: nil, // defaults to http.DefaultClient
|
||||
AccessURL: nil,
|
||||
})
|
||||
|
||||
assert.False(t, report.Healthy)
|
||||
assert.True(t, report.Reachable)
|
||||
assert.Equal(t, health.SeverityWarning, report.Severity)
|
||||
assert.Equal(t, http.StatusNotFound, report.StatusCode)
|
||||
assert.Equal(t, string(resp), report.HealthzResponse)
|
||||
assert.Nil(t, report.Error)
|
||||
assert.False(t, report.Reachable)
|
||||
assert.Equal(t, health.SeverityError, report.Severity)
|
||||
assert.Equal(t, 0, report.StatusCode)
|
||||
assert.Equal(t, "", report.HealthzResponse)
|
||||
require.NotNil(t, report.Error)
|
||||
assert.Contains(t, *report.Error, health.CodeAccessURLNotSet)
|
||||
})
|
||||
|
||||
t.Run("ClientErr", func(t *testing.T) {
|
||||
|
@ -81,7 +77,7 @@ func TestAccessURL(t *testing.T) {
|
|||
resp = []byte("OK")
|
||||
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write(resp)
|
||||
_, _ = w.Write(resp)
|
||||
}))
|
||||
client = srv.Client()
|
||||
)
|
||||
|
@ -93,12 +89,9 @@ func TestAccessURL(t *testing.T) {
|
|||
return nil, expErr
|
||||
})
|
||||
|
||||
u, err := url.Parse(srv.URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
report.Run(ctx, &healthcheck.AccessURLReportOptions{
|
||||
Client: client,
|
||||
AccessURL: u,
|
||||
AccessURL: mustURL(t, srv.URL),
|
||||
})
|
||||
|
||||
assert.False(t, report.Healthy)
|
||||
|
@ -108,6 +101,38 @@ func TestAccessURL(t *testing.T) {
|
|||
assert.Equal(t, "", report.HealthzResponse)
|
||||
require.NotNil(t, report.Error)
|
||||
assert.Contains(t, *report.Error, expErr.Error())
|
||||
assert.Contains(t, *report.Error, health.CodeAccessURLFetch)
|
||||
})
|
||||
|
||||
t.Run("404", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var (
|
||||
ctx, cancel = context.WithCancel(context.Background())
|
||||
report healthcheck.AccessURLReport
|
||||
resp = []byte("NOT OK")
|
||||
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
_, _ = w.Write(resp)
|
||||
}))
|
||||
)
|
||||
defer cancel()
|
||||
defer srv.Close()
|
||||
|
||||
report.Run(ctx, &healthcheck.AccessURLReportOptions{
|
||||
Client: srv.Client(),
|
||||
AccessURL: mustURL(t, srv.URL),
|
||||
})
|
||||
|
||||
assert.False(t, report.Healthy)
|
||||
assert.True(t, report.Reachable)
|
||||
assert.Equal(t, health.SeverityWarning, report.Severity)
|
||||
assert.Equal(t, http.StatusNotFound, report.StatusCode)
|
||||
assert.Equal(t, string(resp), report.HealthzResponse)
|
||||
assert.Nil(t, report.Error)
|
||||
if assert.NotEmpty(t, report.Warnings) {
|
||||
assert.Contains(t, report.Warnings[0], health.CodeAccessURLNotOK)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("DismissedError", func(t *testing.T) {
|
||||
|
@ -133,3 +158,10 @@ type roundTripFunc func(r *http.Request) (*http.Response, error)
|
|||
func (rt roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
|
||||
return rt(r)
|
||||
}
|
||||
|
||||
func mustURL(t testing.TB, s string) *url.URL {
|
||||
t.Helper()
|
||||
u, err := url.Parse(s)
|
||||
require.NoError(t, err)
|
||||
return u
|
||||
}
|
||||
|
|
|
@ -4,11 +4,11 @@ import (
|
|||
"context"
|
||||
"time"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -55,8 +55,9 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
|
|||
for i := 0; i < pingCount; i++ {
|
||||
pong, err := opts.DB.Ping(ctx)
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("ping: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeDatabasePingFailed, "ping database: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
|
||||
return
|
||||
}
|
||||
pings = append(pings, pong)
|
||||
|
@ -69,6 +70,7 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
|
|||
r.LatencyMS = latency.Milliseconds()
|
||||
if r.LatencyMS >= r.ThresholdMS {
|
||||
r.Severity = health.SeverityWarning
|
||||
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDatabasePingSlow, "median database ping above threshold"))
|
||||
}
|
||||
r.Healthy = true
|
||||
r.Reachable = true
|
||||
|
|
|
@ -65,6 +65,7 @@ func TestDatabase(t *testing.T) {
|
|||
require.NotNil(t, report.Error)
|
||||
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
|
||||
assert.Contains(t, *report.Error, err.Error())
|
||||
assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
|
||||
})
|
||||
|
||||
t.Run("DismissedError", func(t *testing.T) {
|
||||
|
@ -85,6 +86,7 @@ func TestDatabase(t *testing.T) {
|
|||
assert.Equal(t, health.SeverityError, report.Severity)
|
||||
assert.True(t, report.Dismissed)
|
||||
require.NotNil(t, report.Error)
|
||||
assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
|
||||
})
|
||||
|
||||
t.Run("Median", func(t *testing.T) {
|
||||
|
@ -112,6 +114,7 @@ func TestDatabase(t *testing.T) {
|
|||
assert.EqualValues(t, 1, report.LatencyMS)
|
||||
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
|
||||
assert.Nil(t, report.Error)
|
||||
assert.Empty(t, report.Warnings)
|
||||
})
|
||||
|
||||
t.Run("Threshold", func(t *testing.T) {
|
||||
|
@ -139,5 +142,8 @@ func TestDatabase(t *testing.T) {
|
|||
assert.EqualValues(t, 1000, report.LatencyMS)
|
||||
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
|
||||
assert.Nil(t, report.Error)
|
||||
if assert.NotEmpty(t, report.Warnings) {
|
||||
assert.Contains(t, report.Warnings[0], health.CodeDatabasePingSlow)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -136,9 +136,7 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
|
|||
r.Healthy = false
|
||||
}
|
||||
|
||||
for _, w := range regionReport.Warnings {
|
||||
r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", regionReport.Region.RegionName, w))
|
||||
}
|
||||
r.Warnings = append(r.Warnings, regionReport.Warnings...)
|
||||
mu.Unlock()
|
||||
}()
|
||||
}
|
||||
|
@ -202,9 +200,7 @@ func (r *RegionReport) Run(ctx context.Context) {
|
|||
unhealthyNodes++
|
||||
}
|
||||
|
||||
for _, w := range nodeReport.Warnings {
|
||||
r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", nodeReport.Node.Name, w))
|
||||
}
|
||||
r.Warnings = append(r.Warnings, nodeReport.Warnings...)
|
||||
r.mu.Unlock()
|
||||
}()
|
||||
}
|
||||
|
@ -228,7 +224,7 @@ func (r *RegionReport) Run(ctx context.Context) {
|
|||
} else if unhealthyNodes == 1 {
|
||||
// r.Healthy = true (by default)
|
||||
r.Severity = health.SeverityWarning
|
||||
r.Warnings = append(r.Warnings, oneNodeUnhealthy)
|
||||
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPOneNodeUnhealthy, oneNodeUnhealthy))
|
||||
} else if unhealthyNodes > 1 {
|
||||
r.Healthy = false
|
||||
|
||||
|
@ -292,7 +288,7 @@ func (r *NodeReport) Run(ctx context.Context) {
|
|||
}
|
||||
|
||||
if r.UsesWebsocket {
|
||||
r.Warnings = append(r.Warnings, warningNodeUsesWebsocket)
|
||||
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPNodeUsesWebsocket, warningNodeUsesWebsocket))
|
||||
r.Severity = health.SeverityWarning
|
||||
}
|
||||
}
|
||||
|
|
|
@ -129,6 +129,9 @@ func TestDERP(t *testing.T) {
|
|||
assert.True(t, report.Healthy)
|
||||
assert.Equal(t, health.SeverityWarning, report.Severity)
|
||||
assert.True(t, report.Dismissed)
|
||||
if assert.NotEmpty(t, report.Warnings) {
|
||||
assert.Contains(t, report.Warnings[0], health.CodeDERPOneNodeUnhealthy)
|
||||
}
|
||||
for _, region := range report.Regions {
|
||||
assert.True(t, region.Healthy)
|
||||
assert.True(t, region.NodeReports[0].Healthy)
|
||||
|
@ -232,7 +235,9 @@ func TestDERP(t *testing.T) {
|
|||
|
||||
assert.True(t, report.Healthy)
|
||||
assert.Equal(t, health.SeverityWarning, report.Severity)
|
||||
assert.NotEmpty(t, report.Warnings)
|
||||
if assert.NotEmpty(t, report.Warnings) {
|
||||
assert.Contains(t, report.Warnings[0], health.CodeDERPNodeUsesWebsocket)
|
||||
}
|
||||
for _, region := range report.Regions {
|
||||
assert.True(t, region.Healthy)
|
||||
assert.Equal(t, health.SeverityWarning, region.Severity)
|
||||
|
|
|
@ -1,9 +1,37 @@
|
|||
package health
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
SeverityOK Severity = "ok"
|
||||
SeverityWarning Severity = "warning"
|
||||
SeverityError Severity = "error"
|
||||
|
||||
// CodeUnknown is a catch-all health code when something unexpected goes wrong (for example, a panic).
|
||||
CodeUnknown Code = "EUNKNOWN"
|
||||
|
||||
CodeProxyUpdate Code = "EWP01"
|
||||
CodeProxyFetch Code = "EWP02"
|
||||
CodeProxyVersionMismatch Code = "EWP03"
|
||||
CodeProxyUnhealthy Code = "EWP04"
|
||||
|
||||
CodeDatabasePingFailed Code = "EDB01"
|
||||
CodeDatabasePingSlow Code = "EDB02"
|
||||
|
||||
CodeWebsocketDial Code = "EWS01"
|
||||
CodeWebsocketEcho Code = "EWS02"
|
||||
CodeWebsocketMsg Code = "EWS03"
|
||||
|
||||
CodeAccessURLNotSet Code = "EACS01"
|
||||
CodeAccessURLInvalid Code = "EACS02"
|
||||
CodeAccessURLFetch Code = "EACS03"
|
||||
CodeAccessURLNotOK Code = "EACS04"
|
||||
|
||||
CodeDERPNodeUsesWebsocket Code = `EDERP01`
|
||||
CodeDERPOneNodeUnhealthy Code = `EDERP02`
|
||||
)
|
||||
|
||||
// @typescript-generate Severity
|
||||
|
@ -18,3 +46,17 @@ var severityRank = map[Severity]int{
|
|||
func (s Severity) Value() int {
|
||||
return severityRank[s]
|
||||
}
|
||||
|
||||
// Code is a stable identifier used to link to documentation.
|
||||
// @typescript-generate Code
|
||||
type Code string
|
||||
|
||||
// Messagef is a convenience function for formatting a healthcheck error message.
|
||||
func Messagef(code Code, msg string, args ...any) string {
|
||||
var sb strings.Builder
|
||||
_, _ = sb.WriteString(string(code))
|
||||
_, _ = sb.WriteRune(':')
|
||||
_, _ = sb.WriteRune(' ')
|
||||
_, _ = sb.WriteString(fmt.Sprintf(msg, args...))
|
||||
return sb.String()
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@ package healthcheck
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
@ -104,7 +103,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
|||
defer wg.Done()
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
report.DERP.Error = ptr.Ref(fmt.Sprint(err))
|
||||
report.DERP.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "derp report panic: %s", err))
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -116,7 +115,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
|||
defer wg.Done()
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
report.AccessURL.Error = ptr.Ref(fmt.Sprint(err))
|
||||
report.AccessURL.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "access url report panic: %s", err))
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -128,7 +127,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
|||
defer wg.Done()
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
report.Websocket.Error = ptr.Ref(fmt.Sprint(err))
|
||||
report.Websocket.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "websocket report panic: %s", err))
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -140,7 +139,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
|||
defer wg.Done()
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
report.Database.Error = ptr.Ref(fmt.Sprint(err))
|
||||
report.Database.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "database report panic: %s", err))
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -152,7 +151,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
|||
defer wg.Done()
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
report.WorkspaceProxy.Error = ptr.Ref(fmt.Sprint(err))
|
||||
report.WorkspaceProxy.Error = ptr.Ref(health.Messagef(health.CodeUnknown, "proxy report panic: %s", err))
|
||||
}
|
||||
}()
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ import (
|
|||
"nhooyr.io/websocket"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
)
|
||||
|
||||
// @typescript-generate WebsocketReport
|
||||
|
@ -75,6 +76,7 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions)
|
|||
}
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("websocket dial: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketDial, "websocket dial: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
@ -84,26 +86,26 @@ func (r *WebsocketReport) Run(ctx context.Context, opts *WebsocketReportOptions)
|
|||
msg := strconv.Itoa(i)
|
||||
err := c.Write(ctx, websocket.MessageText, []byte(msg))
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("write message: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "write message: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
||||
ty, got, err := c.Read(ctx)
|
||||
if err != nil {
|
||||
r.Error = convertError(xerrors.Errorf("read message: %w", err))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketEcho, "read message: %s", err))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
||||
if ty != websocket.MessageText {
|
||||
r.Error = convertError(xerrors.Errorf("received incorrect message type: %v", ty))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message type: %v", ty))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
||||
if string(got) != msg {
|
||||
r.Error = convertError(xerrors.Errorf("received incorrect message: wanted %q, got %q", msg, string(got)))
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeWebsocketMsg, "received incorrect message: wanted %q, got %q", msg, string(got)))
|
||||
r.Severity = health.SeverityError
|
||||
return
|
||||
}
|
||||
|
|
|
@ -63,7 +63,9 @@ func TestWebsocket(t *testing.T) {
|
|||
APIKey: "test",
|
||||
})
|
||||
|
||||
require.NotNil(t, wsReport.Error)
|
||||
if assert.NotNil(t, wsReport.Error) {
|
||||
assert.Contains(t, *wsReport.Error, health.CodeWebsocketDial)
|
||||
}
|
||||
require.Equal(t, health.SeverityError, wsReport.Severity)
|
||||
assert.Equal(t, wsReport.Body, "test error")
|
||||
assert.Equal(t, wsReport.Code, http.StatusBadRequest)
|
||||
|
|
|
@ -6,12 +6,12 @@ import (
|
|||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/buildinfo"
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
)
|
||||
|
||||
// @typescript-generate WorkspaceProxyReport
|
||||
|
@ -64,7 +64,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
|
|||
// If this fails, just mark it as a warning. It is still updated in the background.
|
||||
if err := opts.WorkspaceProxiesFetchUpdater.Update(ctx); err != nil {
|
||||
r.Severity = health.SeverityWarning
|
||||
r.Warnings = append(r.Warnings, xerrors.Errorf("update proxy health: %w", err).Error())
|
||||
r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUpdate, "update proxy health: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -72,7 +72,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
|
|||
if err != nil {
|
||||
r.Healthy = false
|
||||
r.Severity = health.SeverityError
|
||||
r.Error = ptr.Ref(err.Error())
|
||||
r.Error = ptr.Ref(health.Messagef(health.CodeProxyFetch, "fetch workspace proxies: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -99,11 +99,13 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
|
|||
|
||||
r.Severity = calculateSeverity(total, healthy)
|
||||
r.Healthy = r.Severity.Value() < health.SeverityError.Value()
|
||||
switch r.Severity {
|
||||
case health.SeverityWarning, health.SeverityOK:
|
||||
r.Warnings = append(r.Warnings, errs...)
|
||||
case health.SeverityError:
|
||||
r.appendError(errs...)
|
||||
for _, err := range errs {
|
||||
switch r.Severity {
|
||||
case health.SeverityWarning, health.SeverityOK:
|
||||
r.Warnings = append(r.Warnings, health.Messagef(health.CodeProxyUnhealthy, err))
|
||||
case health.SeverityError:
|
||||
r.appendError(health.Messagef(health.CodeProxyUnhealthy, err))
|
||||
}
|
||||
}
|
||||
|
||||
// Versions _must_ match. Perform this check last. This will clobber any other severity.
|
||||
|
@ -111,7 +113,7 @@ func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyRepo
|
|||
if vErr := checkVersion(proxy, opts.CurrentVersion); vErr != nil {
|
||||
r.Healthy = false
|
||||
r.Severity = health.SeverityError
|
||||
r.appendError(fmt.Sprintf("%s: %s", proxy.Name, vErr.Error()))
|
||||
r.appendError(health.Messagef(health.CodeProxyVersionMismatch, vErr.Error()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package healthcheck_test
|
|||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
@ -26,6 +27,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth func(context.Context) error
|
||||
expectedHealthy bool
|
||||
expectedError string
|
||||
expectedWarning string
|
||||
expectedSeverity health.Severity
|
||||
}{
|
||||
{
|
||||
|
@ -53,6 +55,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth: fakeUpdateProxyHealth(nil),
|
||||
expectedHealthy: false,
|
||||
expectedSeverity: health.SeverityError,
|
||||
expectedError: string(health.CodeProxyUnhealthy),
|
||||
},
|
||||
{
|
||||
name: "Enabled/OneUnreachable",
|
||||
|
@ -80,7 +83,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth: fakeUpdateProxyHealth(nil),
|
||||
expectedHealthy: false,
|
||||
expectedSeverity: health.SeverityError,
|
||||
expectedError: "connect: connection refused",
|
||||
expectedError: string(health.CodeProxyUnhealthy),
|
||||
},
|
||||
{
|
||||
name: "Enabled/AllHealthy",
|
||||
|
@ -103,6 +106,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth: fakeUpdateProxyHealth(nil),
|
||||
expectedHealthy: true,
|
||||
expectedSeverity: health.SeverityWarning,
|
||||
expectedWarning: string(health.CodeProxyUnhealthy),
|
||||
},
|
||||
{
|
||||
name: "Enabled/AllUnhealthy",
|
||||
|
@ -113,6 +117,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth: fakeUpdateProxyHealth(nil),
|
||||
expectedHealthy: false,
|
||||
expectedSeverity: health.SeverityError,
|
||||
expectedError: string(health.CodeProxyUnhealthy),
|
||||
},
|
||||
{
|
||||
name: "Enabled/OneOutOfDate",
|
||||
|
@ -150,7 +155,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth: fakeUpdateProxyHealth(nil),
|
||||
expectedHealthy: false,
|
||||
expectedSeverity: health.SeverityError,
|
||||
expectedError: assert.AnError.Error(),
|
||||
expectedError: string(health.CodeProxyFetch),
|
||||
},
|
||||
{
|
||||
name: "Enabled/ErrUpdateProxyHealth",
|
||||
|
@ -158,6 +163,7 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
updateProxyHealth: fakeUpdateProxyHealth(assert.AnError),
|
||||
expectedHealthy: true,
|
||||
expectedSeverity: health.SeverityWarning,
|
||||
expectedWarning: string(health.CodeProxyUpdate),
|
||||
},
|
||||
} {
|
||||
tt := tt
|
||||
|
@ -179,13 +185,22 @@ func TestWorkspaceProxies(t *testing.T) {
|
|||
|
||||
assert.Equal(t, tt.expectedHealthy, rpt.Healthy)
|
||||
assert.Equal(t, tt.expectedSeverity, rpt.Severity)
|
||||
if tt.expectedError != "" {
|
||||
assert.NotNil(t, rpt.Error)
|
||||
if tt.expectedError != "" && assert.NotNil(t, rpt.Error) {
|
||||
assert.Contains(t, *rpt.Error, tt.expectedError)
|
||||
} else {
|
||||
if !assert.Nil(t, rpt.Error) {
|
||||
assert.Empty(t, *rpt.Error)
|
||||
assert.Nil(t, rpt.Error)
|
||||
}
|
||||
if tt.expectedWarning != "" && assert.NotEmpty(t, rpt.Warnings) {
|
||||
var found bool
|
||||
for _, w := range rpt.Warnings {
|
||||
if strings.Contains(w, tt.expectedWarning) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, found, "expected warning %s not found in %v", tt.expectedWarning, rpt.Warnings)
|
||||
} else {
|
||||
assert.Empty(t, rpt.Warnings)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
@ -221,13 +236,24 @@ func (u *fakeWorkspaceProxyFetchUpdater) Update(ctx context.Context) error {
|
|||
return u.updateFunc(ctx)
|
||||
}
|
||||
|
||||
//nolint:revive // yes, this is a control flag, and that is OK in a unit test.
|
||||
func fakeWorkspaceProxy(name string, healthy bool, version string) codersdk.WorkspaceProxy {
|
||||
var status codersdk.WorkspaceProxyStatus
|
||||
if !healthy {
|
||||
status = codersdk.WorkspaceProxyStatus{
|
||||
Status: codersdk.ProxyUnreachable,
|
||||
Report: codersdk.ProxyHealthReport{
|
||||
Errors: []string{assert.AnError.Error()},
|
||||
},
|
||||
}
|
||||
}
|
||||
return codersdk.WorkspaceProxy{
|
||||
Region: codersdk.Region{
|
||||
Name: name,
|
||||
Healthy: healthy,
|
||||
},
|
||||
Version: version,
|
||||
Status: status,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,248 @@
|
|||
# Deployment Health
|
||||
|
||||
Coder includes an operator-friendly deployment health page that provides a
|
||||
number of details about the health of your Coder deployment.
|
||||
|
||||
You can view it at `https://${CODER_URL}/health`, or you can alternatively view
|
||||
the [JSON response directly](../api/debug.md#debug-info-deployment-health).
|
||||
|
||||
The deployment health page is broken up into the following sections:
|
||||
|
||||
## Access URL
|
||||
|
||||
The Access URL section shows checks related to Coder's
|
||||
[access URL](./configure.md#access-url).
|
||||
|
||||
Coder will periodically send a GET request to `${CODER_ACCESS_URL}/healthz` and
|
||||
validate that the response is `200 OK`. The expected response body is also the
|
||||
string `OK`.
|
||||
|
||||
If there is an issue, you may see one of the following errors reported:
|
||||
|
||||
### <a name="EACS01">EACS01: Access URL not set</a>
|
||||
|
||||
**Problem:** No access URL has been configured.
|
||||
|
||||
**Solution:** Configure an [access URL](./configure.md#access-url) for Coder.
|
||||
|
||||
### <a name="EACS02">EACS02: Access URL invalid</a>
|
||||
|
||||
**Problem:** `${CODER_ACCESS_URL}/healthz` is not a valid URL.
|
||||
|
||||
**Solution:** Ensure that the access URL is a valid URL accepted by
|
||||
[`url.Parse`](https://pkg.go.dev/net/url#Parse). Example:
|
||||
`https://dev.coder.com/`.
|
||||
|
||||
> [!TIP] You can test whether a URL parses correctly using [this Go Playground snippet](https://go.dev/play/p/CabcJZyTwt9).
|
||||
|
||||
### <a name="EACS03">EACS03: Failed to fetch `/healthz`</a>
|
||||
|
||||
**Problem:** Coder was unable to execute a GET request to
|
||||
`${CODER_ACCESS_URL}/healthz`.
|
||||
|
||||
This could be due to a number of reasons, including but not limited to:
|
||||
|
||||
- DNS lookup failure
|
||||
- A misconfigured firewall
|
||||
- A misconfigured reverse proxy
|
||||
- Invalid or expired SSL certificates
|
||||
|
||||
**Solution:** Investigate and resolve the root cause of the connection issue.
|
||||
|
||||
To troubleshoot further, you can log into the machine running Coder and attempt
|
||||
to run the following command:
|
||||
|
||||
```shell
|
||||
curl -v ${CODER_ACCESS_URL}/healthz
|
||||
# Expected output:
|
||||
# * Trying XXX.XXX.XXX.XXX:443
|
||||
# * Connected to https://coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0)
|
||||
# [...]
|
||||
# OK
|
||||
```
|
||||
|
||||
The output of this command should aid further diagnosis.
|
||||
|
||||
### <a name="EACS04">EACS04: /healthz did not return 200 OK</a>
|
||||
|
||||
**Problem:** Coder was able to execute a GET request to
|
||||
`${CODER_ACCESS_URL}/healthz`, but the response code was not `200 OK` as
|
||||
expected.
|
||||
|
||||
This could mean, for instance, that:
|
||||
|
||||
- The request did not actually hit your Coder instance (potentially an incorrect
|
||||
DNS entry)
|
||||
- The request hit your Coder instance, but on an unexpected path (potentially a
|
||||
misconfigured reverse proxy)
|
||||
|
||||
**Solution:** Inspect the `HealthzResponse` in the health check output. This
|
||||
should give you a good indication of the root cause.
|
||||
|
||||
## Database
|
||||
|
||||
Coder continuously executes a short database query to validate that it can reach
|
||||
its configured database, and also measures the median latency over 5 attempts.
|
||||
|
||||
### <a name="EDB01">EDB01: Database Ping Failed</a>
|
||||
|
||||
**Problem:** This error code is returned if any attempt to execute this database
|
||||
query fails.
|
||||
|
||||
**Solution:** Investigate the health of the database.
|
||||
|
||||
### <a name="EDB02">EDB02: Database Latency High</a>
|
||||
|
||||
**Problem:** This code is returned if the median latency is at or above the
|
||||
[configured threshold](../cli/server.md#--health-check-threshold-database). This
|
||||
may not be an error as such, but is an indication of a potential issue.
|
||||
|
||||
**Solution:** Investigate the sizing of the configured database with regard to
|
||||
Coder's current activity and usage. It may be necessary to increase the
|
||||
resources allocated to Coder's database. Alternatively, you can raise the
|
||||
configured threshold to a higher value (this will not address the root cause).
|
||||
|
||||
> [!TIP]
|
||||
>
|
||||
> - You can enable
|
||||
> [detailed database metrics](../cli/server.md#--prometheus-collect-db-metrics)
|
||||
> in Coder's Prometheus endpoint.
|
||||
> - If you have [tracing enabled](../cli/server.md#--trace), these traces may
|
||||
> also contain useful information regarding Coder's database activity.
|
||||
|
||||
## DERP
|
||||
|
||||
Coder workspace agents may use
|
||||
[DERP (Designated Encrypted Relay for Packets)](https://tailscale.com/blog/how-tailscale-works/#encrypted-tcp-relays-derp)
|
||||
to communicate with Coder. This requires connectivity to a number of configured
|
||||
[DERP servers](../cli/server.md#--derp-config-path) which are used to relay
|
||||
traffic between Coder and workspace agents. Coder periodically queries the
|
||||
health of its configured DERP servers and may return one or more of the
|
||||
following:
|
||||
|
||||
### <a name="EDERP01">EDERP01: DERP Node Uses Websocket</a>
|
||||
|
||||
**Problem:** When Coder attempts to establish a connection to one or more DERP
|
||||
servers, it sends a specific `Upgrade: derp` HTTP header. Some load balancers
|
||||
may block this header, in which case Coder will fall back to
|
||||
`Upgrade: websocket`.
|
||||
|
||||
This is not necessarily a fatal error, but a possible indication of a
|
||||
misconfigured reverse HTTP proxy. Additionally, while workspace users should
|
||||
still be able to reach their workspaces, connection performance may be degraded.
|
||||
|
||||
> [!NOTE]
> This may also be shown if you have
|
||||
> [forced websocket connections for DERP](../cli/server.md#--derp-force-websockets).
|
||||
|
||||
**Solution:** Ensure that any configured reverse proxy does not strip the
|
||||
`Upgrade: derp` header.
|
||||
|
||||
### <a name="EDERP02">EDERP02: One or more DERP nodes are unhealthy</a>
|
||||
|
||||
**Problem:** This is shown if Coder is unable to reach one or more configured
|
||||
DERP servers. Clients will fall back to use the remaining DERP servers, but
|
||||
performance may be impacted for clients closest to the unhealthy DERP server.
|
||||
|
||||
**Solution:** Ensure that the DERP server is available and reachable over the
|
||||
network on port 443, for example:
|
||||
|
||||
```shell
|
||||
curl -v "https://coder.company.com:443/derp"
|
||||
# Expected output:
|
||||
# * Trying XXX.XXX.XXX.XXX:443
|
||||
# * Connected to coder.company.com (XXX.XXX.XXX.XXX) port 443 (#0)
|
||||
# DERP requires connection upgrade
|
||||
```
|
||||
|
||||
## WebSocket
|
||||
|
||||
Coder makes heavy use of [WebSockets](https://datatracker.ietf.org/doc/rfc6455/)
|
||||
for long-lived connections:
|
||||
|
||||
- Between users interacting with Coder's Web UI (for example, the built-in
|
||||
terminal, or VSCode Web) and `coderd`,
|
||||
- Between workspace agents and `coderd`,
|
||||
- Between Coder [workspace proxies](../admin/workspace-proxies.md) and `coderd`.
|
||||
|
||||
Any issues causing failures to establish WebSocket connections will result in
|
||||
**severe** impairment of functionality for users. To validate this
|
||||
functionality, Coder will periodically attempt to establish a WebSocket
|
||||
connection with itself using the configured [Access URL](#access-url), send a
|
||||
message over the connection, and attempt to read back that same message.
|
||||
|
||||
### <a name="EWS01">EWS01: Failed to establish a WebSocket connection</a>
|
||||
|
||||
**Problem:** Coder was unable to establish a WebSocket connection over its own
|
||||
Access URL.
|
||||
|
||||
**Solution:** There are multiple possible causes of this problem:
|
||||
|
||||
1. Ensure that Coder's configured Access URL can be reached from the server
|
||||
running Coder, using standard troubleshooting tools like `curl`:
|
||||
|
||||
```shell
|
||||
curl -v "https://coder.company.com:443/"
|
||||
```
|
||||
|
||||
2. Ensure that any reverse proxy that is sitting in front of Coder's configured
|
||||
access URL is not stripping the HTTP header `Upgrade: websocket`.
|
||||
|
||||
### <a name="EWS02">EWS02: Failed to echo a WebSocket message</a>
|
||||
|
||||
**Problem:** Coder was able to establish a WebSocket connection, but was unable
|
||||
to write a message.
|
||||
|
||||
**Solution:** There are multiple possible causes of this problem:
|
||||
|
||||
1. Validate that any reverse proxy servers in front of Coder's configured access
|
||||
URL are not prematurely closing the connection.
|
||||
2. Validate that the network link between Coder and the workspace proxy is
|
||||
stable, e.g. by using `ping`.
|
||||
3. Validate that any internal network infrastructure (for example, firewalls,
|
||||
proxies, VPNs) do not interfere with WebSocket connections.
|
||||
|
||||
## Workspace Proxy
|
||||
|
||||
If you have configured [Workspace Proxies](../admin/workspace-proxies.md), Coder
|
||||
will periodically query their availability and show their status here.
|
||||
|
||||
### <a name="EWP01">EWP01: Error Updating Workspace Proxy Health</a>
|
||||
|
||||
**Problem:** Coder was unable to query the connected workspace proxies for their
|
||||
health status.
|
||||
|
||||
**Solution:** This may be a transient issue. If it persists, it could signify a
|
||||
connectivity issue.
|
||||
|
||||
### <a name="EWP02">EWP02: Error Fetching Workspace Proxies</a>
|
||||
|
||||
**Problem:** Coder was unable to fetch the stored workspace proxy health data
|
||||
from the database.
|
||||
|
||||
**Solution:** This may be a transient issue. If it persists, it could signify an
|
||||
issue with Coder's configured database.
|
||||
|
||||
### <a name="EWP03">EWP03: Workspace Proxy Version Mismatch</a>
|
||||
|
||||
**Problem:** One or more workspace proxies are more than one major or minor
|
||||
version out of date with the main deployment. It is important that workspace
|
||||
proxies are updated at the same time as the main deployment to minimize the risk
|
||||
of API incompatibility.
|
||||
|
||||
**Solution:** Update the workspace proxy to match the currently running version
|
||||
of Coder.
|
||||
|
||||
### <a name="EWP04">EWP04: One or more Workspace Proxies Unhealthy</a>
|
||||
|
||||
**Problem:** One or more workspace proxies are not reachable.
|
||||
|
||||
**Solution:** Ensure that Coder can establish a connection to the configured
|
||||
workspace proxies on port 443.
|
||||
|
||||
## <a name="EUNKNOWN">Unknown Error</a>
|
||||
|
||||
**Problem:** This error is shown when an unexpected error occurred while evaluating
|
||||
deployment health. It may resolve on its own.
|
||||
|
||||
**Solution:** This may be a bug.
|
||||
[File a GitHub issue](https://github.com/coder/coder/issues/new)!
|
|
@ -0,0 +1,3 @@
|
|||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32">
|
||||
<path stroke="none" d="M10.5 13H8v-3h2.5V7.5h3V10H16v3h-2.5v2.5h-3V13zM12 2 4 6v6.09c0 5.05 3.41 9.76 8 10.91 4.59-1.15 8-5.86 8-10.91V5l-8-3z"/>
|
||||
</svg>
|
After Width: | Height: | Size: 215 B |
|
@ -433,6 +433,12 @@
|
|||
"path": "./admin/encryption.md",
|
||||
"icon_path": "./images/icons/lock.svg",
|
||||
"state": "enterprise"
|
||||
},
|
||||
{
|
||||
"title": "Deployment Health",
|
||||
"description": "Learn how to monitor the health of your Coder deployment",
|
||||
"path": "./admin/healthcheck.md",
|
||||
"icon_path": "./images/icons/health.svg"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
|
|
@ -2203,6 +2203,43 @@ export const ClibaseValueSources: ClibaseValueSource[] = [
|
|||
|
||||
// The code below is generated from coderd/healthcheck/health.
|
||||
|
||||
// From health/model.go
|
||||
export type HealthCode =
|
||||
| "EACS01"
|
||||
| "EACS02"
|
||||
| "EACS03"
|
||||
| "EACS04"
|
||||
| "EDB01"
|
||||
| "EDB02"
|
||||
| "EDERP01"
|
||||
| "EDERP02"
|
||||
| "EUNKNOWN"
|
||||
| "EWP01"
|
||||
| "EWP02"
|
||||
| "EWP03"
|
||||
| "EWP04"
|
||||
| "EWS01"
|
||||
| "EWS02"
|
||||
| "EWS03";
|
||||
export const HealthCodes: HealthCode[] = [
|
||||
"EACS01",
|
||||
"EACS02",
|
||||
"EACS03",
|
||||
"EACS04",
|
||||
"EDB01",
|
||||
"EDB02",
|
||||
"EDERP01",
|
||||
"EDERP02",
|
||||
"EUNKNOWN",
|
||||
"EWP01",
|
||||
"EWP02",
|
||||
"EWP03",
|
||||
"EWP04",
|
||||
"EWS01",
|
||||
"EWS02",
|
||||
"EWS03",
|
||||
];
|
||||
|
||||
// From health/model.go
|
||||
export type HealthSeverity = "error" | "ok" | "warning";
|
||||
export const HealthSeveritys: HealthSeverity[] = ["error", "ok", "warning"];
|
||||
|
|
Loading…
Reference in New Issue