mirror of https://github.com/coder/coder.git
383 lines
12 KiB
Go
383 lines
12 KiB
Go
package proxyhealth
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"golang.org/x/sync/errgroup"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
"github.com/coder/coder/v2/coderd/prometheusmetrics"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
)
|
|
|
|
// Status is the health state a proxy health check concluded for a workspace
// proxy.
type Status string

// The set of health states a proxy can be in.
const (
	// Unknown is a placeholder state. The proxy health check should never
	// return it.
	Unknown = Status("unknown")
	// Healthy indicates the proxy access url can be reached and answers with
	// a healthy status code.
	Healthy = Status("ok")
	// Unreachable indicates the proxy access url is not responding.
	Unreachable = Status("unreachable")
	// Unhealthy indicates the proxy access url is responding, but the proxy
	// has some problem. That problem may or may not prevent functionality.
	Unhealthy = Status("unhealthy")
	// Unregistered indicates the proxy has not registered a url yet: it was
	// created with the cli, but has not been started so far.
	Unregistered = Status("unregistered")
)
|
|
|
|
type Options struct {
|
|
// Interval is the interval at which the proxy health is checked.
|
|
Interval time.Duration
|
|
DB database.Store
|
|
Logger slog.Logger
|
|
Client *http.Client
|
|
Prometheus *prometheus.Registry
|
|
}
|
|
|
|
// ProxyHealth runs a go routine that periodically checks the health of all
|
|
// workspace proxies. This information is stored in memory, so each coderd
|
|
// replica has its own view of the health of the proxies. These views should be
|
|
// consistent, and if they are not, it indicates a problem.
|
|
type ProxyHealth struct {
|
|
db database.Store
|
|
interval time.Duration
|
|
logger slog.Logger
|
|
client *http.Client
|
|
|
|
// Cached values for quick access to the health of proxies.
|
|
cache *atomic.Pointer[map[uuid.UUID]ProxyStatus]
|
|
proxyHosts *atomic.Pointer[[]string]
|
|
|
|
// PromMetrics
|
|
healthCheckDuration prometheus.Histogram
|
|
healthCheckResults *prometheusmetrics.CachedGaugeVec
|
|
}
|
|
|
|
func New(opts *Options) (*ProxyHealth, error) {
|
|
if opts.Interval <= 0 {
|
|
opts.Interval = time.Minute
|
|
}
|
|
if opts.DB == nil {
|
|
return nil, xerrors.Errorf("db is required")
|
|
}
|
|
if opts.Prometheus == nil {
|
|
opts.Prometheus = prometheus.NewRegistry()
|
|
}
|
|
|
|
client := opts.Client
|
|
if client == nil {
|
|
client = http.DefaultClient
|
|
}
|
|
// Set a timeout on the client, so we don't wait forever for a healthz response.
|
|
tmp := *client
|
|
tmp.Timeout = time.Second * 5
|
|
client = &tmp
|
|
|
|
// Prometheus metrics
|
|
healthCheckDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "proxyhealth",
|
|
Name: "health_check_duration_seconds",
|
|
Help: "Histogram for duration of proxy health collection in seconds.",
|
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
|
})
|
|
opts.Prometheus.MustRegister(healthCheckDuration)
|
|
|
|
healthCheckResults := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "proxyhealth",
|
|
Name: "health_check_results",
|
|
Help: "This endpoint returns a number to indicate the health status. " +
|
|
"-3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy)",
|
|
}, []string{"proxy_id"}))
|
|
opts.Prometheus.MustRegister(healthCheckResults)
|
|
|
|
return &ProxyHealth{
|
|
db: opts.DB,
|
|
interval: opts.Interval,
|
|
logger: opts.Logger,
|
|
client: client,
|
|
cache: &atomic.Pointer[map[uuid.UUID]ProxyStatus]{},
|
|
proxyHosts: &atomic.Pointer[[]string]{},
|
|
healthCheckDuration: healthCheckDuration,
|
|
healthCheckResults: healthCheckResults,
|
|
}, nil
|
|
}
|
|
|
|
// Run will block until the context is canceled. It will periodically check the
|
|
// health of all proxies and store the results in the cache.
|
|
func (p *ProxyHealth) Run(ctx context.Context) {
|
|
ticker := time.NewTicker(p.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case now := <-ticker.C:
|
|
statuses, err := p.runOnce(ctx, now)
|
|
if err != nil {
|
|
p.logger.Error(ctx, "proxy health check failed", slog.Error(err))
|
|
continue
|
|
}
|
|
p.storeProxyHealth(statuses)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (p *ProxyHealth) storeProxyHealth(statuses map[uuid.UUID]ProxyStatus) {
|
|
var proxyHosts []string
|
|
for _, s := range statuses {
|
|
if s.ProxyHost != "" {
|
|
proxyHosts = append(proxyHosts, s.ProxyHost)
|
|
}
|
|
}
|
|
|
|
// Store the statuses in the cache before any other quick values.
|
|
p.cache.Store(&statuses)
|
|
p.proxyHosts.Store(&proxyHosts)
|
|
}
|
|
|
|
// ForceUpdate runs a single health check and updates the cache. If the health
|
|
// check fails, the cache is not updated and an error is returned. This is useful
|
|
// to trigger an update when a proxy is created or deleted.
|
|
func (p *ProxyHealth) ForceUpdate(ctx context.Context) error {
|
|
statuses, err := p.runOnce(ctx, time.Now())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
p.storeProxyHealth(statuses)
|
|
return nil
|
|
}
|
|
|
|
// HealthStatus returns the current health status of all proxies stored in the
|
|
// cache.
|
|
func (p *ProxyHealth) HealthStatus() map[uuid.UUID]ProxyStatus {
|
|
if p == nil {
|
|
// This can happen because workspace proxies are still an experiment.
|
|
// For the /regions endpoint, this will be nil in those cases.
|
|
return map[uuid.UUID]ProxyStatus{}
|
|
}
|
|
ptr := p.cache.Load()
|
|
if ptr == nil {
|
|
return map[uuid.UUID]ProxyStatus{}
|
|
}
|
|
return *ptr
|
|
}
|
|
|
|
type ProxyStatus struct {
|
|
// ProxyStatus includes the value of the proxy at the time of checking. This is
|
|
// useful to know as it helps determine if the proxy checked has different values
|
|
// then the proxy in hand. AKA if the proxy was updated, and the status was for
|
|
// an older proxy.
|
|
Proxy database.WorkspaceProxy
|
|
// ProxyHost is the host:port of the proxy url. This is included in the status
|
|
// to make sure the proxy url is a valid URL. It also makes it easier to
|
|
// escalate errors if the url.Parse errors (should never happen).
|
|
ProxyHost string
|
|
Status Status
|
|
Report codersdk.ProxyHealthReport
|
|
CheckedAt time.Time
|
|
}
|
|
|
|
// ProxyHosts returns the host:port of all healthy proxies.
|
|
// This can be computed from HealthStatus, but is cached to avoid the
|
|
// caller needing to loop over all proxies to compute this on all
|
|
// static web requests.
|
|
func (p *ProxyHealth) ProxyHosts() []string {
|
|
ptr := p.proxyHosts.Load()
|
|
if ptr == nil {
|
|
return []string{}
|
|
}
|
|
return *ptr
|
|
}
|
|
|
|
// runOnce runs the health check for all workspace proxies. If there is an
|
|
// unexpected error, an error is returned. Expected errors will mark a proxy as
|
|
// unreachable.
|
|
func (p *ProxyHealth) runOnce(ctx context.Context, now time.Time) (map[uuid.UUID]ProxyStatus, error) {
|
|
// Record from the given time.
|
|
defer func() { p.healthCheckDuration.Observe(time.Since(now).Seconds()) }()
|
|
|
|
//nolint:gocritic // Proxy health is a system service.
|
|
proxies, err := p.db.GetWorkspaceProxies(dbauthz.AsSystemRestricted(ctx))
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("get workspace proxies: %w", err)
|
|
}
|
|
|
|
// Just use a mutex to protect map writes.
|
|
var statusMu sync.Mutex
|
|
proxyStatus := map[uuid.UUID]ProxyStatus{}
|
|
|
|
grp, gctx := errgroup.WithContext(ctx)
|
|
// Arbitrary parallelism limit.
|
|
grp.SetLimit(5)
|
|
|
|
for _, proxy := range proxies {
|
|
if proxy.Deleted {
|
|
// Ignore deleted proxies.
|
|
continue
|
|
}
|
|
// Each proxy needs to have a status set. Make a local copy for the
|
|
// call to be run async.
|
|
proxy := proxy
|
|
status := ProxyStatus{
|
|
Proxy: proxy,
|
|
CheckedAt: now,
|
|
Status: Unknown,
|
|
}
|
|
|
|
grp.Go(func() error {
|
|
if proxy.Url == "" {
|
|
// Empty URL means the proxy has not registered yet.
|
|
// When the proxy is started, it will update the url.
|
|
statusMu.Lock()
|
|
defer statusMu.Unlock()
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, proxy.ID.String())
|
|
status.Status = Unregistered
|
|
proxyStatus[proxy.ID] = status
|
|
return nil
|
|
}
|
|
|
|
// Try to hit the healthz-report endpoint for a comprehensive health check.
|
|
reqURL := fmt.Sprintf("%s/healthz-report", strings.TrimSuffix(proxy.Url, "/"))
|
|
req, err := http.NewRequestWithContext(gctx, http.MethodGet, reqURL, nil)
|
|
if err != nil {
|
|
return xerrors.Errorf("new request: %w", err)
|
|
}
|
|
req = req.WithContext(gctx)
|
|
|
|
resp, err := p.client.Do(req)
|
|
if err == nil {
|
|
defer resp.Body.Close()
|
|
}
|
|
// A switch statement felt easier to categorize the different cases than
|
|
// if else statements or nested if statements.
|
|
switch {
|
|
case err == nil && resp.StatusCode == http.StatusOK:
|
|
err := json.NewDecoder(resp.Body).Decode(&status.Report)
|
|
if err != nil {
|
|
isCoderErr := fmt.Errorf("proxy url %q is not a coder proxy instance, verify the url is correct", reqURL)
|
|
if resp.Header.Get(codersdk.BuildVersionHeader) != "" {
|
|
isCoderErr = fmt.Errorf("proxy url %q is a coder instance, but unable to decode the response payload. Could this be a primary coderd and not a proxy?", reqURL)
|
|
}
|
|
|
|
// If the response is not json, then the user likely input a bad url that returns status code 200.
|
|
// This is very common, since most webpages do return a 200. So let's improve the error message.
|
|
if notJSONErr := codersdk.ExpectJSONMime(resp); notJSONErr != nil {
|
|
err = errors.Join(
|
|
isCoderErr,
|
|
fmt.Errorf("attempted to query health at %q but got back the incorrect content type: %w", reqURL, notJSONErr),
|
|
)
|
|
|
|
status.Report.Errors = []string{
|
|
err.Error(),
|
|
}
|
|
status.Status = Unhealthy
|
|
break
|
|
}
|
|
|
|
// If we cannot read the report, mark the proxy as unhealthy.
|
|
status.Report.Errors = []string{
|
|
errors.Join(
|
|
isCoderErr,
|
|
fmt.Errorf("received a status code 200, but failed to decode health report body: %w", err),
|
|
).Error(),
|
|
}
|
|
status.Status = Unhealthy
|
|
break
|
|
}
|
|
if len(status.Report.Errors) > 0 {
|
|
status.Status = Unhealthy
|
|
break
|
|
}
|
|
|
|
status.Status = Healthy
|
|
case err == nil && resp.StatusCode != http.StatusOK:
|
|
// Unhealthy as we did reach the proxy but it got an unexpected response.
|
|
status.Status = Unhealthy
|
|
var builder strings.Builder
|
|
// This string is shown on the UI where newlines are respected.
|
|
// This error message is not ever decoded programmatically, so keep it human-
|
|
// readable.
|
|
builder.WriteString(fmt.Sprintf("unexpected status code %d. ", resp.StatusCode))
|
|
builder.WriteString(fmt.Sprintf("\nEncountered error, send a request to %q from the Coderd environment to debug this issue.", reqURL))
|
|
// err will always be non-nil
|
|
err := codersdk.ReadBodyAsError(resp)
|
|
var apiErr *codersdk.Error
|
|
if xerrors.As(err, &apiErr) {
|
|
builder.WriteString(fmt.Sprintf("\nError Message: %s\nError Detail: %s", apiErr.Message, apiErr.Detail))
|
|
for _, v := range apiErr.Validations {
|
|
// Pretty sure this is not possible from the called endpoint, but just in case.
|
|
builder.WriteString(fmt.Sprintf("\n\tValidation: %s=%s", v.Field, v.Detail))
|
|
}
|
|
}
|
|
builder.WriteString(fmt.Sprintf("\nError: %s", err.Error()))
|
|
|
|
status.Report.Errors = []string{builder.String()}
|
|
case err != nil:
|
|
// Request failed, mark the proxy as unreachable.
|
|
status.Status = Unreachable
|
|
status.Report.Errors = []string{fmt.Sprintf("request to proxy failed: %s", err.Error())}
|
|
default:
|
|
// This should never happen
|
|
status.Status = Unknown
|
|
}
|
|
|
|
u, err := url.Parse(proxy.Url)
|
|
if err != nil {
|
|
// This should never happen. This would mean the proxy sent
|
|
// us an invalid url?
|
|
status.Report.Errors = append(status.Report.Errors, fmt.Sprintf("failed to parse proxy url: %s", err.Error()))
|
|
status.Status = Unhealthy
|
|
}
|
|
status.ProxyHost = u.Host
|
|
|
|
// Set the prometheus metric correctly.
|
|
switch status.Status {
|
|
case Healthy:
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 1, proxy.ID.String())
|
|
case Unhealthy:
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -1, proxy.ID.String())
|
|
case Unreachable:
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -2, proxy.ID.String())
|
|
default:
|
|
// Unknown
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -3, proxy.ID.String())
|
|
}
|
|
|
|
statusMu.Lock()
|
|
defer statusMu.Unlock()
|
|
proxyStatus[proxy.ID] = status
|
|
return nil
|
|
})
|
|
}
|
|
|
|
err = grp.Wait()
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("group run: %w", err)
|
|
}
|
|
p.healthCheckResults.Commit()
|
|
|
|
return proxyStatus, nil
|
|
}
|