chore: Proxy health status checks + endpoint (#7233)

* chore: Implement workspace proxy health check cron

At a given interval, the cron checks the reachability of all workspace proxies.

* Proxyhealth is an enterprise feature
* Start proxyhealth goroutine on enterprise coder
This commit is contained in:
Steven Masley 2023-04-24 10:25:35 -05:00 committed by GitHub
parent 63e68c11d1
commit 3129741e08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 912 additions and 60 deletions

62
coderd/apidoc/docs.go generated
View File

@ -8208,6 +8208,40 @@ const docTemplate = `{
"ProvisionerStorageMethodFile"
]
},
"codersdk.ProxyHealthReport": {
"type": "object",
"properties": {
"errors": {
"description": "Errors are problems that prevent the workspace proxy from being healthy",
"type": "array",
"items": {
"type": "string"
}
},
"warnings": {
"description": "Warnings do not prevent the workspace proxy from being healthy, but\nshould be addressed.",
"type": "array",
"items": {
"type": "string"
}
}
}
},
"codersdk.ProxyHealthStatus": {
"type": "string",
"enum": [
"reachable",
"unreachable",
"unhealthy",
"unregistered"
],
"x-enum-varnames": [
"ProxyReachable",
"ProxyUnreachable",
"ProxyUnhealthy",
"ProxyUnregistered"
]
},
"codersdk.PutExtendWorkspaceRequest": {
"type": "object",
"required": [
@ -9701,6 +9735,14 @@ const docTemplate = `{
"name": {
"type": "string"
},
"status": {
"description": "Status is the latest status check of the proxy. This will be empty for deleted\nproxies. This value can be used to determine if a workspace proxy is healthy\nand ready to use.",
"allOf": [
{
"$ref": "#/definitions/codersdk.WorkspaceProxyStatus"
}
]
},
"updated_at": {
"type": "string",
"format": "date-time"
@ -9715,6 +9757,26 @@ const docTemplate = `{
}
}
},
"codersdk.WorkspaceProxyStatus": {
"type": "object",
"properties": {
"checked_at": {
"type": "string",
"format": "date-time"
},
"report": {
"description": "Report provides more information about the health of the workspace proxy.",
"allOf": [
{
"$ref": "#/definitions/codersdk.ProxyHealthReport"
}
]
},
"status": {
"$ref": "#/definitions/codersdk.ProxyHealthStatus"
}
}
},
"codersdk.WorkspaceQuota": {
"type": "object",
"properties": {

View File

@ -7354,6 +7354,35 @@
"enum": ["file"],
"x-enum-varnames": ["ProvisionerStorageMethodFile"]
},
"codersdk.ProxyHealthReport": {
"type": "object",
"properties": {
"errors": {
"description": "Errors are problems that prevent the workspace proxy from being healthy",
"type": "array",
"items": {
"type": "string"
}
},
"warnings": {
"description": "Warnings do not prevent the workspace proxy from being healthy, but\nshould be addressed.",
"type": "array",
"items": {
"type": "string"
}
}
}
},
"codersdk.ProxyHealthStatus": {
"type": "string",
"enum": ["reachable", "unreachable", "unhealthy", "unregistered"],
"x-enum-varnames": [
"ProxyReachable",
"ProxyUnreachable",
"ProxyUnhealthy",
"ProxyUnregistered"
]
},
"codersdk.PutExtendWorkspaceRequest": {
"type": "object",
"required": ["deadline"],
@ -8764,6 +8793,14 @@
"name": {
"type": "string"
},
"status": {
"description": "Status is the latest status check of the proxy. This will be empty for deleted\nproxies. This value can be used to determine if a workspace proxy is healthy\nand ready to use.",
"allOf": [
{
"$ref": "#/definitions/codersdk.WorkspaceProxyStatus"
}
]
},
"updated_at": {
"type": "string",
"format": "date-time"
@ -8778,6 +8815,26 @@
}
}
},
"codersdk.WorkspaceProxyStatus": {
"type": "object",
"properties": {
"checked_at": {
"type": "string",
"format": "date-time"
},
"report": {
"description": "Report provides more information about the health of the workspace proxy.",
"allOf": [
{
"$ref": "#/definitions/codersdk.ProxyHealthReport"
}
]
},
"status": {
"$ref": "#/definitions/codersdk.ProxyHealthStatus"
}
}
},
"codersdk.WorkspaceQuota": {
"type": "object",
"properties": {

View File

@ -12,17 +12,55 @@ import (
"github.com/google/uuid"
)
type ProxyHealthStatus string
const (
// ProxyReachable means the proxy access url is reachable and returns a healthy
// status code.
ProxyReachable ProxyHealthStatus = "reachable"
// ProxyUnreachable means the proxy access url is not responding.
ProxyUnreachable ProxyHealthStatus = "unreachable"
// ProxyUnhealthy means the proxy access url is responding, but there is some
// problem with the proxy. This problem may or may not be preventing functionality.
ProxyUnhealthy ProxyHealthStatus = "unhealthy"
// ProxyUnregistered means the proxy has not registered a url yet. This means
// the proxy was created with the cli, but has not yet been started.
ProxyUnregistered ProxyHealthStatus = "unregistered"
)
type WorkspaceProxyStatus struct {
Status ProxyHealthStatus `json:"status" table:"status"`
// Report provides more information about the health of the workspace proxy.
Report ProxyHealthReport `json:"report,omitempty" table:"report"`
CheckedAt time.Time `json:"checked_at" table:"checked_at" format:"date-time"`
}
// ProxyHealthReport is a report of the health of the workspace proxy.
// A healthy report will have no errors. Warnings are not fatal.
type ProxyHealthReport struct {
// Errors are problems that prevent the workspace proxy from being healthy
Errors []string
// Warnings do not prevent the workspace proxy from being healthy, but
// should be addressed.
Warnings []string
}
type WorkspaceProxy struct {
ID uuid.UUID `db:"id" json:"id" format:"uuid" table:"id"`
Name string `db:"name" json:"name" table:"name,default_sort"`
Icon string `db:"icon" json:"icon" table:"icon"`
ID uuid.UUID `json:"id" format:"uuid" table:"id"`
Name string `json:"name" table:"name,default_sort"`
Icon string `json:"icon" table:"icon"`
// Full url including scheme of the proxy api url: https://us.example.com
URL string `db:"url" json:"url" table:"url"`
URL string `json:"url" table:"url"`
// WildcardHostname with the wildcard for subdomain based app hosting: *.us.example.com
WildcardHostname string `db:"wildcard_hostname" json:"wildcard_hostname" table:"wildcard_hostname"`
CreatedAt time.Time `db:"created_at" json:"created_at" format:"date-time" table:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at" format:"date-time" table:"updated_at"`
Deleted bool `db:"deleted" json:"deleted" table:"deleted"`
WildcardHostname string `json:"wildcard_hostname" table:"wildcard_hostname"`
CreatedAt time.Time `json:"created_at" format:"date-time" table:"created_at"`
UpdatedAt time.Time `json:"updated_at" format:"date-time" table:"updated_at"`
Deleted bool `json:"deleted" table:"deleted"`
// Status is the latest status check of the proxy. This will be empty for deleted
// proxies. This value can be used to determine if a workspace proxy is healthy
// and ready to use.
Status WorkspaceProxyStatus `json:"status,omitempty" table:"status"`
}
type CreateWorkspaceProxyRequest struct {

View File

@ -1185,6 +1185,14 @@ curl -X GET http://coder-server:8080/api/v2/workspaceproxies \
"icon": "string",
"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
"name": "string",
"status": {
"checked_at": "2019-08-24T14:15:22Z",
"report": {
"errors": ["string"],
"warnings": ["string"]
},
"status": "reachable"
},
"updated_at": "2019-08-24T14:15:22Z",
"url": "string",
"wildcard_hostname": "string"
@ -1202,17 +1210,32 @@ curl -X GET http://coder-server:8080/api/v2/workspaceproxies \
Status Code **200**
| Name | Type | Required | Restrictions | Description |
| --------------------- | ----------------- | -------- | ------------ | -------------------------------------------------------------------------------------- |
| `[array item]` | array | false | | |
| `» created_at` | string(date-time) | false | | |
| `» deleted` | boolean | false | | |
| `» icon` | string | false | | |
| `» id` | string(uuid) | false | | |
| `» name` | string | false | | |
| `» updated_at` | string(date-time) | false | | |
| `» url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com |
| `» wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com |
| Name | Type | Required | Restrictions | Description |
| --------------------- | ------------------------------------------------------------------------ | -------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `[array item]` | array | false | | |
| `» created_at` | string(date-time) | false | | |
| `» deleted` | boolean | false | | |
| `» icon` | string | false | | |
| `» id` | string(uuid) | false | | |
| `» name` | string | false | | |
| `» status` | [codersdk.WorkspaceProxyStatus](schemas.md#codersdkworkspaceproxystatus) | false | | Status is the latest status check of the proxy. This will be empty for deleted proxies. This value can be used to determine if a workspace proxy is healthy and ready to use. |
| `»» checked_at` | string(date-time) | false | | |
| `»» report` | [codersdk.ProxyHealthReport](schemas.md#codersdkproxyhealthreport) | false | | Report provides more information about the health of the workspace proxy. |
| `»»» errors` | array | false | | Errors are problems that prevent the workspace proxy from being healthy |
| `»»» warnings` | array | false | | Warnings do not prevent the workspace proxy from being healthy, but should be addressed. |
| `»» status` | [codersdk.ProxyHealthStatus](schemas.md#codersdkproxyhealthstatus) | false | | |
| `» updated_at` | string(date-time) | false | | |
| `» url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com |
| `» wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com |
#### Enumerated Values
| Property | Value |
| -------- | -------------- |
| `status` | `reachable` |
| `status` | `unreachable` |
| `status` | `unhealthy` |
| `status` | `unregistered` |
To perform this operation, you must be authenticated. [Learn more](authentication.md).
@ -1257,6 +1280,14 @@ curl -X POST http://coder-server:8080/api/v2/workspaceproxies \
"icon": "string",
"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
"name": "string",
"status": {
"checked_at": "2019-08-24T14:15:22Z",
"report": {
"errors": ["string"],
"warnings": ["string"]
},
"status": "reachable"
},
"updated_at": "2019-08-24T14:15:22Z",
"url": "string",
"wildcard_hostname": "string"

View File

@ -3382,6 +3382,39 @@ Parameter represents a set value for the scope.
| ------ |
| `file` |
## codersdk.ProxyHealthReport
```json
{
"errors": ["string"],
"warnings": ["string"]
}
```
### Properties
| Name | Type | Required | Restrictions | Description |
| ---------- | --------------- | -------- | ------------ | ---------------------------------------------------------------------------------------- |
| `errors` | array of string | false | | Errors are problems that prevent the workspace proxy from being healthy |
| `warnings` | array of string | false | | Warnings do not prevent the workspace proxy from being healthy, but should be addressed. |
## codersdk.ProxyHealthStatus
```json
"reachable"
```
### Properties
#### Enumerated Values
| Value |
| -------------- |
| `reachable` |
| `unreachable` |
| `unhealthy` |
| `unregistered` |
## codersdk.PutExtendWorkspaceRequest
```json
@ -5192,6 +5225,14 @@ Parameter represents a set value for the scope.
"icon": "string",
"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
"name": "string",
"status": {
"checked_at": "2019-08-24T14:15:22Z",
"report": {
"errors": ["string"],
"warnings": ["string"]
},
"status": "reachable"
},
"updated_at": "2019-08-24T14:15:22Z",
"url": "string",
"wildcard_hostname": "string"
@ -5200,16 +5241,38 @@ Parameter represents a set value for the scope.
### Properties
| Name | Type | Required | Restrictions | Description |
| ------------------- | ------- | -------- | ------------ | -------------------------------------------------------------------------------------- |
| `created_at` | string | false | | |
| `deleted` | boolean | false | | |
| `icon` | string | false | | |
| `id` | string | false | | |
| `name` | string | false | | |
| `updated_at` | string | false | | |
| `url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com |
| `wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com |
| Name | Type | Required | Restrictions | Description |
| ------------------- | -------------------------------------------------------------- | -------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `created_at` | string | false | | |
| `deleted` | boolean | false | | |
| `icon` | string | false | | |
| `id` | string | false | | |
| `name` | string | false | | |
| `status` | [codersdk.WorkspaceProxyStatus](#codersdkworkspaceproxystatus) | false | | Status is the latest status check of the proxy. This will be empty for deleted proxies. This value can be used to determine if a workspace proxy is healthy and ready to use. |
| `updated_at` | string | false | | |
| `url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com |
| `wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com |
## codersdk.WorkspaceProxyStatus
```json
{
"checked_at": "2019-08-24T14:15:22Z",
"report": {
"errors": ["string"],
"warnings": ["string"]
},
"status": "reachable"
}
```
### Properties
| Name | Type | Required | Restrictions | Description |
| ------------ | -------------------------------------------------------- | -------- | ------------ | ------------------------------------------------------------------------- |
| `checked_at` | string | false | | |
| `report` | [codersdk.ProxyHealthReport](#codersdkproxyhealthreport) | false | | Report provides more information about the health of the workspace proxy. |
| `status` | [codersdk.ProxyHealthStatus](#codersdkproxyhealthstatus) | false | | |
## codersdk.WorkspaceQuota

View File

@ -227,6 +227,7 @@ func (*RootCmd) proxyServer() *clibase.Cmd {
proxy, err := wsproxy.New(ctx, &wsproxy.Options{
Logger: logger,
HTTPClient: httpClient,
DashboardURL: primaryAccessURL.Value(),
AccessURL: cfg.AccessURL.Value(),
AppHostname: appHostname,

View File

@ -24,6 +24,7 @@ import (
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/enterprise/coderd/license"
"github.com/coder/coder/enterprise/coderd/proxyhealth"
"github.com/coder/coder/enterprise/derpmesh"
"github.com/coder/coder/enterprise/replicasync"
"github.com/coder/coder/enterprise/tailnet"
@ -52,9 +53,11 @@ func New(ctx context.Context, options *Options) (*API, error) {
}
ctx, cancelFunc := context.WithCancel(ctx)
api := &API{
AGPL: coderd.New(options.Options),
Options: options,
cancelEntitlementsLoop: cancelFunc,
ctx: ctx,
cancel: cancelFunc,
AGPL: coderd.New(options.Options),
Options: options,
}
api.AGPL.Options.SetUserGroups = api.setUserGroups
@ -226,6 +229,24 @@ func New(ctx context.Context, options *Options) (*API, error) {
}
api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, meshTLSConfig)
if api.AGPL.Experiments.Enabled(codersdk.ExperimentMoons) {
// Proxy health is a moon feature.
api.proxyHealth, err = proxyhealth.New(&proxyhealth.Options{
Interval: time.Second * 5,
DB: api.Database,
Logger: options.Logger.Named("proxyhealth"),
Client: api.HTTPClient,
Prometheus: api.PrometheusRegistry,
})
if err != nil {
return nil, xerrors.Errorf("initialize proxy health: %w", err)
}
go api.proxyHealth.Run(ctx)
// Force the initial loading of the cache. Do this in a go routine in case
// the calls to the workspace proxies hang and this takes some time.
go api.forceWorkspaceProxyHealthUpdate(ctx)
}
err = api.updateEntitlements(ctx)
if err != nil {
return nil, xerrors.Errorf("update entitlements: %w", err)
@ -249,6 +270,7 @@ type Options struct {
DERPServerRegionID int
EntitlementsUpdateInterval time.Duration
ProxyHealthInterval time.Duration
Keys map[string]ed25519.PublicKey
}
@ -256,18 +278,24 @@ type API struct {
AGPL *coderd.API
*Options
// ctx is canceled immediately on shutdown, it can be used to abort
// interruptible tasks.
ctx context.Context
cancel context.CancelFunc
// Detects multiple Coder replicas running at the same time.
replicaManager *replicasync.Manager
// Meshes DERP connections from multiple replicas.
derpMesh *derpmesh.Mesh
// proxyHealth checks the reachability of all workspace proxies.
proxyHealth *proxyhealth.ProxyHealth
cancelEntitlementsLoop func()
entitlementsMu sync.RWMutex
entitlements codersdk.Entitlements
entitlementsMu sync.RWMutex
entitlements codersdk.Entitlements
}
func (api *API) Close() error {
api.cancelEntitlementsLoop()
api.cancel()
_ = api.replicaManager.Close()
_ = api.derpMesh.Close()
return api.AGPL.Close()

View File

@ -0,0 +1,292 @@
package proxyhealth
import (
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbauthz"
"github.com/coder/coder/coderd/prometheusmetrics"
"github.com/coder/coder/codersdk"
)
// Status is the health classification that a health check assigns to a single
// workspace proxy. It is stored on ProxyStatus.
type Status string
const (
// Unknown should never be returned by the proxy health check. It is the
// initial value of every status before the check for that proxy has run.
Unknown Status = "unknown"
// Healthy means the proxy access url is reachable and returns a healthy
// status code.
Healthy Status = "ok"
// Unreachable means the proxy access url is not responding.
Unreachable Status = "unreachable"
// Unhealthy means the proxy access url is responding, but there is some
// problem with the proxy. This problem may or may not be preventing functionality.
Unhealthy Status = "unhealthy"
// Unregistered means the proxy has not registered a url yet. This means
// the proxy was created with the cli, but has not yet been started.
Unregistered Status = "unregistered"
)
// Options configures a ProxyHealth checker. See New for how unset fields are
// defaulted.
type Options struct {
// Interval is the interval at which the proxy health is checked.
// New falls back to one minute when this is zero or negative.
Interval time.Duration
// DB is the store the workspace proxies are read from. It is required; New
// returns an error when it is nil.
DB database.Store
// Logger receives an error entry whenever a periodic health check fails.
Logger slog.Logger
// Client is the HTTP client used to reach the proxies. When nil,
// http.DefaultClient is used. New works on a copy of the client with a five
// second timeout.
Client *http.Client
// Prometheus is the registry the health metrics are registered with. When
// nil, New creates a fresh registry.
Prometheus *prometheus.Registry
}
// ProxyHealth runs a goroutine that periodically checks the health of all
// workspace proxies. This information is stored in memory, so each coderd
// replica has its own view of the health of the proxies. These views should be
// consistent, and if they are not, it indicates a problem.
type ProxyHealth struct {
// db is where the list of workspace proxies is read from on every check.
db database.Store
// interval is the period of the ticker that drives Run.
interval time.Duration
logger slog.Logger
// client is used for the health request sent to each proxy.
client *http.Client
// cache points at the result map of the most recent successful check. Run and
// ForceUpdate replace the pointer; HealthStatus reads it.
cache *atomic.Pointer[map[uuid.UUID]ProxyStatus]
// PromMetrics
// healthCheckDuration is observed once per health check run.
healthCheckDuration prometheus.Histogram
// healthCheckResults holds one gauge value per proxy, labeled by "proxy_id".
healthCheckResults *prometheusmetrics.CachedGaugeVec
}
// New validates opts and builds a ProxyHealth checker. It does not start any
// background work; call Run for periodic checks or ForceUpdate for a single
// check.
//
// Defaults applied to unset options:
//   - Interval <= 0 becomes one minute.
//   - A nil Prometheus registry is replaced with a fresh registry.
//   - A nil Client is replaced with http.DefaultClient.
//
// The HTTP client is always shallow-copied and the copy is given a five second
// timeout, so the caller's client is never modified. The caller's Options
// value is not modified either.
//
// An error is returned when opts or opts.DB is nil, or when one of the metrics
// cannot be registered (for example because it is already registered on the
// given registry).
func New(opts *Options) (*ProxyHealth, error) {
	if opts == nil {
		return nil, xerrors.Errorf("options are required")
	}
	if opts.DB == nil {
		return nil, xerrors.Errorf("db is required")
	}

	// Resolve defaults into locals so the caller's Options is left untouched.
	interval := opts.Interval
	if interval <= 0 {
		interval = time.Minute
	}
	registry := opts.Prometheus
	if registry == nil {
		registry = prometheus.NewRegistry()
	}
	baseClient := opts.Client
	if baseClient == nil {
		baseClient = http.DefaultClient
	}

	// Set a timeout on the client, so we don't wait forever for a healthz
	// response. The timeout is set on a copy so the shared client (possibly
	// http.DefaultClient) keeps its own configuration.
	client := *baseClient
	client.Timeout = time.Second * 5

	// Prometheus metrics
	healthCheckDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "proxyhealth",
		Name:      "health_check_duration_seconds",
		Help:      "Histogram for duration of proxy health collection in seconds.",
		Buckets:   []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
	})
	healthCheckResults := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "coderd",
			Subsystem: "proxyhealth",
			Name:      "health_check_results",
			Help: "This endpoint returns a number to indicate the health status. " +
				"-3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy)",
		}, []string{"proxy_id"}))

	// This constructor already reports failures through its error return, so
	// use Register instead of MustRegister: a duplicate registration becomes an
	// error the caller can handle rather than a panic.
	if err := registry.Register(healthCheckDuration); err != nil {
		return nil, xerrors.Errorf("register health check duration metric: %w", err)
	}
	if err := registry.Register(healthCheckResults); err != nil {
		// Do not leave a half-registered set of metrics behind.
		registry.Unregister(healthCheckDuration)
		return nil, xerrors.Errorf("register health check results metric: %w", err)
	}

	return &ProxyHealth{
		db:                  opts.DB,
		interval:            interval,
		logger:              opts.Logger,
		client:              &client,
		cache:               &atomic.Pointer[map[uuid.UUID]ProxyStatus]{},
		healthCheckDuration: healthCheckDuration,
		healthCheckResults:  healthCheckResults,
	}, nil
}
// Run drives the periodic health check and only returns once ctx is canceled.
// On every tick of the configured interval it runs one check of all proxies.
// A successful result replaces the cache; a failed run is logged and the
// previous cache contents are kept.
func (p *ProxyHealth) Run(ctx context.Context) {
	ticker := time.NewTicker(p.interval)
	defer ticker.Stop()

	for {
		select {
		case tick := <-ticker.C:
			result, err := p.runOnce(ctx, tick)
			if err == nil {
				// Publish the fresh result for readers of the cache.
				p.cache.Store(&result)
			} else {
				p.logger.Error(ctx, "proxy health check failed", slog.Error(err))
			}
		case <-ctx.Done():
			return
		}
	}
}
// ForceUpdate performs one health check right now, using the current time as
// the check time. On success the cache is replaced with the new result and nil
// is returned. On failure the cache keeps its previous contents and the error
// is returned. This is useful to trigger an update when a proxy is created or
// deleted.
func (p *ProxyHealth) ForceUpdate(ctx context.Context) error {
	latest, err := p.runOnce(ctx, time.Now())
	if err == nil {
		// Only a successful run may overwrite the cached view.
		p.cache.Store(&latest)
	}
	return err
}
// HealthStatus returns the most recently cached health status of all proxies,
// keyed by proxy ID. Before the first successful check has been stored, an
// empty (non-nil) map is returned.
func (p *ProxyHealth) HealthStatus() map[uuid.UUID]ProxyStatus {
	if cached := p.cache.Load(); cached != nil {
		return *cached
	}
	return map[uuid.UUID]ProxyStatus{}
}
// ProxyStatus is the outcome of one health check for one workspace proxy.
type ProxyStatus struct {
// Proxy is the value of the proxy at the time of checking. This is
// useful to know as it helps determine if the proxy checked has different values
// than the proxy in hand. AKA if the proxy was updated, and the status was for
// an older proxy.
Proxy database.WorkspaceProxy
// Status is the health classification decided by the check.
Status Status
// Report carries the errors and warnings for this proxy: either the report
// decoded from the proxy's response, or errors filled in by the checker when
// the request or decoding failed.
Report codersdk.ProxyHealthReport
// CheckedAt is the time passed to the health check run that produced this
// status.
CheckedAt time.Time
}
// runOnce runs the health check for all workspace proxies and returns the
// resulting status per proxy ID. Deleted proxies are skipped.
//
// For each remaining proxy:
//   - an empty URL yields Unregistered;
//   - otherwise "<url>/healthz-report" is requested. A 200 response with a
//     decodable report and no errors is Healthy; a 200 response with report
//     errors or an undecodable body, or any other status code, is Unhealthy; a
//     failed request is Unreachable.
//
// Checks run concurrently with at most five in flight. The per-proxy gauge is
// updated for every checked proxy and committed once all checks finished, and
// the total duration (measured from now) is recorded in the duration
// histogram.
//
// If there is an unexpected error (listing the proxies fails or a request
// cannot be constructed), an error is returned and no statuses are returned.
// Expected errors will mark a proxy as unreachable or unhealthy instead.
func (p *ProxyHealth) runOnce(ctx context.Context, now time.Time) (map[uuid.UUID]ProxyStatus, error) {
	// Record from the given time. The observation must live inside a closure:
	// arguments of a deferred call are evaluated when the defer statement
	// executes, so deferring Observe(time.Since(now).Seconds()) directly would
	// record the elapsed time at function entry instead of at function exit.
	defer func() {
		p.healthCheckDuration.Observe(time.Since(now).Seconds())
	}()

	//nolint:gocritic // Proxy health is a system service.
	proxies, err := p.db.GetWorkspaceProxies(dbauthz.AsSystemRestricted(ctx))
	if err != nil {
		return nil, xerrors.Errorf("get workspace proxies: %w", err)
	}

	// Just use a mutex to protect map writes.
	var statusMu sync.Mutex
	proxyStatus := map[uuid.UUID]ProxyStatus{}

	grp, gctx := errgroup.WithContext(ctx)
	// Arbitrary parallelism limit.
	grp.SetLimit(5)

	for _, proxy := range proxies {
		if proxy.Deleted {
			// Ignore deleted proxies.
			continue
		}
		// Each proxy needs to have a status set. Make a local copy for the
		// call to be run async.
		proxy := proxy
		status := ProxyStatus{
			Proxy:     proxy,
			CheckedAt: now,
			Status:    Unknown,
		}

		grp.Go(func() error {
			if proxy.Url == "" {
				// Empty URL means the proxy has not registered yet.
				// When the proxy is started, it will update the url.
				statusMu.Lock()
				defer statusMu.Unlock()
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, proxy.ID.String())
				status.Status = Unregistered
				proxyStatus[proxy.ID] = status
				return nil
			}

			// Try to hit the healthz-report endpoint for a comprehensive health check.
			reqURL := fmt.Sprintf("%s/healthz-report", strings.TrimSuffix(proxy.Url, "/"))
			// The request is bound to the group context, so it is aborted
			// when the parent context is canceled or another check returned
			// an error.
			req, err := http.NewRequestWithContext(gctx, http.MethodGet, reqURL, nil)
			if err != nil {
				return xerrors.Errorf("new request: %w", err)
			}

			resp, err := p.client.Do(req)
			if err == nil {
				defer resp.Body.Close()
			}
			// A switch statement felt easier to categorize the different cases than
			// if else statements or nested if statements.
			switch {
			case err == nil && resp.StatusCode == http.StatusOK:
				err := json.NewDecoder(resp.Body).Decode(&status.Report)
				if err != nil {
					// If we cannot read the report, mark the proxy as unhealthy.
					status.Report.Errors = []string{fmt.Sprintf("failed to decode health report: %s", err.Error())}
					status.Status = Unhealthy
					break
				}
				if len(status.Report.Errors) > 0 {
					status.Status = Unhealthy
					break
				}
				status.Status = Healthy
			case err == nil && resp.StatusCode != http.StatusOK:
				// Unhealthy as we did reach the proxy but it got an unexpected response.
				status.Status = Unhealthy
				status.Report.Errors = []string{fmt.Sprintf("unexpected status code %d", resp.StatusCode)}
			case err != nil:
				// Request failed, mark the proxy as unreachable.
				status.Status = Unreachable
				status.Report.Errors = []string{fmt.Sprintf("request to proxy failed: %s", err.Error())}
			default:
				// This should never happen
				status.Status = Unknown
			}

			// Set the prometheus metric correctly.
			switch status.Status {
			case Healthy:
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 1, proxy.ID.String())
			case Unhealthy:
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -1, proxy.ID.String())
			case Unreachable:
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -2, proxy.ID.String())
			default:
				// Unknown
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -3, proxy.ID.String())
			}

			statusMu.Lock()
			defer statusMu.Unlock()
			proxyStatus[proxy.ID] = status
			return nil
		})
	}

	err = grp.Wait()
	if err != nil {
		return nil, xerrors.Errorf("group run: %w", err)
	}
	p.healthCheckResults.Commit()

	return proxyStatus, nil
}

View File

@ -0,0 +1,174 @@
package proxyhealth_test
import (
"context"
"net"
"net/http"
"net/http/httptest"
"testing"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbfake"
"github.com/coder/coder/coderd/database/dbgen"
"github.com/coder/coder/coderd/httpapi"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/enterprise/coderd/proxyhealth"
"github.com/coder/coder/testutil"
)
// insertProxy creates a workspace proxy in db and then registers it with the
// given url and an empty wildcard hostname. The test fails immediately if the
// registration returns an error.
//
// The returned value is the record produced by dbgen.WorkspaceProxy, i.e. the
// proxy as it was before the registration call; the result of the registration
// itself is discarded.
func insertProxy(t *testing.T, db database.Store, url string) database.WorkspaceProxy {
	t.Helper()

	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
	defer cancel()

	created, _ := dbgen.WorkspaceProxy(t, db, database.WorkspaceProxy{})

	params := database.RegisterWorkspaceProxyParams{
		ID:               created.ID,
		Url:              url,
		WildcardHostname: "",
	}
	_, err := db.RegisterWorkspaceProxy(ctx, params)
	require.NoError(t, err, "failed to update proxy")

	return created
}
// TestProxyHealth_Unregistered verifies that proxies registered with an empty
// URL are reported as Unregistered after a forced health check.
func TestProxyHealth_Unregistered(t *testing.T) {
	t.Parallel()
	db := dbfake.New()

	proxies := []database.WorkspaceProxy{
		insertProxy(t, db, ""),
		insertProxy(t, db, ""),
	}

	ph, err := proxyhealth.New(&proxyhealth.Options{
		Interval: 0,
		DB:       db,
		Logger:   slogtest.Make(t, nil),
	})
	require.NoError(t, err, "failed to create proxy health")

	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
	defer cancel()

	err = ph.ForceUpdate(ctx)
	require.NoError(t, err, "failed to force update")

	statuses := ph.HealthStatus()
	for _, p := range proxies {
		// require.Equal takes the expected value first, then the actual value.
		require.Equal(t, proxyhealth.Unregistered, statuses[p.ID].Status, "expect unregistered proxy")
	}
}
// TestProxyHealth_Unhealthy verifies the two ways a responding proxy is
// classified as Unhealthy: a 200 response whose report contains errors, and a
// response with a non-200 status code.
func TestProxyHealth_Unhealthy(t *testing.T) {
	t.Parallel()
	db := dbfake.New()

	// Responds with 200 but the report lists an error.
	srvBadReport := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		httpapi.Write(context.Background(), w, http.StatusOK, codersdk.ProxyHealthReport{
			Errors: []string{"We have a problem!"},
		})
	}))
	defer srvBadReport.Close()

	// Responds with a non-200 status code and no report.
	srvBadCode := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusBadRequest)
	}))
	defer srvBadCode.Close()

	proxies := []database.WorkspaceProxy{
		// One proxy per failure mode; both must end up unhealthy.
		insertProxy(t, db, srvBadReport.URL),
		insertProxy(t, db, srvBadCode.URL),
	}

	ph, err := proxyhealth.New(&proxyhealth.Options{
		Interval: 0,
		DB:       db,
		Logger:   slogtest.Make(t, nil),
		Client:   srvBadReport.Client(),
	})
	require.NoError(t, err, "failed to create proxy health")

	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
	defer cancel()

	err = ph.ForceUpdate(ctx)
	require.NoError(t, err, "failed to force update")

	statuses := ph.HealthStatus()
	for _, p := range proxies {
		// require.Equal takes the expected value first, then the actual value.
		require.Equal(t, proxyhealth.Unhealthy, statuses[p.ID].Status, "expect unhealthy proxy")
	}
}
// TestProxyHealth_Reachable verifies that a proxy answering 200 with a report
// that contains only warnings (no errors) is classified as Healthy.
func TestProxyHealth_Reachable(t *testing.T) {
	t.Parallel()
	db := dbfake.New()

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		httpapi.Write(context.Background(), w, http.StatusOK, codersdk.ProxyHealthReport{
			Warnings: []string{"No problems, just a warning"},
		})
	}))
	defer srv.Close()

	proxies := []database.WorkspaceProxy{
		// Same url for both, just checking multiple proxies are checked.
		insertProxy(t, db, srv.URL),
		insertProxy(t, db, srv.URL),
	}

	ph, err := proxyhealth.New(&proxyhealth.Options{
		Interval: 0,
		DB:       db,
		Logger:   slogtest.Make(t, nil),
		Client:   srv.Client(),
	})
	require.NoError(t, err, "failed to create proxy health")

	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
	defer cancel()

	err = ph.ForceUpdate(ctx)
	require.NoError(t, err, "failed to force update")

	statuses := ph.HealthStatus()
	for _, p := range proxies {
		// require.Equal takes the expected value first, then the actual value.
		require.Equal(t, proxyhealth.Healthy, statuses[p.ID].Status, "expect reachable proxy")
	}
}
// TestProxyHealth_Unreachable verifies that proxies are classified as
// Unreachable when the HTTP request to them fails. The failure is simulated
// with a client whose dialer always returns an error, so no network traffic is
// attempted.
func TestProxyHealth_Unreachable(t *testing.T) {
	t.Parallel()
	db := dbfake.New()

	cli := &http.Client{
		Transport: &http.Transport{
			DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
				return nil, xerrors.New("Always fail")
			},
		},
	}

	proxies := []database.WorkspaceProxy{
		// example.com is a real domain, but the client should always fail.
		insertProxy(t, db, "https://example.com"),
		insertProxy(t, db, "https://random.example.com"),
	}

	ph, err := proxyhealth.New(&proxyhealth.Options{
		Interval: 0,
		DB:       db,
		Logger:   slogtest.Make(t, nil),
		Client:   cli,
	})
	require.NoError(t, err, "failed to create proxy health")

	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
	defer cancel()

	err = ph.ForceUpdate(ctx)
	require.NoError(t, err, "failed to force update")

	statuses := ph.HealthStatus()
	for _, p := range proxies {
		// require.Equal takes the expected value first, then the actual value.
		require.Equal(t, proxyhealth.Unreachable, statuses[p.ID].Status, "expect unreachable proxy")
	}
}

View File

@ -1,15 +1,18 @@
package coderd
import (
"context"
"crypto/sha256"
"database/sql"
"fmt"
"net/http"
"net/url"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog"
agpl "github.com/coder/coder/coderd"
"github.com/coder/coder/coderd/audit"
"github.com/coder/coder/coderd/database"
@ -19,9 +22,18 @@ import (
"github.com/coder/coder/coderd/workspaceapps"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/cryptorand"
"github.com/coder/coder/enterprise/coderd/proxyhealth"
"github.com/coder/coder/enterprise/wsproxy/wsproxysdk"
)
// forceWorkspaceProxyHealthUpdate forces an update of the proxy health.
// This is useful when a proxy is created or deleted. Errors will be logged.
func (api *API) forceWorkspaceProxyHealthUpdate(ctx context.Context) {
if err := api.proxyHealth.ForceUpdate(ctx); err != nil {
api.Logger.Error(ctx, "force proxy health update", slog.Error(err))
}
}
// @Summary Delete workspace proxy
// @ID delete-workspace-proxy
// @Security CoderSessionToken
@ -62,6 +74,9 @@ func (api *API) deleteWorkspaceProxy(rw http.ResponseWriter, r *http.Request) {
httpapi.Write(ctx, rw, http.StatusOK, codersdk.Response{
Message: "Proxy has been deleted!",
})
// Update the proxy health cache to remove this proxy.
go api.forceWorkspaceProxyHealthUpdate(api.ctx)
}
// @Summary Create workspace proxy
@ -122,9 +137,16 @@ func (api *API) postWorkspaceProxy(rw http.ResponseWriter, r *http.Request) {
aReq.New = proxy
httpapi.Write(ctx, rw, http.StatusCreated, codersdk.CreateWorkspaceProxyResponse{
Proxy: convertProxy(proxy),
Proxy: convertProxy(proxy, proxyhealth.ProxyStatus{
Proxy: proxy,
CheckedAt: time.Now(),
Status: proxyhealth.Unregistered,
}),
ProxyToken: fullToken,
})
// Update the proxy health cache to include this new proxy.
go api.forceWorkspaceProxyHealthUpdate(api.ctx)
}
// nolint:revive
@ -158,28 +180,8 @@ func (api *API) workspaceProxies(rw http.ResponseWriter, r *http.Request) {
return
}
httpapi.Write(ctx, rw, http.StatusOK, convertProxies(proxies))
}
func convertProxies(p []database.WorkspaceProxy) []codersdk.WorkspaceProxy {
resp := make([]codersdk.WorkspaceProxy, 0, len(p))
for _, proxy := range p {
resp = append(resp, convertProxy(proxy))
}
return resp
}
func convertProxy(p database.WorkspaceProxy) codersdk.WorkspaceProxy {
return codersdk.WorkspaceProxy{
ID: p.ID,
Name: p.Name,
Icon: p.Icon,
URL: p.Url,
WildcardHostname: p.WildcardHostname,
CreatedAt: p.CreatedAt,
UpdatedAt: p.UpdatedAt,
Deleted: p.Deleted,
}
statues := api.proxyHealth.HealthStatus()
httpapi.Write(ctx, rw, http.StatusOK, convertProxies(proxies, statues))
}
// @Summary Issue signed workspace app token
@ -295,6 +297,8 @@ func (api *API) workspaceProxyRegister(rw http.ResponseWriter, r *http.Request)
httpapi.Write(ctx, rw, http.StatusCreated, wsproxysdk.RegisterWorkspaceProxyResponse{
AppSecurityKey: api.AppSecurityKey.String(),
})
go api.forceWorkspaceProxyHealthUpdate(api.ctx)
}
// reconnectingPTYSignedToken issues a signed app token for use when connecting
@ -392,3 +396,29 @@ func (api *API) reconnectingPTYSignedToken(rw http.ResponseWriter, r *http.Reque
SignedToken: tokenStr,
})
}
// convertProxies converts a list of database proxies into their API form,
// attaching to each one the health status stored under its ID in statuses.
// A proxy with no entry in statuses is converted with a zero-value status.
// The returned slice is never nil and has the same length and order as p.
func convertProxies(p []database.WorkspaceProxy, statuses map[uuid.UUID]proxyhealth.ProxyStatus) []codersdk.WorkspaceProxy {
	converted := make([]codersdk.WorkspaceProxy, len(p))
	for i := range p {
		converted[i] = convertProxy(p[i], statuses[p[i].ID])
	}
	return converted
}
// convertProxy builds the API representation of a single workspace proxy from
// its database row and the given health status. The status, report and
// check time are copied from status into the nested Status field.
func convertProxy(p database.WorkspaceProxy, status proxyhealth.ProxyStatus) codersdk.WorkspaceProxy {
	// Assemble the health portion first so the proxy literal below stays flat.
	health := codersdk.WorkspaceProxyStatus{
		Status:    codersdk.ProxyHealthStatus(status.Status),
		Report:    status.Report,
		CheckedAt: status.CheckedAt,
	}

	return codersdk.WorkspaceProxy{
		ID:               p.ID,
		Name:             p.Name,
		Icon:             p.Icon,
		URL:              p.Url,
		WildcardHostname: p.WildcardHostname,
		CreatedAt:        p.CreatedAt,
		UpdatedAt:        p.UpdatedAt,
		Deleted:          p.Deleted,
		Status:           health,
	}
}

View File

@ -60,7 +60,7 @@ func TestWorkspaceProxyCRUD(t *testing.T) {
proxies, err := client.WorkspaceProxies(ctx)
require.NoError(t, err)
require.Len(t, proxies, 1)
require.Equal(t, proxyRes.Proxy, proxies[0])
require.Equal(t, proxyRes.Proxy.ID, proxies[0].ID)
require.NotEmpty(t, proxyRes.ProxyToken)
})

View File

@ -2,6 +2,7 @@ package wsproxy
import (
"context"
"fmt"
"net/http"
"net/url"
"reflect"
@ -30,6 +31,7 @@ import (
type Options struct {
Logger slog.Logger
HTTPClient *http.Client
// DashboardURL is the URL of the primary coderd instance.
DashboardURL *url.URL
// AccessURL is the URL of the WorkspaceProxy.
@ -120,6 +122,11 @@ func New(ctx context.Context, opts *Options) (*Server, error) {
return nil, xerrors.Errorf("set client token: %w", err)
}
// Use the configured client if provided.
if opts.HTTPClient != nil {
client.SDKClient.HTTPClient = opts.HTTPClient
}
// TODO: Probably do some version checking here
info, err := client.SDKClient.BuildInfo(ctx)
if err != nil {
@ -224,6 +231,8 @@ func New(ctx context.Context, opts *Options) (*Server, error) {
r.Get("/buildinfo", s.buildInfo)
r.Get("/healthz", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("OK")) })
// TODO: @emyrk should this be authenticated or debounced?
r.Get("/healthz-report", s.healthReport)
return s, nil
}
@ -246,6 +255,46 @@ func (s *Server) buildInfo(rw http.ResponseWriter, r *http.Request) {
})
}
// healthReport serves a deeper health check than the plain '/healthz'
// endpoint. Besides confirming the server is up, it runs diagnostics against
// the primary coderd, so it takes longer to answer than '/healthz'. The
// primary coderd uses the result to decide whether this workspace proxy can be
// offered to users.
//
// Checks performed:
//   - The primary coderd's build info can be fetched (error if not).
//   - The configured dashboard URL is not itself a workspace proxy (error).
//   - The primary and proxy versions match, outside of dev builds (warning).
//
// The response status is always 200; problems are carried in the report body.
//
// TODO: Config checks to ensure consistent with primary
func (s *Server) healthReport(rw http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	report := codersdk.ProxyHealthReport{}

	// Hit the build info to do basic version checking. If the primary cannot
	// be reached, no further check is possible, so respond right away.
	primary, err := s.SDKClient.SDKClient.BuildInfo(ctx)
	if err != nil {
		msg := fmt.Sprintf("failed to get build info: %s", err.Error())
		report.Errors = append(report.Errors, msg)
		httpapi.Write(ctx, rw, http.StatusOK, report)
		return
	}

	// This could be a simple mistake of using a proxy url as the dashboard url.
	if primary.WorkspaceProxy {
		msg := fmt.Sprintf("dashboard url (%s) is a workspace proxy, must be a primary coderd", s.DashboardURL.String())
		report.Errors = append(report.Errors, msg)
	}

	// Dev builds never check versions. A mismatch is not fatal, but it
	// should be reported.
	if !(buildinfo.IsDev() || buildinfo.VersionsMatch(primary.Version, buildinfo.Version())) {
		msg := fmt.Sprintf("version mismatch: primary coderd (%s) != workspace proxy (%s)", primary.Version, buildinfo.Version())
		report.Warnings = append(report.Warnings, msg)
	}

	// TODO: We should hit the deployment config endpoint and do some config
	// checks. We can check the version from the X-CODER-BUILD-VERSION header
	httpapi.Write(ctx, rw, http.StatusOK, report)
}
type optErrors []error
func (e optErrors) Error() string {

View File

@ -690,6 +690,12 @@ export interface ProvisionerJobLog {
readonly output: string
}
// From codersdk/workspaceproxy.go
/**
 * Health diagnostics reported for a workspace proxy.
 */
export interface ProxyHealthReport {
/** Problems that prevent the workspace proxy from being healthy. */
readonly Errors: string[]
/**
 * Problems that do not prevent the workspace proxy from being healthy, but
 * should be addressed.
 */
readonly Warnings: string[]
}
// From codersdk/workspaces.go
export interface PutExtendWorkspaceRequest {
readonly deadline: string
@ -1242,6 +1248,7 @@ export interface WorkspaceProxy {
readonly created_at: string
readonly updated_at: string
readonly deleted: boolean
readonly status?: WorkspaceProxyStatus
}
// From codersdk/deployment.go
@ -1250,6 +1257,13 @@ export interface WorkspaceProxyBuildInfo {
readonly dashboard_url: string
}
// From codersdk/workspaceproxy.go
/**
 * Result of a health check of a workspace proxy.
 */
export interface WorkspaceProxyStatus {
/** Health state of the proxy; one of the ProxyHealthStatus values. */
readonly status: ProxyHealthStatus
/** Optional report with more information about the health of the proxy. */
readonly report?: ProxyHealthReport
/** When the check was performed, as a date-time string. */
readonly checked_at: string
}
// From codersdk/workspaces.go
export interface WorkspaceQuota {
readonly credits_consumed: number
@ -1447,6 +1461,19 @@ export const ProvisionerStorageMethods: ProvisionerStorageMethod[] = ["file"]
// ProvisionerType is the union of provisioner identifiers; ProvisionerTypes
// lists each member of that union as a runtime array.
export type ProvisionerType = "echo" | "terraform"
export const ProvisionerTypes: ProvisionerType[] = ["echo", "terraform"]
// From codersdk/workspaceproxy.go
/**
 * Health states that can be reported for a workspace proxy.
 */
export type ProxyHealthStatus =
| "reachable"
| "unhealthy"
| "unreachable"
| "unregistered"
/**
 * Every ProxyHealthStatus value as a runtime array, for iteration or
 * membership checks.
 */
export const ProxyHealthStatuses: ProxyHealthStatus[] = [
"reachable",
"unhealthy",
"unreachable",
"unregistered",
]
// From codersdk/rbacresources.go
export type RBACResource =
| "api_key"