chore: add prometheus monitoring of workspace traffic generation (#7583)

- Exposes reads/writes from scaletest traffic generation (default: 0.0.0.0:21112)
- Adds self-hosted prometheus with remote_write to loadtest terraform
- Adds convenience script to run a traffic generation test
Cian Johnston 2023-05-26 05:53:35 -07:00 committed by GitHub
parent 0fd2ea4044
commit 795050bba3
18 changed files with 630 additions and 120 deletions
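As a quick sanity check while a traffic test is running, the new endpoint can be scraped directly. A minimal sketch, assuming the test runs on the local machine, that the handler is served at the root of the listen address (it may instead be mounted under /metrics, depending on how ServeHandler mounts it), and the coderd_scaletest_* metric names introduced in this change:

```bash
# Sketch: scrape the scaletest metrics endpoint while a traffic test runs.
# The path and metric prefix below are assumptions based on this change.
curl -fsSL http://127.0.0.1:21112/ | grep '^coderd_scaletest_'
```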

.gitignore
View File

@ -59,3 +59,4 @@ site/stats/
./scaletest/terraform/.terraform
./scaletest/terraform/.terraform.lock.hcl
terraform.tfstate.*
**/*.tfvars

View File

@ -62,6 +62,7 @@ site/stats/
./scaletest/terraform/.terraform
./scaletest/terraform/.terraform.lock.hcl
terraform.tfstate.*
**/*.tfvars
# .prettierignore.include:
# Helm templates contain variables that are invalid YAML and can't be formatted
# by Prettier.

View File

@ -14,9 +14,14 @@ import (
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/otel/trace"
"golang.org/x/xerrors"
"cdr.dev/slog"
"cdr.dev/slog/sloggers/sloghuman"
"github.com/coder/coder/cli/clibase"
"github.com/coder/coder/cli/cliui"
"github.com/coder/coder/coderd/httpapi"
@ -896,8 +901,11 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd {
func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
var (
tickInterval time.Duration
bytesPerTick int64
tickInterval time.Duration
bytesPerTick int64
scaletestPrometheusAddress string
scaletestPrometheusWait time.Duration
client = &codersdk.Client{}
tracingFlags = &scaletestTracingFlags{}
strategy = &scaletestStrategyFlags{}
@ -913,6 +921,12 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
),
Handler: func(inv *clibase.Invocation) error {
ctx := inv.Context()
reg := prometheus.NewRegistry()
metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name")
logger := slog.Make(sloghuman.Sink(io.Discard))
prometheusSrvClose := ServeHandler(ctx, logger, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), scaletestPrometheusAddress, "prometheus")
defer prometheusSrvClose()
// Bypass rate limiting
client.HTTPClient = &http.Client{
@ -943,6 +957,9 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
_, _ = fmt.Fprintln(inv.Stderr, "\nUploading traces...")
if err := closeTracing(ctx); err != nil {
_, _ = fmt.Fprintf(inv.Stderr, "\nError uploading traces: %+v\n", err)
// Wait for prometheus metrics to be scraped
_, _ = fmt.Fprintf(inv.Stderr, "Waiting %s for prometheus metrics to be scraped\n", scaletestPrometheusWait)
<-time.After(scaletestPrometheusWait)
}
}()
tracer := tracerProvider.Tracer(scaletestTracerName)
@ -955,9 +972,10 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
th := harness.NewTestHarness(strategy.toStrategy(), cleanupStrategy.toStrategy())
for idx, ws := range workspaces {
var (
agentID uuid.UUID
name = "workspace-traffic"
id = strconv.Itoa(idx)
agentID uuid.UUID
agentName string
name = "workspace-traffic"
id = strconv.Itoa(idx)
)
for _, res := range ws.LatestBuild.Resources {
@ -965,6 +983,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
continue
}
agentID = res.Agents[0].ID
agentName = res.Agents[0].Name
}
if agentID == uuid.Nil {
@ -974,16 +993,20 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
// Setup our workspace agent connection.
config := workspacetraffic.Config{
AgentID: agentID,
BytesPerTick: bytesPerTick,
Duration: strategy.timeout,
TickInterval: tickInterval,
AgentID: agentID,
AgentName: agentName,
BytesPerTick: bytesPerTick,
Duration: strategy.timeout,
TickInterval: tickInterval,
WorkspaceName: ws.Name,
WorkspaceOwner: ws.OwnerName,
Registry: reg,
}
if err := config.Validate(); err != nil {
return xerrors.Errorf("validate config: %w", err)
}
var runner harness.Runnable = workspacetraffic.NewRunner(client, config)
var runner harness.Runnable = workspacetraffic.NewRunner(client, config, metrics)
if tracingEnabled {
runner = &runnableTraceWrapper{
tracer: tracer,
@ -1034,6 +1057,20 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
Description: "How often to send traffic.",
Value: clibase.DurationOf(&tickInterval),
},
{
Flag: "scaletest-prometheus-address",
Env: "CODER_SCALETEST_PROMETHEUS_ADDRESS",
Default: "0.0.0.0:21112",
Description: "Address on which to expose scaletest Prometheus metrics.",
Value: clibase.StringOf(&scaletestPrometheusAddress),
},
{
Flag: "scaletest-prometheus-wait",
Env: "CODER_SCALETEST_PROMETHEUS_WAIT",
Default: "5s",
Description: "How long to wait before exiting in order to allow Prometheus metrics to be scraped.",
Value: clibase.DurationOf(&scaletestPrometheusWait),
},
}
tracingFlags.attach(&cmd.Options)

View File

@ -8,17 +8,12 @@ import (
"path/filepath"
"testing"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/coder/coder/agent"
"github.com/coder/coder/cli/clitest"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/codersdk/agentsdk"
"github.com/coder/coder/provisioner/echo"
"github.com/coder/coder/provisionersdk/proto"
"github.com/coder/coder/pty/ptytest"
"github.com/coder/coder/scaletest/harness"
"github.com/coder/coder/testutil"
@ -205,70 +200,28 @@ param3: 1
})
}
// This test pretends to stand up a workspace and run a no-op traffic generation test.
// It's not a real test, but it's useful for debugging.
// We do not perform any cleanup.
// This test just validates that the CLI command accepts its known arguments.
// A more comprehensive test is performed in workspacetraffic/run_test.go
func TestScaleTestWorkspaceTraffic(t *testing.T) {
t.Parallel()
ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.WaitMedium)
defer cancelFunc()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
authToken := uuid.NewString()
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.ProvisionComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "example",
Type: "aws_instance",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Name: "agent",
Auth: &proto.Agent_Token{
Token: authToken,
},
Apps: []*proto.App{},
}},
}},
},
},
}},
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) {
cwr.Name = "scaletest-test"
})
coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
agentClient := agentsdk.New(client.URL)
agentClient.SetSessionToken(authToken)
agentCloser := agent.New(agent.Options{
Client: agentClient,
})
t.Cleanup(func() {
_ = agentCloser.Close()
})
coderdtest.AwaitWorkspaceAgents(t, client, ws.ID)
client := coderdtest.New(t, nil)
_ = coderdtest.CreateFirstUser(t, client)
inv, root := clitest.New(t, "scaletest", "workspace-traffic",
"--timeout", "1s",
"--bytes-per-tick", "1024",
"--tick-interval", "100ms",
"--scaletest-prometheus-address", "127.0.0.1:0",
"--scaletest-prometheus-wait", "0s",
)
clitest.SetupConfig(t, client, root)
var stdout, stderr bytes.Buffer
inv.Stdout = &stdout
inv.Stderr = &stderr
err := inv.WithContext(ctx).Run()
require.NoError(t, err)
require.Contains(t, stdout.String(), "Pass: 1")
require.ErrorContains(t, err, "no scaletest workspaces exist")
}

View File

@ -27,6 +27,13 @@ Generate traffic to scaletest workspaces through coderd
Output format specs in the format "<format>[:<path>]". Not specifying
a path will default to stdout. Available formats: text, json.
--scaletest-prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:21112)
Address on which to expose scaletest Prometheus metrics.
--scaletest-prometheus-wait duration, $CODER_SCALETEST_PROMETHEUS_WAIT (default: 5s)
How long to wait before exiting in order to allow Prometheus metrics
to be scraped.
--tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms)
How often to send traffic.
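Putting the new flags together, an invocation that exposes metrics on the default address and lingers long enough for a scrape might look like the sketch below; all flags appear in this change or the existing scaletest flag set, and the values are purely illustrative:

```bash
# Example only: values are illustrative, not recommendations.
coder scaletest workspace-traffic \
  --bytes-per-tick 2048 \
  --tick-interval 100ms \
  --timeout 30m \
  --scaletest-prometheus-address 0.0.0.0:21112 \
  --scaletest-prometheus-wait 15s
```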

View File

@ -82,6 +82,26 @@ Timeout per job. Jobs may take longer to complete under higher concurrency limit
Output format specs in the format "<format>[:<path>]". Not specifying a path will default to stdout. Available formats: text, json.
### --scaletest-prometheus-address
| | |
| ----------- | ------------------------------------------------ |
| Type | <code>string</code> |
| Environment | <code>$CODER_SCALETEST_PROMETHEUS_ADDRESS</code> |
| Default | <code>0.0.0.0:21112</code> |
Address on which to expose scaletest Prometheus metrics.
### --scaletest-prometheus-wait
| | |
| ----------- | --------------------------------------------- |
| Type | <code>duration</code> |
| Environment | <code>$CODER_SCALETEST_PROMETHEUS_WAIT</code> |
| Default | <code>5s</code> |
How long to wait before exiting in order to allow Prometheus metrics to be scraped.
### --tick-interval
| | |

View File

@ -32,9 +32,12 @@ project_id = "some_google_project_id"
1. Run `coder_init.sh <coder_url>` to setup an initial user and a pre-configured Kubernetes
template. It will also download the Coder CLI from the Coder instance locally.
1. Do whatever you need to do with the Coder instance.
1. Do whatever you need to do with the Coder instance:
> To run Coder commands against the instance, you can use `coder_shim.sh <command>`.
> Note: To run Coder commands against the instance, you can use `coder_shim.sh <command>`.
> You don't need to run `coder login` yourself.
- To create workspaces, run `./coder_shim.sh scaletest create-workspaces --template="kubernetes" --count=N`
- To generate workspace traffic, run `./coder_trafficgen.sh <name of loadtest from your Terraform vars>`. This will keep running until you delete the pod `coder-scaletest-workspace-traffic`.
1. When you are finished, you can run `terraform destroy -var-file=override.tfvars`.
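For the traffic-generation step above, a concrete sequence might look like this sketch; it assumes a hypothetical loadtest named `alpha` and uses the pod, namespace, and kubeconfig names created by `coder_trafficgen.sh`:

```bash
# Sketch: generate traffic for a hypothetical loadtest named "alpha".
./coder_trafficgen.sh alpha

# Follow the traffic generator's logs, then delete the pod when finished.
export KUBECONFIG="${PWD}/.coderv2/alpha-cluster.kubeconfig"
kubectl --namespace coder-alpha logs --follow coder-scaletest-workspace-traffic
kubectl --namespace coder-alpha delete pod coder-scaletest-workspace-traffic
```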

View File

@ -128,34 +128,6 @@ EOF
]
}
resource "local_file" "coder-monitoring-manifest" {
filename = "${path.module}/.coderv2/coder-monitoring.yaml"
content = <<EOF
apiVersion: monitoring.googleapis.com/v1
kind: PodMonitoring
metadata:
namespace: ${kubernetes_namespace.coder_namespace.metadata.0.name}
name: coder-monitoring
spec:
selector:
matchLabels:
app.kubernetes.io/name: coder
endpoints:
- port: prometheus-http
interval: 30s
EOF
}
resource "null_resource" "coder-monitoring-manifest_apply" {
provisioner "local-exec" {
working_dir = "${abspath(path.module)}/.coderv2"
command = <<EOF
KUBECONFIG=${var.name}-cluster.kubeconfig gcloud container clusters get-credentials ${var.name}-cluster --project=${var.project_id} --zone=${var.zone} && \
KUBECONFIG=${var.name}-cluster.kubeconfig kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
EOF
}
}
resource "local_file" "kubernetes_template" {
filename = "${path.module}/.coderv2/templates/kubernetes/main.tf"
content = <<EOF

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
set -euo pipefail
if [[ $# -lt 1 ]]; then
echo "Usage: $0 <loadtest name>"
exit 1
fi
# Allow toggling verbose output
[[ -n ${VERBOSE:-} ]] && set -x
LOADTEST_NAME="$1"
CODER_TOKEN=$(./coder_shim.sh tokens create)
CODER_URL="http://coder.coder-${LOADTEST_NAME}.svc.cluster.local"
export KUBECONFIG="${PWD}/.coderv2/${LOADTEST_NAME}-cluster.kubeconfig"
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: coder-scaletest-workspace-traffic
namespace: coder-${LOADTEST_NAME}
labels:
app.kubernetes.io/name: coder-scaletest-workspace-traffic
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- ${LOADTEST_NAME}-misc
containers:
- command:
- sh
- -c
- "curl -fsSL $CODER_URL/bin/coder-linux-amd64 -o /tmp/coder && chmod +x /tmp/coder && /tmp/coder --url=$CODER_URL --token=$CODER_TOKEN scaletest workspace-traffic"
env:
- name: CODER_URL
value: $CODER_URL
- name: CODER_TOKEN
value: $CODER_TOKEN
- name: CODER_SCALETEST_PROMETHEUS_ADDRESS
value: "0.0.0.0:21112"
- name: CODER_SCALETEST_JOB_TIMEOUT
value: "30m"
- name: CODER_SCALETEST_CONCURRENCY
value: "0"
- name: CODER_SCALETEST_WORKSPACE_TRAFFIC_BYTES_PER_TICK
value: "2048"
ports:
- containerPort: 21112
name: prometheus-http
protocol: TCP
name: cli
image: docker.io/codercom/enterprise-minimal:ubuntu
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
namespace: coder-${LOADTEST_NAME}
name: coder-workspacetraffic-monitoring
spec:
selector:
matchLabels:
app.kubernetes.io/name: coder-scaletest-workspace-traffic
podMetricsEndpoints:
- port: prometheus-http
interval: 15s
EOF

View File

@ -26,7 +26,7 @@ resource "google_container_cluster" "primary" {
monitoring_config {
enable_components = ["SYSTEM_COMPONENTS"]
managed_prometheus {
enabled = true
enabled = false
}
}
workload_identity_config {

View File

@ -0,0 +1,132 @@
locals {
prometheus_helm_repo = "https://charts.bitnami.com/bitnami"
prometheus_helm_chart = "kube-prometheus"
prometheus_helm_version = null // just use latest
prometheus_release_name = "prometheus"
prometheus_namespace = "prometheus"
prometheus_remote_write_enabled = var.prometheus_remote_write_password != ""
}
# Create a namespace to hold our Prometheus deployment.
resource "kubernetes_namespace" "prometheus_namespace" {
metadata {
name = local.prometheus_namespace
}
depends_on = [
google_container_node_pool.misc
]
}
# Create a secret to store the remote write key
resource "kubernetes_secret" "prometheus-credentials" {
count = local.prometheus_remote_write_enabled ? 1 : 0
type = "kubernetes.io/basic-auth"
metadata {
name = "prometheus-credentials"
namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
}
data = {
username = var.prometheus_remote_write_user
password = var.prometheus_remote_write_password
}
}
# Install Prometheus using the Bitnami Prometheus helm chart.
resource "helm_release" "prometheus-chart" {
repository = local.prometheus_helm_repo
chart = local.prometheus_helm_chart
name = local.prometheus_release_name
version = local.prometheus_helm_version
namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
values = [<<EOF
alertmanager:
enabled: false
blackboxExporter:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "cloud.google.com/gke-nodepool"
operator: "In"
values: ["${google_container_node_pool.misc.name}"]
operator:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "cloud.google.com/gke-nodepool"
operator: "In"
values: ["${google_container_node_pool.misc.name}"]
prometheus:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "cloud.google.com/gke-nodepool"
operator: "In"
values: ["${google_container_node_pool.misc.name}"]
externalLabels:
cluster: "${google_container_cluster.primary.name}"
persistence:
enabled: true
storageClass: standard
%{if local.prometheus_remote_write_enabled~}
remoteWrite:
- url: "${var.prometheus_remote_write_url}"
basicAuth:
username:
name: "${kubernetes_secret.prometheus-credentials[0].metadata[0].name}"
key: username
password:
name: "${kubernetes_secret.prometheus-credentials[0].metadata[0].name}"
key: password
tlsConfig:
insecureSkipVerify: ${var.prometheus_remote_write_insecure_skip_verify}
writeRelabelConfigs:
- sourceLabels: [__name__]
regex: "${var.prometheus_remote_write_metrics_regex}"
action: keep
metadataConfig:
sendInterval: "${var.prometheus_remote_write_send_interval}"
%{endif~}
EOF
]
}
# NOTE: this is created as a local file before being applied
# as the kubernetes_manifest resource needs to be run separately
# after creating a cluster, and we want this to be brought up
# with a single command.
resource "local_file" "coder-monitoring-manifest" {
filename = "${path.module}/.coderv2/coder-monitoring.yaml"
depends_on = [helm_release.prometheus-chart]
content = <<EOF
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
namespace: ${kubernetes_namespace.coder_namespace.metadata.0.name}
name: coder-monitoring
spec:
selector:
matchLabels:
app.kubernetes.io/name: coder
podMetricsEndpoints:
- port: prometheus-http
interval: 30s
EOF
}
resource "null_resource" "coder-monitoring-manifest_apply" {
provisioner "local-exec" {
working_dir = "${abspath(path.module)}/.coderv2"
command = <<EOF
KUBECONFIG=${var.name}-cluster.kubeconfig gcloud container clusters get-credentials ${google_container_cluster.primary.name} --project=${var.project_id} --zone=${var.zone} && \
KUBECONFIG=${var.name}-cluster.kubeconfig kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
EOF
}
depends_on = [helm_release.prometheus-chart]
}

View File

@ -127,3 +127,33 @@ variable "workspace_image" {
description = "Image and tag to use for workspaces."
default = "docker.io/codercom/enterprise-minimal:ubuntu"
}
variable "prometheus_remote_write_user" {
description = "Username for Prometheus remote write."
default = ""
}
variable "prometheus_remote_write_password" {
description = "Password for Prometheus remote write."
default = ""
}
variable "prometheus_remote_write_url" {
description = "URL for Prometheus remote write. Defaults to stats.dev.c8s.io"
default = "https://stats.dev.c8s.io:9443/api/v1/write"
}
variable "prometheus_remote_write_insecure_skip_verify" {
description = "Skip TLS verification for Prometheus remote write."
default = true
}
variable "prometheus_remote_write_metrics_regex" {
description = "Allowlist regex of metrics for Prometheus remote write."
default = ".*"
}
variable "prometheus_remote_write_send_interval" {
description = "Prometheus remote write interval."
default = "15s"
}

View File

@ -4,6 +4,7 @@ import (
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
)
@ -11,6 +12,15 @@ type Config struct {
// AgentID is the workspace agent ID to which to connect.
AgentID uuid.UUID `json:"agent_id"`
// AgentName is the name of the agent. Used for metrics.
AgentName string `json:"agent_name"`
// WorkspaceName is the name of the workspace. Used for metrics.
WorkspaceName string `json:"workspace_name"`
// WorkspaceOwner is the owner of the workspace. Used for metrics.
WorkspaceOwner string `json:"workspace_owner"`
// BytesPerTick is the number of bytes to send to the agent per tick.
BytesPerTick int64 `json:"bytes_per_tick"`
@ -20,6 +30,9 @@ type Config struct {
// TickInterval specifies the interval between ticks (that is, attempts to
// send data to workspace agents).
TickInterval time.Duration `json:"tick_interval"`
// Registry is a prometheus.Registerer for logging metrics
Registry prometheus.Registerer
}
func (c Config) Validate() error {

View File

@ -0,0 +1,56 @@
package workspacetraffic
import "github.com/prometheus/client_golang/prometheus"
type Metrics struct {
BytesReadTotal prometheus.CounterVec
BytesWrittenTotal prometheus.CounterVec
ReadErrorsTotal prometheus.CounterVec
WriteErrorsTotal prometheus.CounterVec
ReadLatencySeconds prometheus.HistogramVec
WriteLatencySeconds prometheus.HistogramVec
LabelNames []string
}
func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics {
m := &Metrics{
BytesReadTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coderd",
Subsystem: "scaletest",
Name: "bytes_read_total",
}, labelNames),
BytesWrittenTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coderd",
Subsystem: "scaletest",
Name: "bytes_written_total",
}, labelNames),
ReadErrorsTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coderd",
Subsystem: "scaletest",
Name: "read_errors_total",
}, labelNames),
WriteErrorsTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coderd",
Subsystem: "scaletest",
Name: "write_errors_total",
}, labelNames),
ReadLatencySeconds: *prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "scaletest",
Name: "read_latency_seconds",
}, labelNames),
WriteLatencySeconds: *prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "scaletest",
Name: "write_latency_seconds",
}, labelNames),
}
reg.MustRegister(m.BytesReadTotal)
reg.MustRegister(m.BytesWrittenTotal)
reg.MustRegister(m.ReadErrorsTotal)
reg.MustRegister(m.WriteErrorsTotal)
reg.MustRegister(m.ReadLatencySeconds)
reg.MustRegister(m.WriteLatencySeconds)
return m
}

View File

@ -3,8 +3,8 @@ package workspacetraffic
import (
"context"
"encoding/json"
"errors"
"io"
"sync/atomic"
"time"
"github.com/google/uuid"
@ -19,11 +19,14 @@ import (
"github.com/coder/coder/cryptorand"
"github.com/coder/coder/scaletest/harness"
"github.com/coder/coder/scaletest/loadtestutil"
promtest "github.com/prometheus/client_golang/prometheus/testutil"
)
type Runner struct {
client *codersdk.Client
cfg Config
client *codersdk.Client
cfg Config
metrics *Metrics
}
var (
@ -31,10 +34,11 @@ var (
_ harness.Cleanable = &Runner{}
)
func NewRunner(client *codersdk.Client, cfg Config) *Runner {
func NewRunner(client *codersdk.Client, cfg Config, metrics *Metrics) *Runner {
return &Runner{
client: client,
cfg: cfg,
client: client,
cfg: cfg,
metrics: metrics,
}
}
@ -47,6 +51,16 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error {
r.client.Logger = logger
r.client.LogBodies = true
// Initialize our metrics eagerly. This is mainly so that we can test for the
// presence of a zero-valued metric as opposed to the absence of a metric.
lvs := []string{r.cfg.WorkspaceOwner, r.cfg.WorkspaceName, r.cfg.AgentName}
r.metrics.BytesReadTotal.WithLabelValues(lvs...).Add(0)
r.metrics.BytesWrittenTotal.WithLabelValues(lvs...).Add(0)
r.metrics.ReadErrorsTotal.WithLabelValues(lvs...).Add(0)
r.metrics.WriteErrorsTotal.WithLabelValues(lvs...).Add(0)
r.metrics.ReadLatencySeconds.WithLabelValues(lvs...).Observe(0)
r.metrics.WriteLatencySeconds.WithLabelValues(lvs...).Observe(0)
var (
agentID = r.cfg.AgentID
reconnect = uuid.New()
@ -90,7 +104,7 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error {
}()
// Wrap the conn in a countReadWriter so we can monitor bytes sent/rcvd.
crw := countReadWriter{ReadWriter: conn}
crw := countReadWriter{ReadWriter: conn, metrics: r.metrics, labels: lvs}
// Create a ticker for sending data to the PTY.
tick := time.NewTicker(tickInterval)
@ -131,11 +145,12 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error {
}
duration := time.Since(start)
logger.Info(ctx, "results",
logger.Info(ctx, "Test Results",
slog.F("duration", duration),
slog.F("sent", crw.BytesWritten()),
slog.F("rcvd", crw.BytesRead()),
slog.F("bytes_read_total", promtest.ToFloat64(r.metrics.BytesReadTotal)),
slog.F("bytes_written_total", promtest.ToFloat64(r.metrics.BytesWrittenTotal)),
slog.F("read_errors_total", promtest.ToFloat64(r.metrics.ReadErrorsTotal)),
slog.F("write_errors_total", promtest.ToFloat64(r.metrics.WriteErrorsTotal)),
)
return nil
@ -184,34 +199,36 @@ func writeRandomData(dst io.Writer, size int64, tick <-chan time.Time) error {
// countReadWriter wraps an io.ReadWriter and counts the number of bytes read and written.
type countReadWriter struct {
io.ReadWriter
bytesRead atomic.Int64
bytesWritten atomic.Int64
metrics *Metrics
labels []string
}
func (w *countReadWriter) Read(p []byte) (int, error) {
start := time.Now()
n, err := w.ReadWriter.Read(p)
if err == nil {
w.bytesRead.Add(int64(n))
if reportableErr(err) {
w.metrics.ReadErrorsTotal.WithLabelValues(w.labels...).Inc()
}
w.metrics.ReadLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds())
if n > 0 {
w.metrics.BytesReadTotal.WithLabelValues(w.labels...).Add(float64(n))
}
return n, err
}
func (w *countReadWriter) Write(p []byte) (int, error) {
start := time.Now()
n, err := w.ReadWriter.Write(p)
if err == nil {
w.bytesWritten.Add(int64(n))
if reportableErr(err) {
w.metrics.WriteErrorsTotal.WithLabelValues(w.labels...).Inc()
}
w.metrics.WriteLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds())
if n > 0 {
w.metrics.BytesWrittenTotal.WithLabelValues(w.labels...).Add(float64(n))
}
return n, err
}
func (w *countReadWriter) BytesRead() int64 {
return w.bytesRead.Load()
}
func (w *countReadWriter) BytesWritten() int64 {
return w.bytesWritten.Load()
}
func mustRandStr(l int64) string {
if l < 1 {
l = 1
@ -222,3 +239,19 @@ func mustRandStr(l int64) string {
}
return randStr
}
// some errors we want to report in metrics; others we want to ignore
// such as websocket.StatusNormalClosure or context.Canceled
func reportableErr(err error) bool {
if err == nil {
return false
}
if xerrors.Is(err, context.Canceled) {
return false
}
var wsErr websocket.CloseError
if errors.As(err, &wsErr) {
return wsErr.Code != websocket.StatusNormalClosure
}
return false
}

View File

@ -0,0 +1,177 @@
package workspacetraffic_test
import (
"context"
"strings"
"testing"
"time"
"github.com/coder/coder/agent"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/codersdk/agentsdk"
"github.com/coder/coder/provisioner/echo"
"github.com/coder/coder/provisionersdk/proto"
"github.com/coder/coder/scaletest/workspacetraffic"
"github.com/coder/coder/testutil"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRun(t *testing.T) {
t.Parallel()
// We need to stand up an in-memory coderd and run a fake workspace.
var (
client = coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
firstUser = coderdtest.CreateFirstUser(t, client)
authToken = uuid.NewString()
agentName = "agent"
version = coderdtest.CreateTemplateVersion(t, client, firstUser.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.ProvisionComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "example",
Type: "aws_instance",
Agents: []*proto.Agent{{
// Agent ID gets generated no matter what we say ¯\_(ツ)_/¯
Name: agentName,
Auth: &proto.Agent_Token{
Token: authToken,
},
Apps: []*proto.App{},
}},
}},
},
},
}},
})
template = coderdtest.CreateTemplate(t, client, firstUser.OrganizationID, version.ID)
_ = coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
// In order to be picked up as a scaletest workspace, the workspace must be named specifically
ws = coderdtest.CreateWorkspace(t, client, firstUser.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) {
cwr.Name = "scaletest-test"
})
_ = coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
)
// We also need a running agent to run this test.
agentClient := agentsdk.New(client.URL)
agentClient.SetSessionToken(authToken)
agentCloser := agent.New(agent.Options{
Client: agentClient,
})
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
t.Cleanup(func() {
_ = agentCloser.Close()
})
// We actually need to know the full user and not just the UserID / OrgID
user, err := client.User(ctx, firstUser.UserID.String())
require.NoError(t, err, "get first user")
// Make sure the agent is connected before we go any further.
resources := coderdtest.AwaitWorkspaceAgents(t, client, ws.ID)
var agentID uuid.UUID
for _, res := range resources {
for _, agt := range res.Agents {
agentID = agt.ID
}
}
require.NotEqual(t, uuid.Nil, agentID, "did not expect agentID to be nil")
// Now we can start the runner.
var (
bytesPerTick = 1024
tickInterval = 1000 * time.Millisecond
cancelAfter = 1500 * time.Millisecond
fudgeWrite = 12 // The ReconnectingPTY payload incurs some overhead
)
reg := prometheus.NewRegistry()
metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name")
runner := workspacetraffic.NewRunner(client, workspacetraffic.Config{
AgentID: agentID,
AgentName: agentName,
WorkspaceName: ws.Name,
WorkspaceOwner: ws.OwnerName,
BytesPerTick: int64(bytesPerTick),
TickInterval: tickInterval,
Duration: testutil.WaitLong,
Registry: reg,
}, metrics)
var logs strings.Builder
// Stop the test after one 'tick'. This will cause an EOF.
go func() {
<-time.After(cancelAfter)
cancel()
}()
require.NoError(t, runner.Run(ctx, "", &logs), "unexpected error calling Run()")
// We want to ensure the metrics are somewhat accurate.
lvs := []string{user.Username, ws.Name, agentName}
assert.InDelta(t, bytesPerTick+fudgeWrite, toFloat64(t, metrics.BytesWrittenTotal.WithLabelValues(lvs...)), 0.1)
// Read is highly variable, depending on how far we read before stopping.
// Just ensure it's not zero.
assert.NotZero(t, bytesPerTick, toFloat64(t, metrics.BytesReadTotal.WithLabelValues(lvs...)))
// Latency should report non-zero values.
assert.NotZero(t, toFloat64(t, metrics.ReadLatencySeconds))
assert.NotZero(t, toFloat64(t, metrics.WriteLatencySeconds))
// Should not report any errors!
assert.Zero(t, toFloat64(t, metrics.ReadErrorsTotal.WithLabelValues(lvs...)))
assert.Zero(t, toFloat64(t, metrics.ReadErrorsTotal.WithLabelValues(lvs...)))
}
// toFloat64 version of Prometheus' testutil.ToFloat64 that integrates with
// github.com/stretchr/testify/require and handles histograms (somewhat)
func toFloat64(t testing.TB, c prometheus.Collector) float64 {
var (
m prometheus.Metric
mCount int
mChan = make(chan prometheus.Metric)
done = make(chan struct{})
)
go func() {
for m = range mChan {
mCount++
}
close(done)
}()
c.Collect(mChan)
close(mChan)
<-done
require.Equal(t, 1, mCount, "expected exactly 1 metric but got %d", mCount)
pb := &dto.Metric{}
require.NoError(t, m.Write(pb), "unexpected error collecting metrics")
if pb.Gauge != nil {
return pb.Gauge.GetValue()
}
if pb.Counter != nil {
return pb.Counter.GetValue()
}
if pb.Untyped != nil {
return pb.Untyped.GetValue()
}
if pb.Histogram != nil {
// If no samples, just return zero.
if pb.Histogram.GetSampleCount() == 0 {
return 0
}
// Average is sufficient for testing purposes.
return pb.Histogram.GetSampleSum() / pb.Histogram.GetSampleCountFloat()
}
require.Fail(t, "collected a non-gauge/counter/untyped/histogram metric: %s", pb)
return 0
}

View File

@ -62,6 +62,7 @@ stats/
.././scaletest/terraform/.terraform
.././scaletest/terraform/.terraform.lock.hcl
terraform.tfstate.*
**/*.tfvars
# .prettierignore.include:
# Helm templates contain variables that are invalid YAML and can't be formatted
# by Prettier.

View File

@ -62,6 +62,7 @@ stats/
.././scaletest/terraform/.terraform
.././scaletest/terraform/.terraform.lock.hcl
terraform.tfstate.*
**/*.tfvars
# .prettierignore.include:
# Helm templates contain variables that are invalid YAML and can't be formatted
# by Prettier.