diff --git a/.gitignore b/.gitignore index 69b58c4cee..29b297a9e4 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,4 @@ site/stats/ ./scaletest/terraform/.terraform ./scaletest/terraform/.terraform.lock.hcl terraform.tfstate.* +**/*.tfvars diff --git a/.prettierignore b/.prettierignore index cc4a83b023..d96e9df947 100644 --- a/.prettierignore +++ b/.prettierignore @@ -62,6 +62,7 @@ site/stats/ ./scaletest/terraform/.terraform ./scaletest/terraform/.terraform.lock.hcl terraform.tfstate.* +**/*.tfvars # .prettierignore.include: # Helm templates contain variables that are invalid YAML and can't be formatted # by Prettier. diff --git a/cli/scaletest.go b/cli/scaletest.go index 67186da221..0977ab0f70 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -14,9 +14,14 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/otel/trace" "golang.org/x/xerrors" + "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/cli/clibase" "github.com/coder/coder/cli/cliui" "github.com/coder/coder/coderd/httpapi" @@ -896,8 +901,11 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd { func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { var ( - tickInterval time.Duration - bytesPerTick int64 + tickInterval time.Duration + bytesPerTick int64 + scaletestPrometheusAddress string + scaletestPrometheusWait time.Duration + client = &codersdk.Client{} tracingFlags = &scaletestTracingFlags{} strategy = &scaletestStrategyFlags{} @@ -913,6 +921,12 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { ), Handler: func(inv *clibase.Invocation) error { ctx := inv.Context() + reg := prometheus.NewRegistry() + metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name") + + logger := slog.Make(sloghuman.Sink(io.Discard)) + prometheusSrvClose := ServeHandler(ctx, logger, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), scaletestPrometheusAddress, "prometheus") + defer prometheusSrvClose() // Bypass rate limiting client.HTTPClient = &http.Client{ @@ -943,6 +957,9 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { _, _ = fmt.Fprintln(inv.Stderr, "\nUploading traces...") if err := closeTracing(ctx); err != nil { _, _ = fmt.Fprintf(inv.Stderr, "\nError uploading traces: %+v\n", err) + // Wait for prometheus metrics to be scraped + _, _ = fmt.Fprintf(inv.Stderr, "Waiting %s for prometheus metrics to be scraped\n", scaletestPrometheusWait) + <-time.After(scaletestPrometheusWait) } }() tracer := tracerProvider.Tracer(scaletestTracerName) @@ -955,9 +972,10 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { th := harness.NewTestHarness(strategy.toStrategy(), cleanupStrategy.toStrategy()) for idx, ws := range workspaces { var ( - agentID uuid.UUID - name = "workspace-traffic" - id = strconv.Itoa(idx) + agentID uuid.UUID + agentName string + name = "workspace-traffic" + id = strconv.Itoa(idx) ) for _, res := range ws.LatestBuild.Resources { @@ -965,6 +983,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { continue } agentID = res.Agents[0].ID + agentName = res.Agents[0].Name } if agentID == uuid.Nil { @@ -974,16 +993,20 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { // Setup our workspace agent connection. 
config := workspacetraffic.Config{ - AgentID: agentID, - BytesPerTick: bytesPerTick, - Duration: strategy.timeout, - TickInterval: tickInterval, + AgentID: agentID, + AgentName: agentName, + BytesPerTick: bytesPerTick, + Duration: strategy.timeout, + TickInterval: tickInterval, + WorkspaceName: ws.Name, + WorkspaceOwner: ws.OwnerName, + Registry: reg, } if err := config.Validate(); err != nil { return xerrors.Errorf("validate config: %w", err) } - var runner harness.Runnable = workspacetraffic.NewRunner(client, config) + var runner harness.Runnable = workspacetraffic.NewRunner(client, config, metrics) if tracingEnabled { runner = &runnableTraceWrapper{ tracer: tracer, @@ -1034,6 +1057,20 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { Description: "How often to send traffic.", Value: clibase.DurationOf(&tickInterval), }, + { + Flag: "scaletest-prometheus-address", + Env: "CODER_SCALETEST_PROMETHEUS_ADDRESS", + Default: "0.0.0.0:21112", + Description: "Address on which to expose scaletest Prometheus metrics.", + Value: clibase.StringOf(&scaletestPrometheusAddress), + }, + { + Flag: "scaletest-prometheus-wait", + Env: "CODER_SCALETEST_PROMETHEUS_WAIT", + Default: "5s", + Description: "How long to wait before exiting in order to allow Prometheus metrics to be scraped.", + Value: clibase.DurationOf(&scaletestPrometheusWait), + }, } tracingFlags.attach(&cmd.Options) diff --git a/cli/scaletest_test.go b/cli/scaletest_test.go index b026e7636b..b1473b64c9 100644 --- a/cli/scaletest_test.go +++ b/cli/scaletest_test.go @@ -8,17 +8,12 @@ import ( "path/filepath" "testing" - "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/coder/coder/agent" "github.com/coder/coder/cli/clitest" "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/codersdk" - "github.com/coder/coder/codersdk/agentsdk" - "github.com/coder/coder/provisioner/echo" - "github.com/coder/coder/provisionersdk/proto" "github.com/coder/coder/pty/ptytest" "github.com/coder/coder/scaletest/harness" "github.com/coder/coder/testutil" @@ -205,70 +200,28 @@ param3: 1 }) } -// This test pretends to stand up a workspace and run a no-op traffic generation test. -// It's not a real test, but it's useful for debugging. -// We do not perform any cleanup. +// This test just validates that the CLI command accepts its known arguments. 
+// A more comprehensive test is performed in workspacetraffic/run_test.go func TestScaleTestWorkspaceTraffic(t *testing.T) { t.Parallel() ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.WaitMedium) defer cancelFunc() - client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) - user := coderdtest.CreateFirstUser(t, client) - - authToken := uuid.NewString() - version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ - Parse: echo.ParseComplete, - ProvisionPlan: echo.ProvisionComplete, - ProvisionApply: []*proto.Provision_Response{{ - Type: &proto.Provision_Response_Complete{ - Complete: &proto.Provision_Complete{ - Resources: []*proto.Resource{{ - Name: "example", - Type: "aws_instance", - Agents: []*proto.Agent{{ - Id: uuid.NewString(), - Name: "agent", - Auth: &proto.Agent_Token{ - Token: authToken, - }, - Apps: []*proto.App{}, - }}, - }}, - }, - }, - }}, - }) - template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) - coderdtest.AwaitTemplateVersionJob(t, client, version.ID) - - ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { - cwr.Name = "scaletest-test" - }) - coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID) - - agentClient := agentsdk.New(client.URL) - agentClient.SetSessionToken(authToken) - agentCloser := agent.New(agent.Options{ - Client: agentClient, - }) - t.Cleanup(func() { - _ = agentCloser.Close() - }) - - coderdtest.AwaitWorkspaceAgents(t, client, ws.ID) + client := coderdtest.New(t, nil) + _ = coderdtest.CreateFirstUser(t, client) inv, root := clitest.New(t, "scaletest", "workspace-traffic", "--timeout", "1s", "--bytes-per-tick", "1024", "--tick-interval", "100ms", + "--scaletest-prometheus-address", "127.0.0.1:0", + "--scaletest-prometheus-wait", "0s", ) clitest.SetupConfig(t, client, root) var stdout, stderr bytes.Buffer inv.Stdout = &stdout inv.Stderr = &stderr err := inv.WithContext(ctx).Run() - require.NoError(t, err) - require.Contains(t, stdout.String(), "Pass: 1") + require.ErrorContains(t, err, "no scaletest workspaces exist") } diff --git a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden index b7de6ca960..04f7688937 100644 --- a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden +++ b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden @@ -27,6 +27,13 @@ Generate traffic to scaletest workspaces through coderd Output format specs in the format "[:]". Not specifying a path will default to stdout. Available formats: text, json. + --scaletest-prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:21112) + Address on which to expose scaletest Prometheus metrics. + + --scaletest-prometheus-wait duration, $CODER_SCALETEST_PROMETHEUS_WAIT (default: 5s) + How long to wait before exiting in order to allow Prometheus metrics + to be scraped. + --tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms) How often to send traffic. diff --git a/docs/cli/scaletest_workspace-traffic.md b/docs/cli/scaletest_workspace-traffic.md index 5303847345..399885f0c1 100644 --- a/docs/cli/scaletest_workspace-traffic.md +++ b/docs/cli/scaletest_workspace-traffic.md @@ -82,6 +82,26 @@ Timeout per job. Jobs may take longer to complete under higher concurrency limit Output format specs in the format "[:]". Not specifying a path will default to stdout. 
Available formats: text, json. +### --scaletest-prometheus-address + +| | | +| ----------- | ------------------------------------------------ | +| Type | string | +| Environment | $CODER_SCALETEST_PROMETHEUS_ADDRESS | +| Default | 0.0.0.0:21112 | + +Address on which to expose scaletest Prometheus metrics. + +### --scaletest-prometheus-wait + +| | | +| ----------- | --------------------------------------------- | +| Type | duration | +| Environment | $CODER_SCALETEST_PROMETHEUS_WAIT | +| Default | 5s | + +How long to wait before exiting in order to allow Prometheus metrics to be scraped. + ### --tick-interval | | | diff --git a/scaletest/terraform/README.md b/scaletest/terraform/README.md index f5a2bc376d..3933c6f8c4 100644 --- a/scaletest/terraform/README.md +++ b/scaletest/terraform/README.md @@ -32,9 +32,12 @@ project_id = "some_google_project_id" 1. Run `coder_init.sh ` to setup an initial user and a pre-configured Kubernetes template. It will also download the Coder CLI from the Coder instance locally. -1. Do whatever you need to do with the Coder instance. +1. Do whatever you need to do with the Coder instance: - > To run Coder commands against the instance, you can use `coder_shim.sh `. + > Note: To run Coder commands against the instance, you can use `coder_shim.sh `. > You don't need to run `coder login` yourself. + - To create workspaces, run `./coder_shim.sh scaletest create-workspaces --template="kubernetes" --count=N` + - To generate workspace traffic, run `./coder_trafficgen.sh `. This will keep running until you delete the pod `coder-scaletest-workspace-traffic`. + 1. When you are finished, you can run `terraform destroy -var-file=override.tfvars`. diff --git a/scaletest/terraform/coder.tf b/scaletest/terraform/coder.tf index d86aa2a7fe..2486f753f7 100644 --- a/scaletest/terraform/coder.tf +++ b/scaletest/terraform/coder.tf @@ -128,34 +128,6 @@ EOF ] } -resource "local_file" "coder-monitoring-manifest" { - filename = "${path.module}/.coderv2/coder-monitoring.yaml" - content = <" + exit 1 +fi + +# Allow toggling verbose output +[[ -n ${VERBOSE:-} ]] && set -x + +LOADTEST_NAME="$1" +CODER_TOKEN=$(./coder_shim.sh tokens create) +CODER_URL="http://coder.coder-${LOADTEST_NAME}.svc.cluster.local" +export KUBECONFIG="${PWD}/.coderv2/${LOADTEST_NAME}-cluster.kubeconfig" + +cat < 0 { + w.metrics.BytesReadTotal.WithLabelValues(w.labels...).Add(float64(n)) } return n, err } func (w *countReadWriter) Write(p []byte) (int, error) { + start := time.Now() n, err := w.ReadWriter.Write(p) - if err == nil { - w.bytesWritten.Add(int64(n)) + if reportableErr(err) { + w.metrics.WriteErrorsTotal.WithLabelValues(w.labels...).Inc() + } + w.metrics.WriteLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) + if n > 0 { + w.metrics.BytesWrittenTotal.WithLabelValues(w.labels...).Add(float64(n)) } return n, err } -func (w *countReadWriter) BytesRead() int64 { - return w.bytesRead.Load() -} - -func (w *countReadWriter) BytesWritten() int64 { - return w.bytesWritten.Load() -} - func mustRandStr(l int64) string { if l < 1 { l = 1 @@ -222,3 +239,19 @@ func mustRandStr(l int64) string { } return randStr } + +// some errors we want to report in metrics; others we want to ignore +// such as websocket.StatusNormalClosure or context.Canceled +func reportableErr(err error) bool { + if err == nil { + return false + } + if xerrors.Is(err, context.Canceled) { + return false + } + var wsErr websocket.CloseError + if errors.As(err, &wsErr) { + return wsErr.Code != 
websocket.StatusNormalClosure + } + return false +} diff --git a/scaletest/workspacetraffic/run_test.go b/scaletest/workspacetraffic/run_test.go new file mode 100644 index 0000000000..e53d408bcd --- /dev/null +++ b/scaletest/workspacetraffic/run_test.go @@ -0,0 +1,177 @@ +package workspacetraffic_test + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/coder/coder/agent" + "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/codersdk" + "github.com/coder/coder/codersdk/agentsdk" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/scaletest/workspacetraffic" + "github.com/coder/coder/testutil" + + "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRun(t *testing.T) { + t.Parallel() + + // We need to stand up an in-memory coderd and run a fake workspace. + var ( + client = coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + firstUser = coderdtest.CreateFirstUser(t, client) + authToken = uuid.NewString() + agentName = "agent" + version = coderdtest.CreateTemplateVersion(t, client, firstUser.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: []*proto.Provision_Response{{ + Type: &proto.Provision_Response_Complete{ + Complete: &proto.Provision_Complete{ + Resources: []*proto.Resource{{ + Name: "example", + Type: "aws_instance", + Agents: []*proto.Agent{{ + // Agent ID gets generated no matter what we say ¯\_(ツ)_/¯ + Name: agentName, + Auth: &proto.Agent_Token{ + Token: authToken, + }, + Apps: []*proto.App{}, + }}, + }}, + }, + }, + }}, + }) + template = coderdtest.CreateTemplate(t, client, firstUser.OrganizationID, version.ID) + _ = coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + // In order to be picked up as a scaletest workspace, the workspace must be named specifically + ws = coderdtest.CreateWorkspace(t, client, firstUser.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.Name = "scaletest-test" + }) + _ = coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID) + ) + + // We also need a running agent to run this test. + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(authToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + }) + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + t.Cleanup(func() { + _ = agentCloser.Close() + }) + // We actually need to know the full user and not just the UserID / OrgID + user, err := client.User(ctx, firstUser.UserID.String()) + require.NoError(t, err, "get first user") + + // Make sure the agent is connected before we go any further. + resources := coderdtest.AwaitWorkspaceAgents(t, client, ws.ID) + var agentID uuid.UUID + for _, res := range resources { + for _, agt := range res.Agents { + agentID = agt.ID + } + } + require.NotEqual(t, uuid.Nil, agentID, "did not expect agentID to be nil") + + // Now we can start the runner. 
+ var ( + bytesPerTick = 1024 + tickInterval = 1000 * time.Millisecond + cancelAfter = 1500 * time.Millisecond + fudgeWrite = 12 // The ReconnectingPTY payload incurs some overhead + ) + reg := prometheus.NewRegistry() + metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name") + runner := workspacetraffic.NewRunner(client, workspacetraffic.Config{ + AgentID: agentID, + AgentName: agentName, + WorkspaceName: ws.Name, + WorkspaceOwner: ws.OwnerName, + BytesPerTick: int64(bytesPerTick), + TickInterval: tickInterval, + Duration: testutil.WaitLong, + Registry: reg, + }, metrics) + + var logs strings.Builder + // Stop the test after one 'tick'. This will cause an EOF. + go func() { + <-time.After(cancelAfter) + cancel() + }() + require.NoError(t, runner.Run(ctx, "", &logs), "unexpected error calling Run()") + + // We want to ensure the metrics are somewhat accurate. + lvs := []string{user.Username, ws.Name, agentName} + assert.InDelta(t, bytesPerTick+fudgeWrite, toFloat64(t, metrics.BytesWrittenTotal.WithLabelValues(lvs...)), 0.1) + // Read is highly variable, depending on how far we read before stopping. + // Just ensure it's not zero. + assert.NotZero(t, toFloat64(t, metrics.BytesReadTotal.WithLabelValues(lvs...))) + // Latency should report non-zero values. + assert.NotZero(t, toFloat64(t, metrics.ReadLatencySeconds)) + assert.NotZero(t, toFloat64(t, metrics.WriteLatencySeconds)) + // Should not report any errors! + assert.Zero(t, toFloat64(t, metrics.ReadErrorsTotal.WithLabelValues(lvs...))) + assert.Zero(t, toFloat64(t, metrics.WriteErrorsTotal.WithLabelValues(lvs...))) +} + +// toFloat64 is a version of Prometheus' testutil.ToFloat64 that integrates with +// github.com/stretchr/testify/require and handles histograms (somewhat). +func toFloat64(t testing.TB, c prometheus.Collector) float64 { + var ( + m prometheus.Metric + mCount int + mChan = make(chan prometheus.Metric) + done = make(chan struct{}) + ) + + go func() { + for m = range mChan { + mCount++ + } + close(done) + }() + + c.Collect(mChan) + close(mChan) + <-done + + require.Equal(t, 1, mCount, "expected exactly 1 metric but got %d", mCount) + + pb := &dto.Metric{} + require.NoError(t, m.Write(pb), "unexpected error collecting metrics") + + if pb.Gauge != nil { + return pb.Gauge.GetValue() + } + if pb.Counter != nil { + return pb.Counter.GetValue() + } + if pb.Untyped != nil { + return pb.Untyped.GetValue() + } + if pb.Histogram != nil { + // If no samples, just return zero. + if pb.Histogram.GetSampleCount() == 0 { + return 0 + } + // Average is sufficient for testing purposes. + return pb.Histogram.GetSampleSum() / float64(pb.Histogram.GetSampleCount()) + } + require.Fail(t, "collected a non-gauge/counter/untyped/histogram metric: %s", pb) + return 0 +} diff --git a/site/.eslintignore b/site/.eslintignore index 865d1e7006..f768843a9e 100644 --- a/site/.eslintignore +++ b/site/.eslintignore @@ -62,6 +62,7 @@ stats/ .././scaletest/terraform/.terraform .././scaletest/terraform/.terraform.lock.hcl terraform.tfstate.* +**/*.tfvars # .prettierignore.include: # Helm templates contain variables that are invalid YAML and can't be formatted # by Prettier.
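Editor's note: the new `scaletest/workspacetraffic/metrics.go` file is not reproduced in this excerpt, but the rest of the diff shows how it is consumed: `NewMetrics(reg, "username", "workspace_name", "agent_name")` returns a bundle whose fields (`BytesReadTotal`, `BytesWrittenTotal`, `ReadErrorsTotal`, `WriteErrorsTotal`, `ReadLatencySeconds`, `WriteLatencySeconds`) are counter and histogram vectors keyed by those three labels. The sketch below is only a plausible shape for such a constructor, assuming `promauto` registration; the namespace, metric names, and buckets are illustrative guesses, not taken from the PR.

```go
package workspacetraffic

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Metrics mirrors the field names used by run.go and run_test.go in this
// diff. Namespace, metric names, and buckets are assumptions for
// illustration only.
type Metrics struct {
	BytesReadTotal      *prometheus.CounterVec
	BytesWrittenTotal   *prometheus.CounterVec
	ReadErrorsTotal     *prometheus.CounterVec
	WriteErrorsTotal    *prometheus.CounterVec
	ReadLatencySeconds  *prometheus.HistogramVec
	WriteLatencySeconds *prometheus.HistogramVec
}

// NewMetrics registers one collector per field on reg, partitioned by the
// given label names (the CLI passes username, workspace_name, agent_name).
func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics {
	f := promauto.With(reg)
	counter := func(name string) *prometheus.CounterVec {
		return f.NewCounterVec(prometheus.CounterOpts{
			Namespace: "coderd", Subsystem: "scaletest", Name: name,
		}, labelNames)
	}
	histogram := func(name string) *prometheus.HistogramVec {
		return f.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: "coderd", Subsystem: "scaletest", Name: name,
			Buckets: prometheus.DefBuckets,
		}, labelNames)
	}
	return &Metrics{
		BytesReadTotal:      counter("bytes_read_total"),
		BytesWrittenTotal:   counter("bytes_written_total"),
		ReadErrorsTotal:     counter("read_errors_total"),
		WriteErrorsTotal:    counter("write_errors_total"),
		ReadLatencySeconds:  histogram("read_latency_seconds"),
		WriteLatencySeconds: histogram("write_latency_seconds"),
	}
}
```

The run.go hunks above then record into these collectors from the wrapped connection, e.g. `w.metrics.BytesReadTotal.WithLabelValues(w.labels...).Add(float64(n))`.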
diff --git a/site/.prettierignore b/site/.prettierignore index 865d1e7006..f768843a9e 100644 --- a/site/.prettierignore +++ b/site/.prettierignore @@ -62,6 +62,7 @@ stats/ .././scaletest/terraform/.terraform .././scaletest/terraform/.terraform.lock.hcl terraform.tfstate.* +**/*.tfvars # .prettierignore.include: # Helm templates contain variables that are invalid YAML and can't be formatted # by Prettier.
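Editor's note: the metrics plumbing added to `cli/scaletest.go` in this diff (a dedicated registry served via `promhttp` on `--scaletest-prometheus-address`, plus a final sleep controlled by `--scaletest-prometheus-wait` so a scraper can collect the last values before the process exits) condenses to roughly the following standalone sketch. The real command serves the handler through the existing `ServeHandler` helper with a `slog` logger; the bare `http.Server`, the `runTest` placeholder, and the example counter here are stand-ins.

```go
package main

import (
	"fmt"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Dedicated registry so only scaletest metrics are exposed.
	reg := prometheus.NewRegistry()

	// Equivalent of --scaletest-prometheus-address (default 0.0.0.0:21112).
	srv := &http.Server{
		Addr:    "0.0.0.0:21112",
		Handler: promhttp.HandlerFor(reg, promhttp.HandlerOpts{}),
	}
	go func() { _ = srv.ListenAndServe() }()
	defer srv.Close()

	runTest(reg) // placeholder for the actual traffic generation

	// Equivalent of --scaletest-prometheus-wait (default 5s): allow one more
	// scrape before exiting so the final counter values are captured.
	wait := 5 * time.Second
	fmt.Printf("Waiting %s for prometheus metrics to be scraped\n", wait)
	<-time.After(wait)
}

// runTest is a hypothetical stand-in that records something on the registry.
func runTest(reg prometheus.Registerer) {
	c := prometheus.NewCounter(prometheus.CounterOpts{Name: "scaletest_example_total"})
	reg.MustRegister(c)
	c.Inc()
}
```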