From dd86100f33cd4b791b70b2df5e1f6ce51f5b7050 Mon Sep 17 00:00:00 2001
From: Cian Johnston <cian@coder.com>
Date: Mon, 16 Oct 2023 12:37:12 +0100
Subject: [PATCH] fix(scaletest): fix flake in Test_Runner/Cleanup (#10252)

* fix(scaletest/createworkspaces): address flake in Test_Runner/CleanupPendingBuild

* fix(scaletest): pass io.Writer to Cleanup()

* add some extra logs to workspacebuild cleanup

* fixup! fix(scaletest): pass io.Writer to Cleanup()

* remove race

* fmt

* address PR comments
---
 cli/exp_scaletest.go                   |  4 +--
 scaletest/createworkspaces/run.go      |  6 +++--
 scaletest/createworkspaces/run_test.go | 34 ++++++++++++++++++++------
 scaletest/dashboard/run.go             |  2 +-
 scaletest/harness/harness_test.go      |  6 ++---
 scaletest/harness/run.go               |  4 +--
 scaletest/harness/run_test.go          | 10 ++++----
 scaletest/workspacebuild/run.go        | 22 +++++++++++------
 scaletest/workspacebuild/run_test.go   |  3 ++-
 scaletest/workspacetraffic/run.go      |  2 +-
 10 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/cli/exp_scaletest.go b/cli/exp_scaletest.go
index e09e5aff2d..f615e650b5 100644
--- a/cli/exp_scaletest.go
+++ b/cli/exp_scaletest.go
@@ -1241,7 +1241,7 @@ func (r *runnableTraceWrapper) Run(ctx context.Context, id string, logs io.Write
 	return r.runner.Run(ctx2, id, logs)
 }
 
-func (r *runnableTraceWrapper) Cleanup(ctx context.Context, id string) error {
+func (r *runnableTraceWrapper) Cleanup(ctx context.Context, id string, logs io.Writer) error {
 	c, ok := r.runner.(harness.Cleanable)
 	if !ok {
 		return nil
@@ -1253,7 +1253,7 @@ func (r *runnableTraceWrapper) Cleanup(ctx context.Context, id string) error {
 	ctx, span := r.tracer.Start(ctx, r.spanName+" cleanup")
 	defer span.End()
 
-	return c.Cleanup(ctx, id)
+	return c.Cleanup(ctx, id, logs)
 }
 
 // newScaleTestUser returns a random username and email address that can be used
diff --git a/scaletest/createworkspaces/run.go b/scaletest/createworkspaces/run.go
index d1c1713e3d..6793475012 100644
--- a/scaletest/createworkspaces/run.go
+++ b/scaletest/createworkspaces/run.go
@@ -176,13 +176,14 @@ resourceLoop:
 }
 
 // Cleanup implements Cleanable.
-func (r *Runner) Cleanup(ctx context.Context, id string) error {
+func (r *Runner) Cleanup(ctx context.Context, id string, logs io.Writer) error {
 	if r.cfg.NoCleanup {
+		_, _ = fmt.Fprintln(logs, "skipping cleanup")
 		return nil
 	}
 
 	if r.workspacebuildRunner != nil {
-		err := r.workspacebuildRunner.Cleanup(ctx, id)
+		err := r.workspacebuildRunner.Cleanup(ctx, id, logs)
 		if err != nil {
 			return xerrors.Errorf("cleanup workspace: %w", err)
 		}
@@ -191,6 +192,7 @@ func (r *Runner) Cleanup(ctx context.Context, id string) error {
 	if r.userID != uuid.Nil {
 		err := r.client.DeleteUser(ctx, r.userID)
 		if err != nil {
+			_, _ = fmt.Fprintf(logs, "failed to delete user %q: %v\n", r.userID.String(), err)
 			return xerrors.Errorf("delete user: %w", err)
 		}
 	}
diff --git a/scaletest/createworkspaces/run_test.go b/scaletest/createworkspaces/run_test.go
index 5d27cc3fb0..88e1e59cd9 100644
--- a/scaletest/createworkspaces/run_test.go
+++ b/scaletest/createworkspaces/run_test.go
@@ -177,8 +177,13 @@ func Test_Runner(t *testing.T) {
 		require.Contains(t, logsStr, "Opening reconnecting PTY connection to agent")
 		require.Contains(t, logsStr, "Opening connection to workspace agent")
 
-		err = runner.Cleanup(ctx, "1")
+		cleanupLogs := bytes.NewBuffer(nil)
+		err = runner.Cleanup(ctx, "1", cleanupLogs)
 		require.NoError(t, err)
+		cleanupLogsStr := cleanupLogs.String()
+		require.Contains(t, cleanupLogsStr, "deleting workspace")
+		require.NotContains(t, cleanupLogsStr, "canceling workspace build") // The build should have already completed.
+		require.Contains(t, cleanupLogsStr, "Build succeeded!")
 
 		// Ensure the user and workspace were deleted.
 		users, err = client.Users(ctx, codersdk.UsersRequest{})
@@ -217,7 +222,7 @@ func Test_Runner(t *testing.T) {
 			},
 			ProvisionApply: []*proto.Response{
 				{
-					Type: &proto.Response_Log{Log: &proto.Log{}},
+					Type: &proto.Response_Log{Log: &proto.Log{}}, // This provisioner job will never complete.
 				},
 			},
 		})
@@ -257,24 +262,36 @@ func Test_Runner(t *testing.T) {
 			close(done)
 		}()
 
+		// Wait for the workspace build job to be picked up.
 		require.Eventually(t, func() bool {
 			workspaces, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{})
 			if err != nil {
 				return false
 			}
+			if len(workspaces.Workspaces) == 0 {
+				return false
+			}
 
-			return len(workspaces.Workspaces) > 0
-		}, testutil.WaitShort, testutil.IntervalFast)
+			ws := workspaces.Workspaces[0]
+			t.Logf("checking build: %s | %s", ws.LatestBuild.Transition, ws.LatestBuild.Job.Status)
+			// There should be only one build at present.
+			if ws.LatestBuild.Transition != codersdk.WorkspaceTransitionStart {
+				t.Errorf("expected build transition %s, got %s", codersdk.WorkspaceTransitionStart, ws.LatestBuild.Transition)
+				return false
+			}
+			return ws.LatestBuild.Job.Status == codersdk.ProvisionerJobRunning
+		}, testutil.WaitShort, testutil.IntervalMedium)
 
 		cancelFunc()
 		<-done
 
 		// When we run the cleanup, it should be canceled
+		cleanupLogs := bytes.NewBuffer(nil)
 		cancelCtx, cancelFunc = context.WithCancel(ctx)
 		done = make(chan struct{})
 		go func() {
 			// This will return an error as the "delete" operation will never complete.
-			_ = runner.Cleanup(cancelCtx, "1")
+			_ = runner.Cleanup(cancelCtx, "1", cleanupLogs)
 			close(done)
 		}()
 
@@ -311,9 +328,11 @@ func Test_Runner(t *testing.T) {
 				}
 			}
 			return false
-		}, testutil.WaitShort, testutil.IntervalFast)
+		}, testutil.WaitShort, testutil.IntervalMedium)
 		cancelFunc()
 		<-done
+		cleanupLogsStr := cleanupLogs.String()
+		require.Contains(t, cleanupLogsStr, "canceling workspace build")
 	})
 
 	t.Run("NoCleanup", func(t *testing.T) {
@@ -447,7 +466,8 @@ func Test_Runner(t *testing.T) {
 		require.Contains(t, logsStr, "Opening reconnecting PTY connection to agent")
 		require.Contains(t, logsStr, "Opening connection to workspace agent")
 
-		err = runner.Cleanup(ctx, "1")
+		cleanupLogs := bytes.NewBuffer(nil)
+		err = runner.Cleanup(ctx, "1", cleanupLogs)
 		require.NoError(t, err)
 
 		// Ensure the user and workspace were not deleted.
diff --git a/scaletest/dashboard/run.go b/scaletest/dashboard/run.go
index 9d7d6f32f2..0c80d98d59 100644
--- a/scaletest/dashboard/run.go
+++ b/scaletest/dashboard/run.go
@@ -125,6 +125,6 @@ func (r *Runner) runUntilDeadlineExceeded(ctx context.Context) error {
 	}
 }
 
-func (*Runner) Cleanup(_ context.Context, _ string) error {
+func (*Runner) Cleanup(_ context.Context, _ string, _ io.Writer) error {
 	return nil
 }
diff --git a/scaletest/harness/harness_test.go b/scaletest/harness/harness_test.go
index 11fb8d8bfe..10e1f87bd7 100644
--- a/scaletest/harness/harness_test.go
+++ b/scaletest/harness/harness_test.go
@@ -112,7 +112,7 @@ func Test_TestHarness(t *testing.T) {
 				RunFn: func(_ context.Context, _ string, _ io.Writer) error {
 					return nil
 				},
-				CleanupFn: func(_ context.Context, _ string) error {
+				CleanupFn: func(_ context.Context, _ string, _ io.Writer) error {
 					panic(testPanicMessage)
 				},
 			})
@@ -150,7 +150,7 @@ func Test_TestHarness(t *testing.T) {
 				RunFn: func(_ context.Context, _ string, _ io.Writer) error {
 					return nil
 				},
-				CleanupFn: func(_ context.Context, _ string) error {
+				CleanupFn: func(_ context.Context, _ string, _ io.Writer) error {
 					return nil
 				},
 			})
@@ -295,7 +295,7 @@ func fakeTestFns(err, cleanupErr error) testFns {
 		RunFn: func(_ context.Context, _ string, _ io.Writer) error {
 			return err
 		},
-		CleanupFn: func(_ context.Context, _ string) error {
+		CleanupFn: func(_ context.Context, _ string, _ io.Writer) error {
 			return cleanupErr
 		},
 	}
diff --git a/scaletest/harness/run.go b/scaletest/harness/run.go
index 4ee4ee976c..00cdc0dbf1 100644
--- a/scaletest/harness/run.go
+++ b/scaletest/harness/run.go
@@ -28,7 +28,7 @@ type Runnable interface {
 type Cleanable interface {
 	Runnable
 	// Cleanup should clean up any lingering resources from the test.
-	Cleanup(ctx context.Context, id string) error
+	Cleanup(ctx context.Context, id string, logs io.Writer) error
 }
 
 // AddRun creates a new *TestRun with the given name, ID and Runnable, adds it
@@ -131,7 +131,7 @@ func (r *TestRun) Cleanup(ctx context.Context) (err error) {
 		}
 	}()
 
-	err = c.Cleanup(ctx, r.id)
+	err = c.Cleanup(ctx, r.id, r.logs)
 	//nolint:revive // we use named returns because we mutate it in a defer
 	return
 }
diff --git a/scaletest/harness/run_test.go b/scaletest/harness/run_test.go
index e339849061..7466e97435 100644
--- a/scaletest/harness/run_test.go
+++ b/scaletest/harness/run_test.go
@@ -16,7 +16,7 @@ import (
 type testFns struct {
 	RunFn func(ctx context.Context, id string, logs io.Writer) error
 	// CleanupFn is optional if no cleanup is required.
-	CleanupFn func(ctx context.Context, id string) error
+	CleanupFn func(ctx context.Context, id string, logs io.Writer) error
 }
 
 // Run implements Runnable.
@@ -25,12 +25,12 @@ func (fns testFns) Run(ctx context.Context, id string, logs io.Writer) error {
 }
 
 // Cleanup implements Cleanable.
-func (fns testFns) Cleanup(ctx context.Context, id string) error {
+func (fns testFns) Cleanup(ctx context.Context, id string, logs io.Writer) error {
 	if fns.CleanupFn == nil {
 		return nil
 	}
 
-	return fns.CleanupFn(ctx, id)
+	return fns.CleanupFn(ctx, id, logs)
 }
 
 func Test_TestRun(t *testing.T) {
@@ -49,7 +49,7 @@ func Test_TestRun(t *testing.T) {
 					atomic.AddInt64(&runCalled, 1)
 					return nil
 				},
-				CleanupFn: func(ctx context.Context, id string) error {
+				CleanupFn: func(ctx context.Context, id string, logs io.Writer) error {
 					atomic.AddInt64(&cleanupCalled, 1)
 					return nil
 				},
@@ -93,7 +93,7 @@ func Test_TestRun(t *testing.T) {
 				RunFn: func(ctx context.Context, id string, logs io.Writer) error {
 					return nil
 				},
-				CleanupFn: func(ctx context.Context, id string) error {
+				CleanupFn: func(ctx context.Context, id string, logs io.Writer) error {
 					atomic.AddInt64(&cleanupCalled, 1)
 					return nil
 				},
diff --git a/scaletest/workspacebuild/run.go b/scaletest/workspacebuild/run.go
index 49b2ba6041..7d5046eabf 100644
--- a/scaletest/workspacebuild/run.go
+++ b/scaletest/workspacebuild/run.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"net/http"
 	"time"
 
 	"github.com/google/uuid"
@@ -106,25 +107,31 @@ func NewCleanupRunner(client *codersdk.Client, workspaceID uuid.UUID) *CleanupRu
 
 // Run implements Runnable.
 func (r *CleanupRunner) Run(ctx context.Context, _ string, logs io.Writer) error {
+	ctx, span := tracing.StartSpan(ctx)
+	defer span.End()
+	logs = loadtestutil.NewSyncWriter(logs)
+	logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)
 	if r.workspaceID == uuid.Nil {
 		return nil
 	}
-	ctx, span := tracing.StartSpan(ctx)
-	defer span.End()
-
-	logs = loadtestutil.NewSyncWriter(logs)
-	logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)
+	logger.Info(ctx, "deleting workspace", slog.F("workspace_id", r.workspaceID))
 	r.client.SetLogger(logger)
 	r.client.SetLogBodies(true)
 
 	ws, err := r.client.Workspace(ctx, r.workspaceID)
 	if err != nil {
+		var sdkErr *codersdk.Error
+		if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusNotFound {
+			logger.Info(ctx, "workspace not found, skipping delete", slog.F("workspace_id", r.workspaceID))
+			return nil
+		}
 		return err
 	}
 
 	build, err := r.client.WorkspaceBuild(ctx, ws.LatestBuild.ID)
 	if err == nil && build.Job.Status.Active() {
 		// mark the build as canceled
+		logger.Info(ctx, "canceling workspace build", slog.F("build_id", build.ID), slog.F("workspace_id", r.workspaceID))
 		if err = r.client.CancelWorkspaceBuild(ctx, build.ID); err == nil {
 			// Wait for the job to cancel before we delete it
 			_ = waitForBuild(ctx, logs, r.client, build.ID) // it will return a "build canceled" error
@@ -151,12 +158,11 @@ func (r *CleanupRunner) Run(ctx context.Context, _ string, logs io.Writer) error
 }
 
 // Cleanup implements Cleanable by wrapping CleanupRunner.
-func (r *Runner) Cleanup(ctx context.Context, id string) error {
-	// TODO: capture these logs
+func (r *Runner) Cleanup(ctx context.Context, id string, w io.Writer) error {
 	return (&CleanupRunner{
 		client:      r.client,
 		workspaceID: r.workspaceID,
-	}).Run(ctx, id, io.Discard)
+	}).Run(ctx, id, w)
 }
 
 func waitForBuild(ctx context.Context, w io.Writer, client *codersdk.Client, buildID uuid.UUID) error {
diff --git a/scaletest/workspacebuild/run_test.go b/scaletest/workspacebuild/run_test.go
index 335e2d071a..2ed5b09c0c 100644
--- a/scaletest/workspacebuild/run_test.go
+++ b/scaletest/workspacebuild/run_test.go
@@ -180,7 +180,8 @@ func Test_Runner(t *testing.T) {
 		coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspaces[0].LatestBuild.ID)
 		coderdtest.AwaitWorkspaceAgents(t, client, workspaces[0].ID)
 
-		err = runner.Cleanup(ctx, "1")
+		cleanupLogs := bytes.NewBuffer(nil)
+		err = runner.Cleanup(ctx, "1", cleanupLogs)
 		require.NoError(t, err)
 	})
 
diff --git a/scaletest/workspacetraffic/run.go b/scaletest/workspacetraffic/run.go
index aea4345c47..d3fcd6033b 100644
--- a/scaletest/workspacetraffic/run.go
+++ b/scaletest/workspacetraffic/run.go
@@ -169,7 +169,7 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error {
 }
 
 // Cleanup does nothing, successfully.
-func (*Runner) Cleanup(context.Context, string) error {
+func (*Runner) Cleanup(context.Context, string, io.Writer) error {
 	return nil
 }