chore: fix race in cron close behavior (TestAgent_WriteVSCodeConfigs) (#11243)

* chore: add unit test to exercise flake * Implement a fix for cron stop() before run(). This fix still has a race condition. I do not see a clean solution without modifying the cron library. The cron library uses a boolean to indicate running, and that boolean needs to be set to "true" before we call "Close()". Or "Close()" should prevent "Run()" from doing anything. In either case, this solves the issue for a niche unit test bug in which the test finishes, calling Close(), before there was an opportunity to start the goroutine. It probably isn't worth a lot of time investment, and this fix will suffice.
2023-12-18 09:26:40 -06:00 · 2023-12-18 09:26:40 -06:00 · a6901ae2c5
parent 56cbd47082
commit a6901ae2c5
2 changed files with 22 additions and 1 deletions
--- a/agent/agentscripts/agentscripts.go
+++ b/agent/agentscripts/agentscripts.go
@ -129,7 +129,18 @@ func (r *Runner) StartCron() {
 	// has exited by the time the `cron.Stop()` context returns, so we need to
 	// track it manually.
 	err := r.trackCommandGoroutine(func() {
-		r.cron.Run()
+		// Since this is run async, in quick unit tests, it is possible the
+		// Close() function gets called before we even start the cron.
+		// In these cases, the Run() will never end.
+		// So if we are closed, we just return, and skip the Run() entirely.
+		select {
+		case <-r.cronCtx.Done():
+			// The cronCtx is cancelled before cron.Close() happens. So if the ctx is
+			// cancelled, then Close() will be called, or it is about to be called.
+			// So do nothing!
+		default:
+			r.cron.Run()
+		}
 	})
 	if err != nil {
 		r.Logger.Warn(context.Background(), "start cron failed", slog.Error(err))
@ -315,6 +326,7 @@ func (r *Runner) Close() error {
 		return nil
 	}
 	close(r.closed)
+	// Must cancel the cron ctx BEFORE stopping the cron.
 	r.cronCtxCancel()
 	<-r.cron.Stop().Done()
 	r.cmdCloseWait.Wait()
--- a/agent/agentscripts/agentscripts_test.go
+++ b/agent/agentscripts/agentscripts_test.go
@ -53,6 +53,15 @@ func TestTimeout(t *testing.T) {
 	require.ErrorIs(t, runner.Execute(context.Background(), nil), agentscripts.ErrTimeout)
 }

+// TestCronClose exists because cron.Run() can happen after cron.Close().
+// If this happens, there used to be a deadlock.
+func TestCronClose(t *testing.T) {
+	t.Parallel()
+	runner := agentscripts.New(agentscripts.Options{})
+	runner.StartCron()
+	require.NoError(t, runner.Close(), "close runner")
+}
+
 func setup(t *testing.T, patchLogs func(ctx context.Context, req agentsdk.PatchLogs) error) *agentscripts.Runner {
 	t.Helper()
 	if patchLogs == nil {