diff --git a/agent/agent.go b/agent/agent.go index 10ecdfb5d5..34af2b59f3 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -210,25 +210,31 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM var out bytes.Buffer result := &codersdk.WorkspaceAgentMetadataResult{ // CollectedAt is set here for testing purposes and overrode by - // the server to the time the server received the result to protect - // against clock skew. + // coderd to the time of server receipt to solve clock skew. // // In the future, the server may accept the timestamp from the agent - // if it is certain the clocks are in sync. + // if it can guarantee the clocks are synchronized. CollectedAt: time.Now(), } cmd, err := a.sshServer.CreateCommand(ctx, md.Script, nil) if err != nil { - result.Error = err.Error() + result.Error = fmt.Sprintf("create cmd: %+v", err) return result } cmd.Stdout = &out cmd.Stderr = &out + cmd.Stdin = io.LimitReader(nil, 0) - // The error isn't mutually exclusive with useful output. - err = cmd.Run() + // We split up Start and Wait instead of calling Run so that we can return a more precise error. + err = cmd.Start() + if err != nil { + result.Error = fmt.Sprintf("start cmd: %+v", err) + return result + } + // This error isn't mutually exclusive with useful output. + err = cmd.Wait() const bufLimit = 10 << 10 if out.Len() > bufLimit { err = errors.Join( @@ -238,8 +244,12 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM out.Truncate(bufLimit) } + // Important: if the command times out, we may see a misleading error like + // "exit status 1", so it's important to include the context error. + err = errors.Join(err, ctx.Err()) + if err != nil { - result.Error = err.Error() + result.Error = fmt.Sprintf("run cmd: %+v", err) } result.Value = out.String() return result diff --git a/agent/agent_test.go b/agent/agent_test.go index 1d5a852f7d..ee135a05ce 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -951,19 +951,17 @@ func TestAgent_StartupScript(t *testing.T) { func TestAgent_Metadata(t *testing.T) { t.Parallel() + echoHello := "echo 'hello'" + t.Run("Once", func(t *testing.T) { t.Parallel() - script := "echo -n hello" - if runtime.GOOS == "windows" { - script = "powershell " + script - } //nolint:dogsled _, client, _, _, _ := setupAgent(t, agentsdk.Manifest{ Metadata: []codersdk.WorkspaceAgentMetadataDescription{ { Key: "greeting", Interval: 0, - Script: script, + Script: echoHello, }, }, }, 0) @@ -986,80 +984,113 @@ func TestAgent_Metadata(t *testing.T) { }) t.Run("Many", func(t *testing.T) { - if runtime.GOOS == "windows" { - // Shell scripting in Windows is a pain, and we have already tested - // that the OS logic works in the simpler "Once" test above. - t.Skip() - } t.Parallel() - - dir := t.TempDir() - - const reportInterval = 2 - const intervalUnit = 100 * time.Millisecond - var ( - greetingPath = filepath.Join(dir, "greeting") - script = "echo hello | tee -a " + greetingPath - ) + //nolint:dogsled _, client, _, _, _ := setupAgent(t, agentsdk.Manifest{ Metadata: []codersdk.WorkspaceAgentMetadataDescription{ { Key: "greeting", - Interval: reportInterval, - Script: script, - }, - { - Key: "bad", - Interval: reportInterval, - Script: "exit 1", + Interval: 1, + Timeout: 100, + Script: echoHello, }, }, }, 0) + var gotMd map[string]agentsdk.PostMetadataRequest require.Eventually(t, func() bool { - return len(client.getMetadata()) == 2 + gotMd = client.getMetadata() + return len(gotMd) == 1 }, testutil.WaitShort, testutil.IntervalMedium) - for start := time.Now(); time.Since(start) < testutil.WaitMedium; time.Sleep(testutil.IntervalMedium) { - md := client.getMetadata() - if len(md) != 2 { - panic("unexpected number of metadata entries") - } + collectedAt1 := gotMd["greeting"].CollectedAt + if !assert.Equal(t, "hello", strings.TrimSpace(gotMd["greeting"].Value)) { + t.Errorf("got: %+v", gotMd) + } - require.Equal(t, "hello\n", md["greeting"].Value) - require.Equal(t, "exit status 1", md["bad"].Error) - - greetingByt, err := os.ReadFile(greetingPath) - require.NoError(t, err) - - var ( - numGreetings = bytes.Count(greetingByt, []byte("hello")) - idealNumGreetings = time.Since(start) / (reportInterval * intervalUnit) - // We allow a 50% error margin because the report loop may backlog - // in CI and other toasters. In production, there is no hard - // guarantee on timing either, and the frontend gives similar - // wiggle room to the staleness of the value. - upperBound = int(idealNumGreetings) + 1 - lowerBound = (int(idealNumGreetings) / 2) - ) - - if idealNumGreetings < 50 { - // There is an insufficient sample size. - continue - } - - t.Logf("numGreetings: %d, idealNumGreetings: %d", numGreetings, idealNumGreetings) - // The report loop may slow down on load, but it should never, ever - // speed up. - if numGreetings > upperBound { - t.Fatalf("too many greetings: %d > %d in %v", numGreetings, upperBound, time.Since(start)) - } else if numGreetings < lowerBound { - t.Fatalf("too few greetings: %d < %d", numGreetings, lowerBound) - } + if !assert.Eventually(t, func() bool { + gotMd = client.getMetadata() + return gotMd["greeting"].CollectedAt.After(collectedAt1) + }, testutil.WaitShort, testutil.IntervalMedium) { + t.Fatalf("expected metadata to be collected again") } }) } +func TestAgentMetadata_Timing(t *testing.T) { + if runtime.GOOS == "windows" { + // Shell scripting in Windows is a pain, and we have already tested + // that the OS logic works in the simpler tests. + t.Skip() + } + testutil.SkipIfNotTiming(t) + t.Parallel() + + dir := t.TempDir() + + const reportInterval = 2 + const intervalUnit = 100 * time.Millisecond + var ( + greetingPath = filepath.Join(dir, "greeting") + script = "echo hello | tee -a " + greetingPath + ) + //nolint:dogsled + _, client, _, _, _ := setupAgent(t, agentsdk.Manifest{ + Metadata: []codersdk.WorkspaceAgentMetadataDescription{ + { + Key: "greeting", + Interval: reportInterval, + Script: script, + }, + { + Key: "bad", + Interval: reportInterval, + Script: "exit 1", + }, + }, + }, 0) + + require.Eventually(t, func() bool { + return len(client.getMetadata()) == 2 + }, testutil.WaitShort, testutil.IntervalMedium) + + for start := time.Now(); time.Since(start) < testutil.WaitMedium; time.Sleep(testutil.IntervalMedium) { + md := client.getMetadata() + require.Len(t, md, 2, "got: %+v", md) + + require.Equal(t, "hello\n", md["greeting"].Value) + require.Equal(t, "run cmd: exit status 1", md["bad"].Error) + + greetingByt, err := os.ReadFile(greetingPath) + require.NoError(t, err) + + var ( + numGreetings = bytes.Count(greetingByt, []byte("hello")) + idealNumGreetings = time.Since(start) / (reportInterval * intervalUnit) + // We allow a 50% error margin because the report loop may backlog + // in CI and other toasters. In production, there is no hard + // guarantee on timing either, and the frontend gives similar + // wiggle room to the staleness of the value. + upperBound = int(idealNumGreetings) + 1 + lowerBound = (int(idealNumGreetings) / 2) + ) + + if idealNumGreetings < 50 { + // There is an insufficient sample size. + continue + } + + t.Logf("numGreetings: %d, idealNumGreetings: %d", numGreetings, idealNumGreetings) + // The report loop may slow down on load, but it should never, ever + // speed up. + if numGreetings > upperBound { + t.Fatalf("too many greetings: %d > %d in %v", numGreetings, upperBound, time.Since(start)) + } else if numGreetings < lowerBound { + t.Fatalf("too few greetings: %d < %d", numGreetings, lowerBound) + } + } +} + func TestAgent_Lifecycle(t *testing.T) { t.Parallel() diff --git a/testutil/enable_timing.go b/testutil/enable_timing.go new file mode 100644 index 0000000000..d9fffafd95 --- /dev/null +++ b/testutil/enable_timing.go @@ -0,0 +1,8 @@ +//go:build timing + +package testutil + +var _ = func() any { + timing = true + return nil +}() diff --git a/testutil/timing.go b/testutil/timing.go new file mode 100644 index 0000000000..9cdd3bd8f6 --- /dev/null +++ b/testutil/timing.go @@ -0,0 +1,20 @@ +package testutil + +import ( + "testing" +) + +// We can't run timing-sensitive tests in CI because of the +// great variance in runner performance. Instead of not testing timing at all, +// we relegate it to humans manually running certain tests with the "-timing" +// flag from time to time. +// +// Eventually, we should run all timing tests in a self-hosted runner. + +var timing bool + +func SkipIfNotTiming(t *testing.T) { + if !timing { + t.Skip("skipping timing-sensitive test") + } +}