ci: use big runners (#4990)

* chore: Close idle connections on test cleanup

It's possible that this was the source of a leak on Windows...

* ci: use big runners

* fix: Improve tailnet connections by reducing timeouts

This awaits connection ping before running a dial. Before,
we were hitting the TCP retransmission and handshake timeouts,
which could intermittently add 1 or 5 seconds to a connection
being initialized.

* Add logging to Startupscript test

* Add better logging

* Write startup script logs to fs dir

* Fix startup script test

* Fix startup script test

* Reduce test timeout

* Use central tmp dir in agent

* Adjust output

* Skip startup script test on Windows

Co-authored-by: Kyle Carberry <kyle@carberry.com>
This commit is contained in:
Ammar Bandukwala 2022-11-13 14:23:23 -06:00 committed by GitHub
parent 9578ce9f77
commit 73f91e4690
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 43 additions and 31 deletions

View File

@ -89,7 +89,7 @@ jobs:
style-lint-golangci:
name: style/lint/golangci
timeout-minutes: 5
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || 'ubuntu-latest' }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-go@v3
@ -171,7 +171,7 @@ jobs:
gen:
name: "style/gen"
timeout-minutes: 8
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || 'ubuntu-latest' }}
needs: changes
if: needs.changes.outputs.docs-only == 'false'
steps:
@ -276,7 +276,7 @@ jobs:
test-go:
name: "test/go"
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || matrix.os }}
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-8-cores'|| matrix.os }}
timeout-minutes: 20
strategy:
matrix:
@ -336,11 +336,7 @@ jobs:
echo ::set-output name=cover::false
fi
set -x
test_timeout=5m
if [[ "${{ matrix.os }}" == windows* ]]; then
test_timeout=10m
fi
gotestsum --junitfile="gotests.xml" --packages="./..." -- -parallel=8 -timeout=$test_timeout -short -failfast $COVERAGE_FLAGS
gotestsum --junitfile="gotests.xml" --packages="./..." -- -parallel=8 -timeout=3m -short -failfast $COVERAGE_FLAGS
- uses: codecov/codecov-action@v3
# This action has a tendency to error out unexpectedly, it has
@ -356,7 +352,7 @@ jobs:
test-go-postgres:
name: "test/go/postgres"
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || 'ubuntu-latest' }}
# This timeout must be greater than the timeout set by `go test` in
# `make test-postgres` to ensure we receive a trace of running
# goroutines. Setting this to the timeout +5m should work quite well
@ -417,7 +413,7 @@ jobs:
deploy:
name: "deploy"
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || 'ubuntu-latest' }}
timeout-minutes: 30
needs: changes
if: |
@ -514,7 +510,7 @@ jobs:
test-js:
name: "test/js"
runs-on: ubuntu-latest
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || 'ubuntu-latest' }}
timeout-minutes: 20
steps:
- uses: actions/checkout@v3

View File

@ -28,7 +28,7 @@ env:
jobs:
release:
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-16-cores' || 'ubuntu-latest' }}
env:
# Necessary for Docker manifest
DOCKER_CLI_EXPERIMENTAL: "enabled"

View File

@ -463,8 +463,11 @@ test: test-clean
# When updating -timeout for this test, keep in sync with
# test-go-postgres (.github/workflows/coder.yaml).
test-postgres: test-clean test-postgres-docker
# The postgres test is prone to failure, so we limit parallelism for
# more consistent execution.
DB=ci DB_FROM=$(shell go run scripts/migrate-ci/main.go) gotestsum --junitfile="gotests.xml" --packages="./..." -- \
-covermode=atomic -coverprofile="gotests.coverage" -timeout=20m \
-parallel=4 \
-coverpkg=./... \
-count=1 -race -failfast
.PHONY: test-postgres

View File

@ -56,6 +56,7 @@ const (
type Options struct {
Filesystem afero.Fs
TempDir string
ExchangeToken func(ctx context.Context) (string, error)
Client Client
ReconnectingPTYTimeout time.Duration
@ -78,6 +79,9 @@ func New(options Options) io.Closer {
if options.Filesystem == nil {
options.Filesystem = afero.NewOsFs()
}
if options.TempDir == "" {
options.TempDir = os.TempDir()
}
if options.ExchangeToken == nil {
options.ExchangeToken = func(ctx context.Context) (string, error) {
return "", nil
@ -93,6 +97,7 @@ func New(options Options) io.Closer {
client: options.Client,
exchangeToken: options.ExchangeToken,
filesystem: options.Filesystem,
tempDir: options.TempDir,
stats: &Stats{},
}
server.init(ctx)
@ -104,6 +109,7 @@ type agent struct {
client Client
exchangeToken func(ctx context.Context) (string, error)
filesystem afero.Fs
tempDir string
reconnectingPTYs sync.Map
reconnectingPTYTimeout time.Duration
@ -375,14 +381,14 @@ func (a *agent) runStartupScript(ctx context.Context, script string) error {
return nil
}
writer, err := os.OpenFile(filepath.Join(os.TempDir(), "coder-startup-script.log"), os.O_CREATE|os.O_RDWR, 0o600)
a.logger.Info(ctx, "running startup script", slog.F("script", script))
writer, err := a.filesystem.OpenFile(filepath.Join(a.tempDir, "coder-startup-script.log"), os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return xerrors.Errorf("open startup script log file: %w", err)
}
defer func() {
_ = writer.Close()
}()
cmd, err := a.createCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("create command: %w", err)

View File

@ -61,7 +61,7 @@ func TestAgent(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn, stats := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
conn, stats, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
sshClient, err := conn.SSHClient(ctx)
require.NoError(t, err)
@ -81,7 +81,7 @@ func TestAgent(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn, stats := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
conn, stats, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
ptyConn, err := conn.ReconnectingPTY(ctx, uuid.NewString(), 128, 128, "/bin/bash")
require.NoError(t, err)
@ -231,7 +231,7 @@ func TestAgent(t *testing.T) {
if runtime.GOOS == "windows" {
home = "/" + strings.ReplaceAll(home, "\\", "/")
}
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
sshClient, err := conn.SSHClient(ctx)
require.NoError(t, err)
defer sshClient.Close()
@ -261,7 +261,7 @@ func TestAgent(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
sshClient, err := conn.SSHClient(ctx)
require.NoError(t, err)
defer sshClient.Close()
@ -360,19 +360,23 @@ func TestAgent(t *testing.T) {
t.Run("StartupScript", func(t *testing.T) {
t.Parallel()
tempPath := filepath.Join(t.TempDir(), "content.txt")
content := "somethingnice"
setupAgent(t, codersdk.WorkspaceAgentMetadata{
StartupScript: fmt.Sprintf("echo %s > %s", content, tempPath),
if runtime.GOOS == "windows" {
t.Skip("This test doesn't work on Windows for some reason...")
}
content := "output"
_, _, fs := setupAgent(t, codersdk.WorkspaceAgentMetadata{
StartupScript: "echo " + content,
}, 0)
var gotContent string
require.Eventually(t, func() bool {
content, err := os.ReadFile(tempPath)
outputPath := filepath.Join(os.TempDir(), "coder-startup-script.log")
content, err := afero.ReadFile(fs, outputPath)
if err != nil {
t.Logf("read file %q: %s", outputPath, err)
return false
}
if len(content) == 0 {
t.Logf("no content in %q", outputPath)
return false
}
if runtime.GOOS == "windows" {
@ -384,7 +388,7 @@ func TestAgent(t *testing.T) {
}
gotContent = string(content)
return true
}, testutil.WaitMedium, testutil.IntervalMedium)
}, testutil.WaitShort, testutil.IntervalMedium)
require.Equal(t, content, strings.TrimSpace(gotContent))
})
@ -400,7 +404,7 @@ func TestAgent(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
id := uuid.NewString()
netConn, err := conn.ReconnectingPTY(ctx, id, 100, 100, "/bin/bash")
require.NoError(t, err)
@ -497,7 +501,7 @@ func TestAgent(t *testing.T) {
}
}()
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
require.True(t, conn.AwaitReachable(context.Background()))
conn1, err := conn.DialContext(context.Background(), l.Addr().Network(), l.Addr().String())
require.NoError(t, err)
@ -518,7 +522,7 @@ func TestAgent(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
derpMap := tailnettest.RunDERPAndSTUN(t)
conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
DERPMap: derpMap,
}, 0)
defer conn.Close()
@ -601,7 +605,7 @@ func TestAgent(t *testing.T) {
}
func setupSSHCommand(t *testing.T, beforeArgs []string, afterArgs []string) *exec.Cmd {
agentConn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
agentConn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
listener, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)
waitGroup := sync.WaitGroup{}
@ -644,7 +648,7 @@ func setupSSHCommand(t *testing.T, beforeArgs []string, afterArgs []string) *exe
func setupSSHSession(t *testing.T, options codersdk.WorkspaceAgentMetadata) *ssh.Session {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn, _ := setupAgent(t, options, 0)
conn, _, _ := setupAgent(t, options, 0)
sshClient, err := conn.SSHClient(ctx)
require.NoError(t, err)
t.Cleanup(func() {
@ -664,6 +668,7 @@ func (c closeFunc) Close() error {
func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeout time.Duration) (
*codersdk.AgentConn,
<-chan *codersdk.AgentStats,
afero.Fs,
) {
if metadata.DERPMap == nil {
metadata.DERPMap = tailnettest.RunDERPAndSTUN(t)
@ -671,6 +676,7 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo
coordinator := tailnet.NewCoordinator()
agentID := uuid.New()
statsCh := make(chan *codersdk.AgentStats)
fs := afero.NewMemMapFs()
closer := agent.New(agent.Options{
Client: &client{
t: t,
@ -679,6 +685,7 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo
statsChan: statsCh,
coordinator: coordinator,
},
Filesystem: fs,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
ReconnectingPTYTimeout: ptyTimeout,
})
@ -704,7 +711,7 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo
conn.SetNodeCallback(sendNode)
return &codersdk.AgentConn{
Conn: conn,
}, statsCh
}, statsCh, fs
}
var dialTestPayload = []byte("dean-was-here123")