feat: automatically stop workspaces based on failure_ttl (#7989)

This commit is contained in:
Jon Ayers 2023-06-22 00:33:22 -04:00 committed by GitHub
parent d434181941
commit 1b0124ecdb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 421 additions and 161 deletions

View File

@ -62,7 +62,7 @@ import (
"github.com/coder/coder/cli/cliui"
"github.com/coder/coder/cli/config"
"github.com/coder/coder/coderd"
"github.com/coder/coder/coderd/autobuild/executor"
"github.com/coder/coder/coderd/autobuild"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbfake"
"github.com/coder/coder/coderd/database/dbmetrics"
@ -900,7 +900,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
defer autobuildPoller.Stop()
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
autobuildExecutor.Run()
// Currently there is no way to ask the server to shut

3
coderd/autobuild/doc.go Normal file
View File

@ -0,0 +1,3 @@
// Package autobuild contains logic for scheduling workspace
// builds in the background.
package autobuild

View File

@ -1,4 +1,4 @@
package executor
package autobuild
import (
"context"
@ -13,9 +13,11 @@ import (
"cdr.dev/slog"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/db2sdk"
"github.com/coder/coder/coderd/database/dbauthz"
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/coderd/wsbuilder"
"github.com/coder/coder/codersdk"
)
// Executor automatically starts or stops workspaces.
@ -35,8 +37,8 @@ type Stats struct {
Error error
}
// New returns a new autobuild executor.
func New(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
// NewExecutor returns a new autobuild executor.
func NewExecutor(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
le := &Executor{
//nolint:gocritic // Autostart has a limited set of permissions.
ctx: dbauthz.AsAutostart(ctx),
@ -108,7 +110,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
workspaces, err := e.db.GetWorkspacesEligibleForAutoStartStop(e.ctx, t)
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, t)
if err != nil {
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
return stats
@ -125,77 +127,56 @@ func (e *Executor) runOnce(t time.Time) Stats {
log := e.log.With(slog.F("workspace_id", wsID))
eg.Go(func() error {
err := e.db.InTx(func(db database.Store) error {
err := e.db.InTx(func(tx database.Store) error {
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
return nil
}
// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
}
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, db, ws.TemplateID)
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, tx, ws.TemplateID)
if err != nil {
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
return nil
}
if !isEligibleForAutoStartStop(ws, priorHistory, templateSchedule) {
return nil
}
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
}
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
return nil
}
if currentTick.Before(nextTransition) {
log.Debug(e.ctx, "skipping workspace: too early",
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
return nil
}
builder := wsbuilder.New(ws, validTransition).
SetLastWorkspaceBuildInTx(&priorHistory).
SetLastWorkspaceBuildJobInTx(&priorJob)
builder := wsbuilder.New(ws, nextTransition).
SetLastWorkspaceBuildInTx(&latestBuild).
SetLastWorkspaceBuildJobInTx(&latestJob).
Reason(reason)
switch validTransition {
case database.WorkspaceTransitionStart:
builder = builder.Reason(database.BuildReasonAutostart)
case database.WorkspaceTransitionStop:
builder = builder.Reason(database.BuildReasonAutostop)
default:
log.Error(e.ctx, "unsupported transition", slog.F("transition", validTransition))
return nil
}
if _, _, err := builder.Build(e.ctx, db, nil); err != nil {
if _, _, err := builder.Build(e.ctx, tx, nil); err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", validTransition),
slog.F("transition", nextTransition),
slog.Error(err),
)
return nil
}
statsMu.Lock()
stats.Transitions[ws.ID] = validTransition
stats.Transitions[ws.ID] = nextTransition
statsMu.Unlock()
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", nextTransition))
return nil
@ -218,53 +199,81 @@ func (e *Executor) runOnce(t time.Time) Stats {
return stats
}
func isEligibleForAutoStartStop(ws database.Workspace, priorHistory database.WorkspaceBuild, templateSchedule schedule.TemplateScheduleOptions) bool {
if ws.Deleted {
return false
}
if templateSchedule.UserAutostartEnabled && ws.AutostartSchedule.Valid && ws.AutostartSchedule.String != "" {
return true
}
// Don't check the template schedule to see whether it allows autostop, this
// is done during the build when determining the deadline.
if priorHistory.Transition == database.WorkspaceTransitionStart && !priorHistory.Deadline.IsZero() {
return true
}
return false
}
func getNextTransition(
ws database.Workspace,
priorHistory database.WorkspaceBuild,
priorJob database.ProvisionerJob,
latestBuild database.WorkspaceBuild,
latestJob database.ProvisionerJob,
templateSchedule schedule.TemplateScheduleOptions,
currentTick time.Time,
) (
validTransition database.WorkspaceTransition,
nextTransition time.Time,
err error,
database.WorkspaceTransition,
database.BuildReason,
error,
) {
if !priorJob.CompletedAt.Valid || priorJob.Error.String != "" {
return "", time.Time{}, xerrors.Errorf("last workspace build did not complete successfully")
}
switch priorHistory.Transition {
case database.WorkspaceTransitionStart:
if priorHistory.Deadline.IsZero() {
return "", time.Time{}, xerrors.Errorf("latest workspace build has zero deadline")
}
// For stopping, do not truncate. This is inconsistent with autostart, but
// it ensures we will not stop too early.
return database.WorkspaceTransitionStop, priorHistory.Deadline, nil
case database.WorkspaceTransitionStop:
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
if err != nil {
return "", time.Time{}, xerrors.Errorf("workspace has invalid autostart schedule: %w", err)
}
// Round down to the nearest minute, as this is the finest granularity cron supports.
// Truncate is probably not necessary here, but doing it anyway to be sure.
nextTransition = sched.Next(priorHistory.CreatedAt).Truncate(time.Minute)
return database.WorkspaceTransitionStart, nextTransition, nil
switch {
case isEligibleForAutostop(latestBuild, latestJob, currentTick):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
case isEligibleForAutostart(ws, latestBuild, latestJob, templateSchedule, currentTick):
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
case isEligibleForFailedStop(latestBuild, latestJob, templateSchedule):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
default:
return "", time.Time{}, xerrors.Errorf("last transition not valid for autostart or autostop")
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
}
}
// isEligibleForAutostart reports whether the workspace is due to be started
// automatically at currentTick.
//
// All of the following must hold:
//   - the latest provisioner job completed and recorded no error;
//   - the latest build was a 'stop' transition;
//   - the template allows user autostart and the workspace carries a
//     non-empty autostart schedule that parses as a weekly schedule;
//   - currentTick is not before the time the schedule's Next yields for the
//     latest build's creation time, truncated to the minute.
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
	// Failed or unfinished builds are never autostarted.
	jobSucceeded := job.CompletedAt.Valid && job.Error.String == ""
	// Only a workspace whose last transition was 'stop' can be started.
	isStopped := build.Transition == database.WorkspaceTransitionStop
	// Autostart must be permitted by the template and configured on the
	// workspace.
	hasSchedule := templateSchedule.UserAutostartEnabled &&
		ws.AutostartSchedule.Valid &&
		ws.AutostartSchedule.String != ""
	if !jobSucceeded || !isStopped || !hasSchedule {
		return false
	}

	weekly, err := schedule.Weekly(ws.AutostartSchedule.String)
	if err != nil {
		// A schedule that cannot be parsed can never trigger an autostart.
		return false
	}

	// Cron has minute granularity, so round down to the minute. Truncating is
	// probably redundant here but is kept to be safe.
	startAt := weekly.Next(build.CreatedAt).Truncate(time.Minute)
	return !startAt.After(currentTick)
}
// isEligibleForAutostop reports whether the workspace should be stopped
// automatically at currentTick.
//
// The latest provisioner job must have completed without an error, the latest
// build must be a 'start' transition with a non-zero deadline, and
// currentTick must not be before that deadline.
func isEligibleForAutostop(build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
	switch {
	case !job.CompletedAt.Valid, job.Error.String != "":
		// Failed or unfinished builds are not autostopped here.
		return false
	case build.Transition != database.WorkspaceTransitionStart:
		// Only a started workspace can be auto-stopped.
		return false
	case build.Deadline.IsZero():
		// Without a deadline there is nothing to breach.
		return false
	default:
		// Never stop a workspace before it has reached its deadline.
		return !currentTick.Before(build.Deadline)
	}
}
// isEligibleForFailedStop reports whether the workspace should be stopped
// because its latest 'start' build failed and the template's failure TTL has
// since elapsed.
//
// Elapsed time is measured from the job's completion time to database.Now(),
// not to the executor tick.
func isEligibleForFailedStop(build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions) bool {
	// The template must specify a positive failure TTL.
	if templateSchedule.FailureTTL <= 0 {
		return false
	}
	// The latest job must have ended in failure.
	if db2sdk.ProvisionerJobStatus(job) != codersdk.ProvisionerJobFailed {
		return false
	}
	// Only failed 'start' builds are considered.
	if build.Transition != database.WorkspaceTransitionStart {
		return false
	}
	// A completion time is required to measure elapsed time.
	if !job.CompletedAt.Valid {
		return false
	}

	// Enough time must have passed since the job completed.
	elapsed := database.Now().Sub(job.CompletedAt.Time)
	return elapsed > templateSchedule.FailureTTL
}

View File

@ -1,4 +1,4 @@
package executor_test
package autobuild_test
import (
"context"
@ -11,7 +11,9 @@ import (
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"github.com/coder/coder/coderd/autobuild/executor"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/coderd/autobuild"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/schedule"
@ -19,6 +21,7 @@ import (
"github.com/coder/coder/codersdk"
"github.com/coder/coder/provisioner/echo"
"github.com/coder/coder/provisionersdk/proto"
"github.com/coder/coder/testutil"
)
func TestExecutorAutostartOK(t *testing.T) {
@ -27,7 +30,7 @@ func TestExecutorAutostartOK(t *testing.T) {
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -66,7 +69,7 @@ func TestExecutorAutostartTemplateUpdated(t *testing.T) {
ctx = context.Background()
err error
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -113,7 +116,7 @@ func TestExecutorAutostartAlreadyRunning(t *testing.T) {
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -145,7 +148,7 @@ func TestExecutorAutostartNotEnabled(t *testing.T) {
var (
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -180,7 +183,7 @@ func TestExecutorAutostopOK(t *testing.T) {
var (
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -216,7 +219,7 @@ func TestExecutorAutostopExtend(t *testing.T) {
var (
ctx = context.Background()
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -266,7 +269,7 @@ func TestExecutorAutostopAlreadyStopped(t *testing.T) {
var (
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -299,7 +302,7 @@ func TestExecutorAutostopNotEnabled(t *testing.T) {
var (
ctx = context.Background()
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -341,7 +344,7 @@ func TestExecutorWorkspaceDeleted(t *testing.T) {
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -374,7 +377,7 @@ func TestExecutorWorkspaceAutostartTooEarly(t *testing.T) {
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -405,7 +408,7 @@ func TestExecutorWorkspaceAutostopBeforeDeadline(t *testing.T) {
var (
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -433,7 +436,7 @@ func TestExecutorWorkspaceAutostopNoWaitChangedMyMind(t *testing.T) {
var (
ctx = context.Background()
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -501,8 +504,8 @@ func TestExecutorAutostartMultipleOK(t *testing.T) {
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
tickCh2 = make(chan time.Time)
statsCh1 = make(chan executor.Stats)
statsCh2 = make(chan executor.Stats)
statsCh1 = make(chan autobuild.Stats)
statsCh2 = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -556,7 +559,7 @@ func TestExecutorAutostartWithParameters(t *testing.T) {
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
@ -609,7 +612,7 @@ func TestExecutorAutostartTemplateDisabled(t *testing.T) {
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan executor.Stats)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
AutobuildTicker: tickCh,
@ -648,6 +651,60 @@ func TestExecutorAutostartTemplateDisabled(t *testing.T) {
assert.Len(t, stats.Transitions, 0)
}
// TestExecutorFailedWorkspace exercises the executor against a workspace
// whose start build failed and whose template failure TTL has elapsed.
// With the AGPL TemplateScheduleStore configured below, no transition is
// expected to be reported.
// For enterprise functionality see enterprise/coderd/workspaces_test.go
func TestExecutorFailedWorkspace(t *testing.T) {
t.Parallel()
// Test that an AGPL TemplateScheduleStore properly disables
// functionality.
t.Run("OK", func(t *testing.T) {
t.Parallel()
var (
ticker = make(chan time.Time)
statCh = make(chan autobuild.Stats)
logger = slogtest.Make(t, &slogtest.Options{
// We ignore errors here since we expect to fail
// builds.
IgnoreErrors: true,
})
failureTTL = time.Millisecond
client = coderdtest.New(t, &coderdtest.Options{
Logger: &logger,
AutobuildTicker: ticker,
IncludeProvisionerDaemon: true,
AutobuildStats: statCh,
TemplateScheduleStore: schedule.NewAGPLTemplateScheduleStore(),
})
)
user := coderdtest.CreateFirstUser(t, client)
// The echo provisioner is told to fail on apply, so the workspace
// build below ends in a failed state.
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.ProvisionComplete,
ProvisionApply: echo.ProvisionFailed,
})
// Request a failure TTL on the template.
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
ctr.FailureTTLMillis = ptr.Ref[int64](failureTTL.Milliseconds())
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
build := coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
require.Equal(t, codersdk.WorkspaceStatusFailed, build.Status)
// Wait until more than failureTTL has elapsed since the job completed
// before sending a tick.
require.Eventually(t,
func() bool {
return database.Now().Sub(*build.Job.CompletedAt) > failureTTL
},
testutil.IntervalMedium, testutil.IntervalFast)
// Send one tick and read the stats reported on the stats channel.
ticker <- time.Now()
stats := <-statCh
// Expect no transitions since we're using AGPL.
require.Len(t, stats.Transitions, 0)
})
}
func mustProvisionWorkspace(t *testing.T, client *codersdk.Client, mut ...func(*codersdk.CreateWorkspaceRequest)) codersdk.Workspace {
t.Helper()
user := coderdtest.CreateFirstUser(t, client)

View File

@ -54,7 +54,7 @@ import (
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/coderd"
"github.com/coder/coder/coderd/audit"
"github.com/coder/coder/coderd/autobuild/executor"
"github.com/coder/coder/coderd/autobuild"
"github.com/coder/coder/coderd/awsidentity"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbauthz"
@ -102,7 +102,7 @@ type Options struct {
GoogleTokenValidator *idtoken.Validator
SSHKeygenAlgorithm gitsshkey.Algorithm
AutobuildTicker <-chan time.Time
AutobuildStats chan<- executor.Stats
AutobuildStats chan<- autobuild.Stats
Auditor audit.Auditor
TLSCertificates []tls.Certificate
GitAuthConfigs []*gitauth.Config
@ -136,6 +136,9 @@ type Options struct {
ConfigSSH codersdk.SSHConfigResponse
SwaggerEndpoint bool
// Logger should only be overridden if you expect errors
// as part of your test.
Logger *slog.Logger
}
// New constructs a codersdk client connected to an in-memory API instance.
@ -244,7 +247,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
templateScheduleStore.Store(&options.TemplateScheduleStore)
ctx, cancelFunc := context.WithCancel(context.Background())
lifecycleExecutor := executor.New(
lifecycleExecutor := autobuild.NewExecutor(
ctx,
options.Database,
&templateScheduleStore,
@ -311,6 +314,10 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
require.NoError(t, err)
}
if options.Logger == nil {
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
options.Logger = &logger
}
region := &tailcfg.DERPRegion{
EmbeddedRelay: true,
RegionID: int(options.DeploymentValues.DERP.Server.RegionID.Value()),
@ -346,7 +353,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
AccessURL: accessURL,
AppHostname: options.AppHostname,
AppHostnameRegex: appHostnameRegex,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
Logger: *options.Logger,
CacheDir: t.TempDir(),
Database: options.Database,
Pubsub: options.Pubsub,
@ -433,7 +440,7 @@ func NewProvisionerDaemon(t testing.TB, coderAPI *coderd.API) io.Closer {
return coderAPI.CreateInMemoryProvisionerDaemon(ctx, 0)
}, &provisionerd.Options{
Filesystem: fs,
Logger: slogtest.Make(t, nil).Named("provisionerd").Leveled(slog.LevelDebug),
Logger: coderAPI.Logger.Named("provisionerd").Leveled(slog.LevelDebug),
JobPollInterval: 50 * time.Millisecond,
UpdateInterval: 250 * time.Millisecond,
ForceCancelInterval: time.Second,

View File

@ -1717,8 +1717,8 @@ func (q *querier) GetWorkspaces(ctx context.Context, arg database.GetWorkspacesP
return q.db.GetAuthorizedWorkspaces(ctx, arg, prep)
}
func (q *querier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
return q.db.GetWorkspacesEligibleForAutoStartStop(ctx, now)
func (q *querier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
return q.db.GetWorkspacesEligibleForTransition(ctx, now)
}
func (q *querier) InsertAPIKey(ctx context.Context, arg database.InsertAPIKeyParams) (database.APIKey, error) {

View File

@ -20,9 +20,11 @@ import (
"golang.org/x/xerrors"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/db2sdk"
"github.com/coder/coder/coderd/httpapi"
"github.com/coder/coder/coderd/rbac"
"github.com/coder/coder/coderd/util/slice"
"github.com/coder/coder/codersdk"
)
var validProxyByHostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9._-]+$`)
@ -3432,7 +3434,7 @@ func (q *fakeQuerier) GetWorkspaces(ctx context.Context, arg database.GetWorkspa
return workspaceRows, err
}
func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
func (q *fakeQuerier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
@ -3452,6 +3454,15 @@ func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context,
workspaces = append(workspaces, workspace)
continue
}
job, err := q.getProvisionerJobByIDNoLock(ctx, build.JobID)
if err != nil {
return nil, xerrors.Errorf("get provisioner job by ID: %w", err)
}
if db2sdk.ProvisionerJobStatus(job) == codersdk.ProvisionerJobFailed {
workspaces = append(workspaces, workspace)
continue
}
}
return workspaces, nil

View File

@ -992,9 +992,9 @@ func (m metricsStore) GetWorkspaces(ctx context.Context, arg database.GetWorkspa
return workspaces, err
}
func (m metricsStore) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
func (m metricsStore) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
start := time.Now()
workspaces, err := m.s.GetWorkspacesEligibleForAutoStartStop(ctx, now)
workspaces, err := m.s.GetWorkspacesEligibleForTransition(ctx, now)
m.queryLatencies.WithLabelValues("GetWorkspacesEligibleForAutoStartStop").Observe(time.Since(start).Seconds())
return workspaces, err
}

View File

@ -2007,19 +2007,19 @@ func (mr *MockStoreMockRecorder) GetWorkspaces(arg0, arg1 interface{}) *gomock.C
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaces", reflect.TypeOf((*MockStore)(nil).GetWorkspaces), arg0, arg1)
}
// GetWorkspacesEligibleForAutoStartStop mocks base method.
func (m *MockStore) GetWorkspacesEligibleForAutoStartStop(arg0 context.Context, arg1 time.Time) ([]database.Workspace, error) {
// GetWorkspacesEligibleForTransition mocks base method.
func (m *MockStore) GetWorkspacesEligibleForTransition(arg0 context.Context, arg1 time.Time) ([]database.Workspace, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "GetWorkspacesEligibleForAutoStartStop", arg0, arg1)
ret := m.ctrl.Call(m, "GetWorkspacesEligibleForTransition", arg0, arg1)
ret0, _ := ret[0].([]database.Workspace)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// GetWorkspacesEligibleForAutoStartStop indicates an expected call of GetWorkspacesEligibleForAutoStartStop.
func (mr *MockStoreMockRecorder) GetWorkspacesEligibleForAutoStartStop(arg0, arg1 interface{}) *gomock.Call {
// GetWorkspacesEligibleForTransition indicates an expected call of GetWorkspacesEligibleForTransition.
func (mr *MockStoreMockRecorder) GetWorkspacesEligibleForTransition(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspacesEligibleForAutoStartStop", reflect.TypeOf((*MockStore)(nil).GetWorkspacesEligibleForAutoStartStop), arg0, arg1)
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspacesEligibleForTransition", reflect.TypeOf((*MockStore)(nil).GetWorkspacesEligibleForTransition), arg0, arg1)
}
// InTx mocks base method.

View File

@ -171,7 +171,7 @@ type sqlcQuerier interface {
GetWorkspaceResourcesByJobIDs(ctx context.Context, ids []uuid.UUID) ([]WorkspaceResource, error)
GetWorkspaceResourcesCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceResource, error)
GetWorkspaces(ctx context.Context, arg GetWorkspacesParams) ([]GetWorkspacesRow, error)
GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]Workspace, error)
GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]Workspace, error)
InsertAPIKey(ctx context.Context, arg InsertAPIKeyParams) (APIKey, error)
// We use the organization_id as the id
// for simplicity since all users is

View File

@ -8505,13 +8505,15 @@ func (q *sqlQuerier) GetWorkspaces(ctx context.Context, arg GetWorkspacesParams)
return items, nil
}
const getWorkspacesEligibleForAutoStartStop = `-- name: GetWorkspacesEligibleForAutoStartStop :many
const getWorkspacesEligibleForTransition = `-- name: GetWorkspacesEligibleForTransition :many
SELECT
workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at
FROM
workspaces
LEFT JOIN
workspace_builds ON workspace_builds.workspace_id = workspaces.id
INNER JOIN
provisioner_jobs ON workspace_builds.job_id = provisioner_jobs.id
WHERE
workspace_builds.build_number = (
SELECT
@ -8541,12 +8543,20 @@ WHERE
(
workspace_builds.transition = 'stop'::workspace_transition AND
workspaces.autostart_schedule IS NOT NULL
) OR
-- If the workspace's most recent job resulted in an error
-- it may be eligible for failed stop.
(
provisioner_jobs.error IS NOT NULL AND
provisioner_jobs.error != '' AND
workspace_builds.transition = 'start'::workspace_transition
)
)
) AND workspaces.deleted = 'false'
`
func (q *sqlQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]Workspace, error) {
rows, err := q.db.QueryContext(ctx, getWorkspacesEligibleForAutoStartStop, now)
func (q *sqlQuerier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]Workspace, error) {
rows, err := q.db.QueryContext(ctx, getWorkspacesEligibleForTransition, now)
if err != nil {
return nil, err
}

View File

@ -405,13 +405,15 @@ SELECT
stopped_workspaces.count AS stopped_workspaces
FROM pending_workspaces, building_workspaces, running_workspaces, failed_workspaces, stopped_workspaces;
-- name: GetWorkspacesEligibleForAutoStartStop :many
-- name: GetWorkspacesEligibleForTransition :many
SELECT
workspaces.*
FROM
workspaces
LEFT JOIN
workspace_builds ON workspace_builds.workspace_id = workspaces.id
INNER JOIN
provisioner_jobs ON workspace_builds.job_id = provisioner_jobs.id
WHERE
workspace_builds.build_number = (
SELECT
@ -441,5 +443,13 @@ WHERE
(
workspace_builds.transition = 'stop'::workspace_transition AND
workspaces.autostart_schedule IS NOT NULL
) OR
-- If the workspace's most recent job resulted in an error
-- it may be eligible for failed stop.
(
provisioner_jobs.error IS NOT NULL AND
provisioner_jobs.error != '' AND
workspace_builds.transition = 'start'::workspace_transition
)
);
) AND workspaces.deleted = 'false';

View File

@ -617,11 +617,12 @@ func (b *Builder) authorize(authFunc func(action rbac.Action, object rbac.Object
case database.WorkspaceTransitionStart, database.WorkspaceTransitionStop:
action = rbac.ActionUpdate
default:
return BuildError{http.StatusBadRequest, fmt.Sprintf("Transition %q not supported.", b.trans), xerrors.New("")}
msg := fmt.Sprintf("Transition %q not supported.", b.trans)
return BuildError{http.StatusBadRequest, msg, xerrors.New(msg)}
}
if !authFunc(action, b.workspace) {
// We use the same wording as the httpapi to avoid leaking the existence of the workspace
return BuildError{http.StatusNotFound, httpapi.ResourceNotFoundResponse.Message, xerrors.New("")}
return BuildError{http.StatusNotFound, httpapi.ResourceNotFoundResponse.Message, xerrors.New(httpapi.ResourceNotFoundResponse.Message)}
}
template, err := b.getTemplate()
@ -633,7 +634,7 @@ func (b *Builder) authorize(authFunc func(action rbac.Action, object rbac.Object
// cloud state.
if b.state.explicit != nil || b.state.orphan {
if !authFunc(rbac.ActionUpdate, template.RBACObject()) {
return BuildError{http.StatusForbidden, "Only template managers may provide custom state", xerrors.New("")}
return BuildError{http.StatusForbidden, "Only template managers may provide custom state", xerrors.New("Only template managers may provide custom state")}
}
}
@ -641,7 +642,7 @@ func (b *Builder) authorize(authFunc func(action rbac.Action, object rbac.Object
return BuildError{
http.StatusBadRequest,
"Workspace builds with a custom log level are restricted to template authors only.",
xerrors.New(""),
xerrors.New("Workspace builds with a custom log level are restricted to template authors only."),
}
}
return nil
@ -686,22 +687,26 @@ func (b *Builder) checkTemplateJobStatus() error {
templateVersionJobStatus := db2sdk.ProvisionerJobStatus(*templateVersionJob)
switch templateVersionJobStatus {
case codersdk.ProvisionerJobPending, codersdk.ProvisionerJobRunning:
msg := fmt.Sprintf("The provided template version is %s. Wait for it to complete importing!", templateVersionJobStatus)
return BuildError{
http.StatusNotAcceptable,
fmt.Sprintf("The provided template version is %s. Wait for it to complete importing!", templateVersionJobStatus),
xerrors.New(""),
msg,
xerrors.New(msg),
}
case codersdk.ProvisionerJobFailed:
msg := fmt.Sprintf("The provided template version %q has failed to import: %q. You cannot build workspaces with it!", templateVersion.Name, templateVersionJob.Error.String)
return BuildError{
http.StatusBadRequest,
fmt.Sprintf("The provided template version %q has failed to import: %q. You cannot build workspaces with it!", templateVersion.Name, templateVersionJob.Error.String),
xerrors.New(""),
msg,
xerrors.New(msg),
}
case codersdk.ProvisionerJobCanceled:
msg := fmt.Sprintf("The provided template version %q has failed to import: %q. You cannot build workspaces with it!", templateVersion.Name, templateVersionJob.Error.String)
return BuildError{
http.StatusBadRequest,
"The provided template version was canceled during import. You cannot build workspaces with it!",
xerrors.New(""),
msg,
xerrors.New(msg),
}
}
return nil
@ -717,10 +722,11 @@ func (b *Builder) checkRunningBuild() error {
return BuildError{http.StatusInternalServerError, "failed to fetch prior build", err}
}
if db2sdk.ProvisionerJobStatus(*job).Active() {
msg := "A workspace build is already active."
return BuildError{
http.StatusConflict,
"A workspace build is already active.",
xerrors.New(""),
msg,
xerrors.New(msg),
}
}
return nil

View File

@ -402,7 +402,7 @@ func (api *API) updateEntitlements(ctx context.Context) error {
if changed, enabled := featureChanged(codersdk.FeatureAdvancedTemplateScheduling); changed {
if enabled {
store := &enterpriseTemplateScheduleStore{}
store := &EnterpriseTemplateScheduleStore{}
ptr := schedule.TemplateScheduleStore(store)
api.AGPL.TemplateScheduleStore.Store(&ptr)
} else {

View File

@ -310,11 +310,11 @@ func websocketNetConn(ctx context.Context, conn *websocket.Conn, msgType websock
}
}
type enterpriseTemplateScheduleStore struct{}
// EnterpriseTemplateScheduleStore is a schedule.TemplateScheduleStore
// implementation. The struct itself has no fields; its methods operate on the
// database.Store they are handed.
type EnterpriseTemplateScheduleStore struct{}
var _ schedule.TemplateScheduleStore = &enterpriseTemplateScheduleStore{}
var _ schedule.TemplateScheduleStore = &EnterpriseTemplateScheduleStore{}
func (*enterpriseTemplateScheduleStore) GetTemplateScheduleOptions(ctx context.Context, db database.Store, templateID uuid.UUID) (schedule.TemplateScheduleOptions, error) {
func (*EnterpriseTemplateScheduleStore) GetTemplateScheduleOptions(ctx context.Context, db database.Store, templateID uuid.UUID) (schedule.TemplateScheduleOptions, error) {
tpl, err := db.GetTemplateByID(ctx, templateID)
if err != nil {
return schedule.TemplateScheduleOptions{}, err
@ -331,7 +331,7 @@ func (*enterpriseTemplateScheduleStore) GetTemplateScheduleOptions(ctx context.C
}, nil
}
func (*enterpriseTemplateScheduleStore) SetTemplateScheduleOptions(ctx context.Context, db database.Store, tpl database.Template, opts schedule.TemplateScheduleOptions) (database.Template, error) {
func (*EnterpriseTemplateScheduleStore) SetTemplateScheduleOptions(ctx context.Context, db database.Store, tpl database.Template, opts schedule.TemplateScheduleOptions) (database.Template, error) {
if int64(opts.DefaultTTL) == tpl.DefaultTTL &&
int64(opts.MaxTTL) == tpl.MaxTTL &&
int64(opts.FailureTTL) == tpl.FailureTTL &&

View File

@ -10,12 +10,17 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/coderd/autobuild"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/util/ptr"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/enterprise/coderd"
"github.com/coder/coder/enterprise/coderd/coderdenttest"
"github.com/coder/coder/enterprise/coderd/license"
"github.com/coder/coder/provisioner/echo"
"github.com/coder/coder/testutil"
)
@ -74,6 +79,157 @@ func TestCreateWorkspace(t *testing.T) {
})
}
// TestWorkspaceAutobuild covers how the autobuild executor reacts to a
// template's failure_ttl. Every subtest builds a workspace whose apply step
// fails (echo.ProvisionFailed), pushes a single tick into the executor's
// ticker channel, and then inspects the autobuild.Stats it reports.
func TestWorkspaceAutobuild(t *testing.T) {
	t.Parallel()

	// A failed workspace whose job completed longer ago than failure_ttl
	// is expected to be transitioned to stopped.
	t.Run("FailureTTLOK", func(t *testing.T) {
		t.Parallel()

		var (
			ticker = make(chan time.Time)
			statCh = make(chan autobuild.Stats)
			logger = slogtest.Make(t, &slogtest.Options{
				// We ignore errors here since we expect to fail
				// builds.
				IgnoreErrors: true,
			})
			failureTTL = time.Millisecond

			client = coderdenttest.New(t, &coderdenttest.Options{
				Options: &coderdtest.Options{
					Logger:                   &logger,
					AutobuildTicker:          ticker,
					IncludeProvisionerDaemon: true,
					AutobuildStats:           statCh,
					TemplateScheduleStore:    &coderd.EnterpriseTemplateScheduleStore{},
				},
			})
		)
		user := coderdtest.CreateFirstUser(t, client)
		_ = coderdenttest.AddLicense(t, client, coderdenttest.LicenseOptions{
			Features: license.Features{
				codersdk.FeatureAdvancedTemplateScheduling: 1,
			},
		})

		version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
			Parse:          echo.ParseComplete,
			ProvisionPlan:  echo.ProvisionComplete,
			ProvisionApply: echo.ProvisionFailed,
		})
		template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
			ctr.FailureTTLMillis = ptr.Ref(failureTTL.Milliseconds())
		})
		coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
		ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
		build := coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
		require.Equal(t, codersdk.WorkspaceStatusFailed, build.Status)

		// Only tick once the failed job has been complete for longer
		// than failureTTL, otherwise the executor has nothing to do yet.
		require.Eventually(t,
			func() bool {
				return database.Now().Sub(*build.Job.CompletedAt) > failureTTL
			},
			testutil.IntervalMedium, testutil.IntervalFast)

		ticker <- time.Now()
		stats := <-statCh
		// Expect workspace to transition to stopped state for breaching
		// failure TTL.
		require.Len(t, stats.Transitions, 1)
		// testify expects (expected, actual); keeping that order makes
		// the failure output label the values correctly.
		require.Equal(t, database.WorkspaceTransitionStop, stats.Transitions[ws.ID])
	})

	// A failed workspace that has not yet exceeded failure_ttl must be
	// left alone.
	t.Run("FailureTTLTooEarly", func(t *testing.T) {
		t.Parallel()

		var (
			ticker = make(chan time.Time)
			statCh = make(chan autobuild.Stats)
			logger = slogtest.Make(t, &slogtest.Options{
				// We ignore errors here since we expect to fail
				// builds.
				IgnoreErrors: true,
			})
			failureTTL = time.Minute

			client = coderdenttest.New(t, &coderdenttest.Options{
				Options: &coderdtest.Options{
					Logger:                   &logger,
					AutobuildTicker:          ticker,
					IncludeProvisionerDaemon: true,
					AutobuildStats:           statCh,
					TemplateScheduleStore:    &coderd.EnterpriseTemplateScheduleStore{},
				},
			})
		)
		user := coderdtest.CreateFirstUser(t, client)
		_ = coderdenttest.AddLicense(t, client, coderdenttest.LicenseOptions{
			Features: license.Features{
				codersdk.FeatureAdvancedTemplateScheduling: 1,
			},
		})

		version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
			Parse:          echo.ParseComplete,
			ProvisionPlan:  echo.ProvisionComplete,
			ProvisionApply: echo.ProvisionFailed,
		})
		template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
			ctr.FailureTTLMillis = ptr.Ref(failureTTL.Milliseconds())
		})
		coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
		ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
		build := coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
		require.Equal(t, codersdk.WorkspaceStatusFailed, build.Status)

		ticker <- time.Now()
		stats := <-statCh
		// Expect no transitions since not enough time has elapsed.
		require.Empty(t, stats.Transitions)
	})

	// A template created without a failure_ttl must never cause a failed
	// workspace to be stopped.
	t.Run("FailureTTLUnset", func(t *testing.T) {
		t.Parallel()

		var (
			ticker = make(chan time.Time)
			statCh = make(chan autobuild.Stats)
			logger = slogtest.Make(t, &slogtest.Options{
				// We ignore errors here since we expect to fail
				// builds.
				IgnoreErrors: true,
			})

			client = coderdenttest.New(t, &coderdenttest.Options{
				Options: &coderdtest.Options{
					Logger:                   &logger,
					AutobuildTicker:          ticker,
					IncludeProvisionerDaemon: true,
					AutobuildStats:           statCh,
					TemplateScheduleStore:    &coderd.EnterpriseTemplateScheduleStore{},
				},
			})
		)
		user := coderdtest.CreateFirstUser(t, client)
		_ = coderdenttest.AddLicense(t, client, coderdenttest.LicenseOptions{
			Features: license.Features{
				codersdk.FeatureAdvancedTemplateScheduling: 1,
			},
		})

		version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
			Parse:          echo.ParseComplete,
			ProvisionPlan:  echo.ProvisionComplete,
			ProvisionApply: echo.ProvisionFailed,
		})
		// Create a template without setting a failure_ttl.
		template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
		coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
		ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
		build := coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
		require.Equal(t, codersdk.WorkspaceStatusFailed, build.Status)

		ticker <- time.Now()
		stats := <-statCh
		// Expect no transitions since the field is unset on the template.
		require.Empty(t, stats.Transitions)
	})
}
func TestWorkspacesFiltering(t *testing.T) {
t.Parallel()

View File

@ -18,21 +18,6 @@ import (
"github.com/coder/coder/provisionersdk/proto"
)
const (
	// ParameterExecKey is not referenced in the visible part of this file.
	// NOTE(review): presumably the parameter name under which an exec
	// value is supplied to the echo provisioner — confirm against callers.
	ParameterExecKey = "echo.exec"

	// errorKey is the key ParameterError passes to formatExecValue.
	errorKey = "error"
	// successKey is the key ParameterSucceed passes to formatExecValue.
	successKey = "success"
)
// ParameterError builds the exec value that pairs errorKey with the
// supplied message s.
func ParameterError(s string) string {
	execValue := formatExecValue(errorKey, s)
	return execValue
}
// ParameterSucceed builds the exec value that pairs successKey with an
// empty payload.
func ParameterSucceed() string {
	const noPayload = ""
	return formatExecValue(successKey, noPayload)
}
// ProvisionApplyWithAgent returns provision responses that will mock a fake
// "aws_instance" resource with an agent that has the given auth token.
func ProvisionApplyWithAgent(authToken string) []*proto.Provision_Response {
@ -55,10 +40,6 @@ func ProvisionApplyWithAgent(authToken string) []*proto.Provision_Response {
}}
}
// formatExecValue renders key and value as a single "key=value" string.
func formatExecValue(key, value string) string {
	// All operands are strings, so fmt.Sprint inserts no separating spaces.
	pair := fmt.Sprint(key, "=", value)
	return pair
}
var (
// ParseComplete is a helper to indicate an empty parse completion.
ParseComplete = []*proto.Parse_Response{{
@ -72,6 +53,16 @@ var (
Complete: &proto.Provision_Complete{},
},
}}
// ProvisionFailed is a helper to convey a failed provision
// operation.
ProvisionFailed = []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Error: "failed!",
},
},
}}
)
// Serve starts the echo provisioner.