fix: separate signals for passive, active, and forced shutdown (#12358)

* fix: separate signals for passive, active, and forced shutdown

`SIGTERM`: Passive shutdown stopping provisioner daemons from accepting new
jobs but waiting for existing jobs to successfully complete.

`SIGINT` (old existing behavior): Notify provisioner daemons to cancel in-flight jobs, wait 5s for jobs to be exited, then force quit.

`SIGKILL`: Untouched from before, will force-quit.

* Revert dramatic signal changes

* Rename

* Fix shutdown behavior for provisioner daemons

* Add test for graceful shutdown
This commit is contained in:
Kyle Carberry 2024-03-15 14:16:36 +01:00 committed by GitHub
parent 2c947c1921
commit 895df54051
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 136 additions and 35 deletions

View File

@ -125,7 +125,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
args := append(os.Args, "--no-reap")
err := reaper.ForkReap(
reaper.WithExecArgs(args...),
reaper.WithCatchSignals(InterruptSignals...),
reaper.WithCatchSignals(StopSignals...),
)
if err != nil {
logger.Error(ctx, "agent process reaper unable to fork", slog.Error(err))
@ -144,7 +144,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
// Note that we don't want to handle these signals in the
// process that runs as PID 1, that's why we do this after
// the reaper forked.
ctx, stopNotify := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stopNotify := inv.SignalNotifyContext(ctx, StopSignals...)
defer stopNotify()
// DumpHandler does signal handling, so we call it after the

View File

@ -890,7 +890,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
Handler: func(inv *clibase.Invocation) (err error) {
ctx := inv.Context()
notifyCtx, stop := signal.NotifyContext(ctx, InterruptSignals...) // Checked later.
notifyCtx, stop := signal.NotifyContext(ctx, StopSignals...) // Checked later.
defer stop()
ctx = notifyCtx

View File

@ -65,7 +65,7 @@ fi
Handler: func(inv *clibase.Invocation) error {
ctx := inv.Context()
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
defer stop()
client, err := r.createAgentClient()

View File

@ -25,7 +25,7 @@ func (r *RootCmd) gitAskpass() *clibase.Cmd {
Handler: func(inv *clibase.Invocation) error {
ctx := inv.Context()
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
defer stop()
user, host, err := gitauth.ParseAskpass(inv.Args[0])

View File

@ -29,7 +29,7 @@ func (r *RootCmd) gitssh() *clibase.Cmd {
// Catch interrupt signals to ensure the temporary private
// key file is cleaned up on most cases.
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
defer stop()
// Early check so errors are reported immediately.

View File

@ -337,7 +337,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
// Register signals early on so that graceful shutdown can't
// be interrupted by additional signals. Note that we avoid
// shadowing cancel() (from above) here because notifyStop()
// shadowing cancel() (from above) here because stopCancel()
// restores default behavior for the signals. This protects
// the shutdown sequence from abruptly terminating things
// like: database migrations, provisioner work, workspace
@ -345,8 +345,10 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
//
// To get out of a graceful shutdown, the user can send
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
defer notifyStop()
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignalsNoInterrupt...)
defer stopCancel()
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
defer interruptCancel()
cacheDir := vals.CacheDir.String()
err = os.MkdirAll(cacheDir, 0o700)
@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
hangDetector.Start()
defer hangDetector.Close()
waitForProvisionerJobs := false
// Currently there is no way to ask the server to shut
// itself down, so any exit signal will result in a non-zero
// exit of the server.
var exitErr error
select {
case <-notifyCtx.Done():
exitErr = notifyCtx.Err()
case <-stopCtx.Done():
exitErr = stopCtx.Err()
waitForProvisionerJobs = true
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
case <-interruptCtx.Done():
exitErr = interruptCtx.Err()
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
case <-tunnelDone:
exitErr = xerrors.New("dev tunnel closed unexpectedly")
@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
defer wg.Done()
r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
timeout := 5 * time.Second
if waitForProvisionerJobs {
// It can last for a long time...
timeout = 30 * time.Minute
}
err := shutdownWithTimeout(func(ctx context.Context) error {
// We only want to cancel active jobs if we aren't exiting gracefully.
return provisionerDaemon.Shutdown(ctx, !waitForProvisionerJobs)
}, timeout)
if err != nil {
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
return
@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {
return v, nil
}
func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
// On Windows, some of our signal functions lack support.
// If we pass in no signals, we should just return the context as-is.
if len(sig) == 0 {
return context.WithCancel(ctx)
}
return inv.SignalNotifyContext(ctx, sig...)
}

View File

@ -47,7 +47,7 @@ func (r *RootCmd) newCreateAdminUserCommand() *clibase.Cmd {
logger = logger.Leveled(slog.LevelDebug)
}
ctx, cancel := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, cancel := inv.SignalNotifyContext(ctx, StopSignals...)
defer cancel()
if newUserDBURL == "" {

View File

@ -21,6 +21,7 @@ import (
"net/url"
"os"
"path/filepath"
"reflect"
"runtime"
"strconv"
"strings"
@ -1605,7 +1606,7 @@ func TestServer_Production(t *testing.T) {
}
//nolint:tparallel,paralleltest // This test cannot be run in parallel due to signal handling.
func TestServer_Shutdown(t *testing.T) {
func TestServer_InterruptShutdown(t *testing.T) {
t.Skip("This test issues an interrupt signal which will propagate to the test runner.")
if runtime.GOOS == "windows" {
@ -1638,6 +1639,46 @@ func TestServer_Shutdown(t *testing.T) {
require.NoError(t, err)
}
func TestServer_GracefulShutdown(t *testing.T) {
t.Parallel()
if runtime.GOOS == "windows" {
// Sending interrupt signal isn't supported on Windows!
t.SkipNow()
}
ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()
root, cfg := clitest.New(t,
"server",
"--in-memory",
"--http-address", ":0",
"--access-url", "http://example.com",
"--provisioner-daemons", "1",
"--cache-dir", t.TempDir(),
)
var stopFunc context.CancelFunc
root = root.WithTestSignalNotifyContext(t, func(parent context.Context, signals ...os.Signal) (context.Context, context.CancelFunc) {
if !reflect.DeepEqual(cli.StopSignalsNoInterrupt, signals) {
return context.WithCancel(ctx)
}
var ctx context.Context
ctx, stopFunc = context.WithCancel(parent)
return ctx, stopFunc
})
serverErr := make(chan error, 1)
pty := ptytest.New(t).Attach(root)
go func() {
serverErr <- root.WithContext(ctx).Run()
}()
_ = waitAccessURL(t, cfg)
// It's fair to assume `stopFunc` isn't nil here, because the server
// has started and access URL is propagated.
stopFunc()
pty.ExpectMatch("waiting for provisioner jobs to complete")
err := <-serverErr
require.NoError(t, err)
}
func BenchmarkServerHelp(b *testing.B) {
// server --help is a good proxy for measuring the
// constant overhead of each command.

View File

@ -7,8 +7,23 @@ import (
"syscall"
)
var InterruptSignals = []os.Signal{
// StopSignals is the list of signals that are used for handling
// shutdown behavior.
var StopSignals = []os.Signal{
os.Interrupt,
syscall.SIGTERM,
syscall.SIGHUP,
}
// StopSignals is the list of signals that are used for handling
// graceful shutdown behavior.
var StopSignalsNoInterrupt = []os.Signal{
syscall.SIGTERM,
syscall.SIGHUP,
}
// InterruptSignals is the list of signals that are used for handling
// immediate shutdown behavior.
var InterruptSignals = []os.Signal{
os.Interrupt,
}

View File

@ -6,4 +6,12 @@ import (
"os"
)
var InterruptSignals = []os.Signal{os.Interrupt}
var StopSignals = []os.Signal{
os.Interrupt,
}
var StopSignalsNoInterrupt = []os.Signal{}
var InterruptSignals = []os.Signal{
os.Interrupt,
}

View File

@ -73,7 +73,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
// session can persist for up to 72 hours, since we set a long
// timeout on the Agent side of the connection. In particular,
// OpenSSH sends SIGHUP to terminate a proxy command.
ctx, stop := inv.SignalNotifyContext(inv.Context(), InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(inv.Context(), StopSignals...)
defer stop()
ctx, cancel := context.WithCancel(ctx)
defer cancel()

View File

@ -328,7 +328,7 @@ func TestTemplatePull_ToDir(t *testing.T) {
require.NoError(t, inv.Run())
// Validate behaviour of choosing template name in the absence of an output path argument.
// Validate behavior of choosing template name in the absence of an output path argument.
destPath := actualDest
if destPath == "" {
destPath = template.Name

View File

@ -498,7 +498,7 @@ func (c *provisionerdCloser) Close() error {
c.closed = true
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
shutdownErr := c.d.Shutdown(ctx)
shutdownErr := c.d.Shutdown(ctx, true)
closeErr := c.d.Close()
if shutdownErr != nil {
return shutdownErr

View File

@ -88,8 +88,10 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
ctx, cancel := context.WithCancel(inv.Context())
defer cancel()
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, agpl.InterruptSignals...)
defer notifyStop()
stopCtx, stopCancel := inv.SignalNotifyContext(ctx, agpl.StopSignalsNoInterrupt...)
defer stopCancel()
interruptCtx, interruptCancel := inv.SignalNotifyContext(ctx, agpl.InterruptSignals...)
defer interruptCancel()
tags, err := agpl.ParseProvisionerTags(rawTags)
if err != nil {
@ -212,10 +214,17 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
Metrics: metrics,
})
waitForProvisionerJobs := false
var exitErr error
select {
case <-notifyCtx.Done():
exitErr = notifyCtx.Err()
case <-stopCtx.Done():
exitErr = stopCtx.Err()
_, _ = fmt.Fprintln(inv.Stdout, cliui.Bold(
"Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit",
))
waitForProvisionerJobs = true
case <-interruptCtx.Done():
exitErr = interruptCtx.Err()
_, _ = fmt.Fprintln(inv.Stdout, cliui.Bold(
"Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit",
))
@ -225,7 +234,7 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
cliui.Errorf(inv.Stderr, "Unexpected error, shutting down server: %s\n", exitErr)
}
err = srv.Shutdown(ctx)
err = srv.Shutdown(ctx, waitForProvisionerJobs)
if err != nil {
return xerrors.Errorf("shutdown: %w", err)
}

View File

@ -142,7 +142,7 @@ func (r *RootCmd) proxyServer() *clibase.Cmd {
//
// To get out of a graceful shutdown, the user can send
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, cli.InterruptSignals...)
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, cli.StopSignals...)
defer notifyStop()
// Clean up idle connections at the end, e.g.

View File

@ -441,7 +441,7 @@ func TestProvisionerDaemonServe(t *testing.T) {
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
err = pd.Shutdown(ctx)
err = pd.Shutdown(ctx, false)
require.NoError(t, err)
err = terraformServer.Close()
require.NoError(t, err)

View File

@ -474,15 +474,18 @@ func (p *Server) isClosed() bool {
}
}
// Shutdown triggers a graceful exit of each registered provisioner.
func (p *Server) Shutdown(ctx context.Context) error {
// Shutdown gracefully exists with the option to cancel the active job.
// If false, it will wait for the job to complete.
//
//nolint:revive
func (p *Server) Shutdown(ctx context.Context, cancelActiveJob bool) error {
p.mutex.Lock()
p.opts.Logger.Info(ctx, "attempting graceful shutdown")
if !p.shuttingDownB {
close(p.shuttingDownCh)
p.shuttingDownB = true
}
if p.activeJob != nil {
if cancelActiveJob && p.activeJob != nil {
p.activeJob.Cancel()
}
p.mutex.Unlock()

View File

@ -671,7 +671,7 @@ func TestProvisionerd(t *testing.T) {
}),
})
require.Condition(t, closedWithin(updateChan, testutil.WaitShort))
err := server.Shutdown(context.Background())
err := server.Shutdown(context.Background(), true)
require.NoError(t, err)
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
require.NoError(t, server.Close())
@ -762,7 +762,7 @@ func TestProvisionerd(t *testing.T) {
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, true))
require.NoError(t, server.Close())
})
@ -853,7 +853,7 @@ func TestProvisionerd(t *testing.T) {
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, true))
require.NoError(t, server.Close())
})
@ -944,7 +944,7 @@ func TestProvisionerd(t *testing.T) {
t.Log("completeChan closed")
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, true))
require.NoError(t, server.Close())
})
@ -1039,7 +1039,7 @@ func TestProvisionerd(t *testing.T) {
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, true))
require.NoError(t, server.Close())
assert.Equal(t, ops[len(ops)-1], "CompleteJob")
assert.Contains(t, ops[0:len(ops)-1], "Log: Cleaning Up | ")
@ -1076,7 +1076,7 @@ func createProvisionerd(t *testing.T, dialer provisionerd.Dialer, connector prov
t.Cleanup(func() {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
_ = server.Shutdown(ctx)
_ = server.Shutdown(ctx, true)
_ = server.Close()
})
return server