fix: Add reaper to coder agent (#2441)

* fix: Add reaper to coder agent

- The coder agent runs as PID 1 in some of our Docker workspaces.
  In such cases it is the responsibility of the init process to
  reap dead processes. Failing to do so can result in an inability
  to create new processes by running out of PIDs.

  This PR adds a reaper to our agent that is only spawned if it
  detects that it is PID1.
This commit is contained in:
Jon Ayers 2022-06-17 11:51:46 -05:00 committed by GitHub
parent 6c1208e3db
commit 18973a65c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 191 additions and 2 deletions

4
agent/reaper/doc.go Normal file
View File

@ -0,0 +1,4 @@
// Package reaper contains logic for reaping subprocesses. It is
// specifically used in the agent to avoid the accumulation of
// zombie processes.
package reaper

View File

@ -0,0 +1,19 @@
//go:build !linux
package reaper
import "github.com/hashicorp/go-reap"
// IsChild returns true if we're the forked process.
func IsChild() bool {
return false
}
// IsInitProcess returns true if the current process's PID is 1.
func IsInitProcess() bool {
return false
}
func ForkReap(pids reap.PidCh) error {
return nil
}

View File

@ -0,0 +1,66 @@
//go:build linux
package reaper_test
import (
"os"
"os/exec"
"testing"
"time"
"github.com/hashicorp/go-reap"
"github.com/stretchr/testify/require"
"github.com/coder/coder/agent/reaper"
)
func TestReap(t *testing.T) {
t.Parallel()
// Don't run the reaper test in CI. It does weird
// things like forkexecing which may have unintended
// consequences in CI.
if _, ok := os.LookupEnv("CI"); ok {
t.Skip("Detected CI, skipping reaper tests")
}
// Because we're forkexecing these tests will try to run twice...
if reaper.IsChild() {
t.Skip("I'm a child!")
}
// OK checks that's the reaper is successfully reaping
// exited processes and passing the PIDs through the shared
// channel.
t.Run("OK", func(t *testing.T) {
pids := make(reap.PidCh, 1)
err := reaper.ForkReap(pids)
require.NoError(t, err)
cmd := exec.Command("tail", "-f", "/dev/null")
err = cmd.Start()
require.NoError(t, err)
cmd2 := exec.Command("tail", "-f", "/dev/null")
err = cmd2.Start()
require.NoError(t, err)
err = cmd.Process.Kill()
require.NoError(t, err)
err = cmd2.Process.Kill()
require.NoError(t, err)
expectedPIDs := []int{cmd.Process.Pid, cmd2.Process.Pid}
deadline := time.NewTimer(time.Second * 5)
for i := 0; i < len(expectedPIDs); i++ {
select {
case <-deadline.C:
t.Fatalf("Timed out waiting for process")
case pid := <-pids:
require.Contains(t, expectedPIDs, pid)
}
}
})
}

View File

@ -0,0 +1,79 @@
//go:build linux
package reaper
import (
"fmt"
"os"
"syscall"
"github.com/hashicorp/go-reap"
"golang.org/x/xerrors"
)
// agentEnvMark is a simple environment variable that we use as a marker
// to indicated that the process is a child as opposed to the reaper.
// Since we are forkexec'ing we need to be able to differentiate between
// the two to avoid fork bombing ourselves.
const agentEnvMark = "CODER_DO_NOT_REAP"
// IsChild returns true if we're the forked process.
func IsChild() bool {
return os.Getenv(agentEnvMark) != ""
}
// IsInitProcess returns true if the current process's PID is 1.
func IsInitProcess() bool {
return os.Getpid() == 1
}
// ForkReap spawns a goroutine that reaps children. In order to avoid
// complications with spawning `exec.Commands` in the same process that
// is reaping, we forkexec a child process. This prevents a race between
// the reaper and an exec.Command waiting for its process to complete.
// The provided 'pids' channel may be nil if the caller does not care about the
// reaped children PIDs.
func ForkReap(pids reap.PidCh) error {
// Check if the process is the parent or the child.
// If it's the child we want to skip attempting to reap.
if IsChild() {
return nil
}
go reap.ReapChildren(pids, nil, nil, nil)
args := os.Args
// This is simply done to help identify the real agent process
// when viewing in something like 'ps'.
args = append(args, "#Agent")
pwd, err := os.Getwd()
if err != nil {
return xerrors.Errorf("get wd: %w", err)
}
pattrs := &syscall.ProcAttr{
Dir: pwd,
// Add our marker for identifying the child process.
Env: append(os.Environ(), fmt.Sprintf("%s=true", agentEnvMark)),
Sys: &syscall.SysProcAttr{
Setsid: true,
},
Files: []uintptr{
uintptr(syscall.Stdin),
uintptr(syscall.Stdout),
uintptr(syscall.Stderr),
},
}
//#nosec G204
pid, _ := syscall.ForkExec(args[0], args, pattrs)
var wstatus syscall.WaitStatus
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
for xerrors.Is(err, syscall.EINTR) {
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
}
return nil
}

View File

@ -8,6 +8,7 @@ import (
"net/url"
"os"
"path/filepath"
"runtime"
"time"
"cloud.google.com/go/compute/metadata"
@ -18,6 +19,7 @@ import (
"cdr.dev/slog/sloggers/sloghuman"
"github.com/coder/coder/agent"
"github.com/coder/coder/agent/reaper"
"github.com/coder/coder/cli/cliflag"
"github.com/coder/coder/codersdk"
"github.com/coder/retry"
@ -51,6 +53,23 @@ func workspaceAgent() *cobra.Command {
}
defer logWriter.Close()
logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug)
isLinux := runtime.GOOS == "linux"
// Spawn a reaper so that we don't accumulate a ton
// of zombie processes.
if reaper.IsInitProcess() && !reaper.IsChild() && isLinux {
logger.Info(cmd.Context(), "spawning reaper process")
err := reaper.ForkReap(nil)
if err != nil {
logger.Error(cmd.Context(), "failed to reap", slog.Error(err))
return xerrors.Errorf("fork reap: %w", err)
}
logger.Info(cmd.Context(), "reaper process exiting")
return nil
}
client := codersdk.New(coderURL)
if pprofEnabled {

4
go.mod
View File

@ -71,6 +71,7 @@ require (
github.com/golang-migrate/migrate/v4 v4.15.2
github.com/google/go-github/v43 v43.0.1-0.20220414155304-00e42332e405
github.com/google/uuid v1.3.0
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b
github.com/hashicorp/go-version v1.5.0
github.com/hashicorp/hc-install v0.3.2
github.com/hashicorp/hcl/v2 v2.12.0
@ -131,8 +132,6 @@ require (
storj.io/drpc v0.0.30
)
require github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
require (
github.com/agnivade/levenshtein v1.0.1 // indirect
github.com/elastic/go-windows v1.0.0 // indirect
@ -240,6 +239,7 @@ require (
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
github.com/yashtewari/glob-intersection v0.1.0 // indirect
github.com/zclconf/go-cty v1.10.0 // indirect
github.com/zeebo/errs v1.2.2 // indirect

2
go.sum
View File

@ -896,6 +896,8 @@ github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHh
github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b h1:3GrpnZQBxcMj1gCXQLelfjCT1D5MPGTuGMKHVzSIH6A=
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b/go.mod h1:qIFzeFcJU3OIFk/7JreWXcUjFmcCaeHTH9KoNyHYVCs=
github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU=
github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU=
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=