mirror of https://github.com/coder/coder.git
fix: Add reaper to coder agent (#2441)
* fix: Add reaper to coder agent - The coder agent runs as PID 1 in some of our Docker workspaces. In such cases it is the responsibility of the init process to reap dead processes. Failing to do so can result in an inability to create new processes by running out of PIDs. This PR adds a reaper to our agent that is only spawned if it detects that it is PID1.
This commit is contained in:
parent
6c1208e3db
commit
18973a65c1
|
@ -0,0 +1,4 @@
|
|||
// Package reaper contains logic for reaping subprocesses. It is
|
||||
// specifically used in the agent to avoid the accumulation of
|
||||
// zombie processes.
|
||||
package reaper
|
|
@ -0,0 +1,19 @@
|
|||
//go:build !linux
|
||||
|
||||
package reaper
|
||||
|
||||
import "github.com/hashicorp/go-reap"
|
||||
|
||||
// IsChild returns true if we're the forked process.
|
||||
func IsChild() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// IsInitProcess returns true if the current process's PID is 1.
|
||||
func IsInitProcess() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func ForkReap(pids reap.PidCh) error {
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
//go:build linux
|
||||
|
||||
package reaper_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-reap"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/coder/agent/reaper"
|
||||
)
|
||||
|
||||
func TestReap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Don't run the reaper test in CI. It does weird
|
||||
// things like forkexecing which may have unintended
|
||||
// consequences in CI.
|
||||
if _, ok := os.LookupEnv("CI"); ok {
|
||||
t.Skip("Detected CI, skipping reaper tests")
|
||||
}
|
||||
|
||||
// Because we're forkexecing these tests will try to run twice...
|
||||
if reaper.IsChild() {
|
||||
t.Skip("I'm a child!")
|
||||
}
|
||||
|
||||
// OK checks that's the reaper is successfully reaping
|
||||
// exited processes and passing the PIDs through the shared
|
||||
// channel.
|
||||
t.Run("OK", func(t *testing.T) {
|
||||
pids := make(reap.PidCh, 1)
|
||||
err := reaper.ForkReap(pids)
|
||||
require.NoError(t, err)
|
||||
|
||||
cmd := exec.Command("tail", "-f", "/dev/null")
|
||||
err = cmd.Start()
|
||||
require.NoError(t, err)
|
||||
|
||||
cmd2 := exec.Command("tail", "-f", "/dev/null")
|
||||
err = cmd2.Start()
|
||||
require.NoError(t, err)
|
||||
|
||||
err = cmd.Process.Kill()
|
||||
require.NoError(t, err)
|
||||
|
||||
err = cmd2.Process.Kill()
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedPIDs := []int{cmd.Process.Pid, cmd2.Process.Pid}
|
||||
|
||||
deadline := time.NewTimer(time.Second * 5)
|
||||
for i := 0; i < len(expectedPIDs); i++ {
|
||||
select {
|
||||
case <-deadline.C:
|
||||
t.Fatalf("Timed out waiting for process")
|
||||
case pid := <-pids:
|
||||
require.Contains(t, expectedPIDs, pid)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
//go:build linux
|
||||
|
||||
package reaper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
|
||||
"github.com/hashicorp/go-reap"
|
||||
"golang.org/x/xerrors"
|
||||
)
|
||||
|
||||
// agentEnvMark is a simple environment variable that we use as a marker
|
||||
// to indicated that the process is a child as opposed to the reaper.
|
||||
// Since we are forkexec'ing we need to be able to differentiate between
|
||||
// the two to avoid fork bombing ourselves.
|
||||
const agentEnvMark = "CODER_DO_NOT_REAP"
|
||||
|
||||
// IsChild returns true if we're the forked process.
|
||||
func IsChild() bool {
|
||||
return os.Getenv(agentEnvMark) != ""
|
||||
}
|
||||
|
||||
// IsInitProcess returns true if the current process's PID is 1.
|
||||
func IsInitProcess() bool {
|
||||
return os.Getpid() == 1
|
||||
}
|
||||
|
||||
// ForkReap spawns a goroutine that reaps children. In order to avoid
|
||||
// complications with spawning `exec.Commands` in the same process that
|
||||
// is reaping, we forkexec a child process. This prevents a race between
|
||||
// the reaper and an exec.Command waiting for its process to complete.
|
||||
// The provided 'pids' channel may be nil if the caller does not care about the
|
||||
// reaped children PIDs.
|
||||
func ForkReap(pids reap.PidCh) error {
|
||||
// Check if the process is the parent or the child.
|
||||
// If it's the child we want to skip attempting to reap.
|
||||
if IsChild() {
|
||||
return nil
|
||||
}
|
||||
|
||||
go reap.ReapChildren(pids, nil, nil, nil)
|
||||
|
||||
args := os.Args
|
||||
// This is simply done to help identify the real agent process
|
||||
// when viewing in something like 'ps'.
|
||||
args = append(args, "#Agent")
|
||||
|
||||
pwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get wd: %w", err)
|
||||
}
|
||||
|
||||
pattrs := &syscall.ProcAttr{
|
||||
Dir: pwd,
|
||||
// Add our marker for identifying the child process.
|
||||
Env: append(os.Environ(), fmt.Sprintf("%s=true", agentEnvMark)),
|
||||
Sys: &syscall.SysProcAttr{
|
||||
Setsid: true,
|
||||
},
|
||||
Files: []uintptr{
|
||||
uintptr(syscall.Stdin),
|
||||
uintptr(syscall.Stdout),
|
||||
uintptr(syscall.Stderr),
|
||||
},
|
||||
}
|
||||
|
||||
//#nosec G204
|
||||
pid, _ := syscall.ForkExec(args[0], args, pattrs)
|
||||
|
||||
var wstatus syscall.WaitStatus
|
||||
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
|
||||
for xerrors.Is(err, syscall.EINTR) {
|
||||
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
19
cli/agent.go
19
cli/agent.go
|
@ -8,6 +8,7 @@ import (
|
|||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"cloud.google.com/go/compute/metadata"
|
||||
|
@ -18,6 +19,7 @@ import (
|
|||
"cdr.dev/slog/sloggers/sloghuman"
|
||||
|
||||
"github.com/coder/coder/agent"
|
||||
"github.com/coder/coder/agent/reaper"
|
||||
"github.com/coder/coder/cli/cliflag"
|
||||
"github.com/coder/coder/codersdk"
|
||||
"github.com/coder/retry"
|
||||
|
@ -51,6 +53,23 @@ func workspaceAgent() *cobra.Command {
|
|||
}
|
||||
defer logWriter.Close()
|
||||
logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug)
|
||||
|
||||
isLinux := runtime.GOOS == "linux"
|
||||
|
||||
// Spawn a reaper so that we don't accumulate a ton
|
||||
// of zombie processes.
|
||||
if reaper.IsInitProcess() && !reaper.IsChild() && isLinux {
|
||||
logger.Info(cmd.Context(), "spawning reaper process")
|
||||
err := reaper.ForkReap(nil)
|
||||
if err != nil {
|
||||
logger.Error(cmd.Context(), "failed to reap", slog.Error(err))
|
||||
return xerrors.Errorf("fork reap: %w", err)
|
||||
}
|
||||
|
||||
logger.Info(cmd.Context(), "reaper process exiting")
|
||||
return nil
|
||||
}
|
||||
|
||||
client := codersdk.New(coderURL)
|
||||
|
||||
if pprofEnabled {
|
||||
|
|
4
go.mod
4
go.mod
|
@ -71,6 +71,7 @@ require (
|
|||
github.com/golang-migrate/migrate/v4 v4.15.2
|
||||
github.com/google/go-github/v43 v43.0.1-0.20220414155304-00e42332e405
|
||||
github.com/google/uuid v1.3.0
|
||||
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b
|
||||
github.com/hashicorp/go-version v1.5.0
|
||||
github.com/hashicorp/hc-install v0.3.2
|
||||
github.com/hashicorp/hcl/v2 v2.12.0
|
||||
|
@ -131,8 +132,6 @@ require (
|
|||
storj.io/drpc v0.0.30
|
||||
)
|
||||
|
||||
require github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
|
||||
|
||||
require (
|
||||
github.com/agnivade/levenshtein v1.0.1 // indirect
|
||||
github.com/elastic/go-windows v1.0.0 // indirect
|
||||
|
@ -240,6 +239,7 @@ require (
|
|||
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
|
||||
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
|
||||
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
|
||||
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
|
||||
github.com/yashtewari/glob-intersection v0.1.0 // indirect
|
||||
github.com/zclconf/go-cty v1.10.0 // indirect
|
||||
github.com/zeebo/errs v1.2.2 // indirect
|
||||
|
|
2
go.sum
2
go.sum
|
@ -896,6 +896,8 @@ github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHh
|
|||
github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA=
|
||||
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
|
||||
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
|
||||
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b h1:3GrpnZQBxcMj1gCXQLelfjCT1D5MPGTuGMKHVzSIH6A=
|
||||
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b/go.mod h1:qIFzeFcJU3OIFk/7JreWXcUjFmcCaeHTH9KoNyHYVCs=
|
||||
github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU=
|
||||
github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU=
|
||||
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=
|
||||
|
|
Loading…
Reference in New Issue