coder/agent/reconnectingpty/screen.go

package reconnectingpty
import (
"bytes"
"context"
"crypto/rand"
"encoding/hex"
"errors"
"io"
"net"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/gliderlabs/ssh"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/pty"
)
// screenReconnectingPTY provides a reconnectable PTY via `screen`.
type screenReconnectingPTY struct {
command *pty.Cmd
// id holds the id of the session for both creating and attaching. It is
// generated uniquely for each session because we do not control the screen
// daemon, so we do not have its PID, and without the PID screen does partial
// matching on the name. Enforcing a unique ID should guarantee we match the
// right session.
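// (With partial matching, for example, an id like "abc" might also match a
// session named "abc123".)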
id string
// mutex prevents concurrent attaches to the session. Screen will happily
// spawn two separate sessions with the same name if multiple attaches happen
// within a close enough interval. We cannot prevent this by controlling the
// screen daemon ourselves because a daemon we spawn starts with a hardcoded
// 24x80 size, which results in confusing padding above the prompt once the
// attach comes in and resizes.
mutex sync.Mutex
configFile string
metrics *prometheus.CounterVec
state *ptyState
// timer will close the reconnecting pty when it expires. The timer will be
// reset as long as there are active connections.
timer *time.Timer
timeout time.Duration
}
// newScreen creates a new screen-backed reconnecting PTY. It writes the
// config file and creates its directory. Ideally we would spawn the daemon
// here and attach each connection to it, but doing that spawns the daemon
// with a hardcoded 24x80 size, which is not a good user experience. Instead
// we let the attach command spawn the daemon on its own, which causes it to
// spawn with the specified size.
func newScreen(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger) *screenReconnectingPTY {
rpty := &screenReconnectingPTY{
command: cmd,
metrics: options.Metrics,
state: newState(),
timeout: options.Timeout,
}
go rpty.lifecycle(ctx, logger)
// Socket paths are limited to around 100 characters on Linux and macOS,
// which, depending on the temporary directory, can be a problem. To give
// more leeway use a short ID.
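// Four random bytes hex-encode to an eight-character id (for example
// "1a2b3c4d").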
buf := make([]byte, 4)
_, err := rand.Read(buf)
if err != nil {
rpty.state.setState(StateDone, xerrors.Errorf("generate screen id: %w", err))
return rpty
}
rpty.id = hex.EncodeToString(buf)
settings := []string{
// Tell screen not to handle motion for xterm* terminals, which allows
// scrolling the terminal via the mouse wheel or scroll bar (by default
// screen uses it to cycle through the command history). There does not
// seem to be a way to make screen itself scroll on mouse wheel. tmux can
// do it, but then there is no scroll bar and it kicks you into copy mode,
// where keys stop working until you exit copy mode, which seems like it
// could be confusing.
"termcapinfo xterm* ti@:te@",
// Enable alternate screen emulation otherwise applications get rendered in
// the current window which wipes out visible output resulting in missing
// output when scrolling back with the mouse wheel (copy mode still works
// since that is screen itself scrolling).
"altscreen on",
// Remap the control key to C-s since C-a may be used in applications. C-s
// is chosen because it cannot actually be used anyway: by default it will
// pause output, and pressing C-q to resume will just kill the browser
// window. We may not want people using the control key anyway since it will
// not be obvious they are in screen, and doing things like switching windows
// makes mouse wheel scroll wonky due to the terminal doing the scrolling
// rather than screen itself (but again copy mode will work just fine).
"escape ^Ss",
}
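// The config file lives under the system temporary directory, typically
// /tmp/coder-screen/config on Linux.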
rpty.configFile = filepath.Join(os.TempDir(), "coder-screen", "config")
err = os.MkdirAll(filepath.Dir(rpty.configFile), 0o700)
if err != nil {
rpty.state.setState(StateDone, xerrors.Errorf("make screen config dir: %w", err))
return rpty
}
err = os.WriteFile(rpty.configFile, []byte(strings.Join(settings, "\n")), 0o600)
if err != nil {
rpty.state.setState(StateDone, xerrors.Errorf("create config file: %w", err))
return rpty
}
return rpty
}
// lifecycle manages the lifecycle of the reconnecting pty. If the context ends
// the reconnecting pty will be closed.
func (rpty *screenReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) {
rpty.timer = time.AfterFunc(attachTimeout, func() {
rpty.Close(xerrors.New("reconnecting pty timeout"))
})
logger.Debug(ctx, "reconnecting pty ready")
rpty.state.setState(StateReady, nil)
state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing)
if state < StateClosing {
// If we have not closed yet then the context is what unblocked us (which
// means the agent is shutting down) so move into the closing phase.
rpty.Close(reasonErr)
}
rpty.timer.Stop()
// If the quit command errors because the session is already gone, that is fine.
err := rpty.sendCommand(context.Background(), "quit", []string{"No screen session found"})
if err != nil {
logger.Error(ctx, "close screen session", slog.Error(err))
}
logger.Info(ctx, "closed reconnecting pty")
rpty.state.setState(StateDone, reasonErr)
}
func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn net.Conn, height, width uint16, logger slog.Logger) error {
logger.Info(ctx, "attach to reconnecting pty")
// This will kill the heartbeat once we hit EOF or an error.
ctx, cancel := context.WithCancel(ctx)
defer cancel()
state, err := rpty.state.waitForStateOrContext(ctx, StateReady)
if state != StateReady {
return err
}
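// The heartbeat keeps resetting the timer while this connection is active so
// the reconnecting pty does not time out underneath us.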
go heartbeat(ctx, rpty.timer, rpty.timeout)
ptty, process, err := rpty.doAttach(ctx, conn, height, width, logger)
if err != nil {
if errors.Is(err, context.Canceled) {
// Likely the process was too short-lived and canceled the version command.
// TODO: Is it worth distinguishing between that and a cancel from the
// Attach() caller? Additionally, since this could also happen if
// the command was invalid, should we check the process's exit code?
return nil
}
return err
}
defer func() {
// Log only for debugging since the process might have already exited on its
// own.
err := ptty.Close()
if err != nil {
logger.Debug(ctx, "closed ptty with error", slog.Error(err))
}
err = process.Kill()
if err != nil {
logger.Debug(ctx, "killed process with error", slog.Error(err))
}
}()
// Pipe conn -> pty and block.
readConnLoop(ctx, conn, ptty, rpty.metrics, logger)
return nil
}
// doAttach spawns the screen client and waits for the session to come up. It
// exists separately only so we can defer the mutex unlock, which is not
// possible in Attach since it blocks.
func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) {
// Ensure another attach does not come in and spawn a duplicate session.
rpty.mutex.Lock()
defer rpty.mutex.Unlock()
logger.Debug(ctx, "spawning screen client", slog.F("screen_id", rpty.id))
// Wrap the command with screen and tie it to the connection's context.
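// With an illustrative session id and command, the invocation looks roughly
// like:
//   screen -S 1a2b3c4d -xRRqc /tmp/coder-screen/config /bin/bash -l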
cmd := pty.CommandContext(ctx, "screen", append([]string{
// -S is for setting the session's name.
"-S", rpty.id,
// -x allows attaching to an already attached session.
// -RR reattaches to the daemon or creates the session daemon if missing.
// -q disables the "New screen..." message that appears for five seconds
// when creating a new session with -RR.
// -c is the flag for the config file.
"-xRRqc", rpty.configFile,
rpty.command.Path,
// pty.Cmd duplicates Path as the first argument so remove it.
}, rpty.command.Args[1:]...)...)
cmd.Env = append(rpty.command.Env, "TERM=xterm-256color")
cmd.Dir = rpty.command.Dir
ptty, process, err := pty.Start(cmd, pty.WithPTYOption(
pty.WithSSHRequest(ssh.Pty{
Window: ssh.Window{
// Make sure to spawn at the right size because if we resize afterward it
// leaves confusing padding (screen will resize such that the screen
// contents are aligned to the bottom).
Height: int(height),
Width: int(width),
},
}),
))
if err != nil {
rpty.metrics.WithLabelValues("screen_spawn").Add(1)
return nil, nil, err
}
// This context lets us abort the version command if the process dies.
versionCtx, versionCancel := context.WithCancel(ctx)
defer versionCancel()
// Pipe pty -> conn and close the connection when the process exits.
// We do not need to separately monitor for the process exiting. When it
// exits, our ptty.OutputReader() will return EOF after reading all process
// output.
go func() {
defer versionCancel()
defer func() {
err := conn.Close()
if err != nil {
// Log only for debugging since the connection might have already closed
// on its own.
logger.Debug(ctx, "closed connection with error", slog.Error(err))
}
}()
buffer := make([]byte, 1024)
for {
read, err := ptty.OutputReader().Read(buffer)
if err != nil {
// This is triggered when the PTY is closed. The error is typically a benign
// EOF, so only log it for debugging.
if errors.Is(err, io.EOF) {
logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err))
} else {
logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err))
rpty.metrics.WithLabelValues("screen_output_reader").Add(1)
}
// The process might have died because the session itself died, or it might
// have been separately killed while the session is still up (for example via
// `exit`, or because we killed it when the connection closed). If the
// session is still up we might keep the reconnecting pty in memory longer
// than necessary, but it will eventually be cleaned up by the timer or the
// context, or the next attach will respawn the screen daemon, which is fine
// too.
break
}
part := buffer[:read]
_, err = conn.Write(part)
if err != nil {
// Connection might have been closed.
if errors.Unwrap(err).Error() != "endpoint is closed for send" {
logger.Warn(ctx, "error writing to active conn", slog.Error(err))
rpty.metrics.WithLabelValues("screen_write").Add(1)
}
break
}
}
}()
// Version seems to be the only command without a side effect (other than
// making the version pop up briefly) so use it to wait for the session to
// come up. If we do not wait we could end up spawning multiple sessions with
// the same name.
err = rpty.sendCommand(versionCtx, "version", nil)
if err != nil {
// Log only for debugging since the process might already have closed.
closeErr := ptty.Close()
if closeErr != nil {
logger.Debug(ctx, "closed ptty with error", slog.Error(closeErr))
}
closeErr = process.Kill()
if closeErr != nil {
logger.Debug(ctx, "killed process with error", slog.Error(closeErr))
}
rpty.metrics.WithLabelValues("screen_wait").Add(1)
return nil, nil, err
}
return ptty, process, nil
}
// sendCommand runs a screen command against a running screen session. If the
// command fails with an error matching anything in successErrors it will be
// considered a success state (for example "no session" when quitting and the
// session is already dead). The command will be retried until it succeeds,
// the timeout is reached, or the context ends. A canceled context returns the
// context's error as-is, while a timed-out context returns its error together
// with the last error from the command.
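// With an illustrative session id, quitting the session looks roughly like:
//   screen -x 1a2b3c4d -c /tmp/coder-screen/config -X quit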
func (rpty *screenReconnectingPTY) sendCommand(ctx context.Context, command string, successErrors []string) error {
ctx, cancel := context.WithTimeout(ctx, attachTimeout)
defer cancel()
var lastErr error
run := func() bool {
var stdout bytes.Buffer
//nolint:gosec
cmd := exec.CommandContext(ctx, "screen",
// -x targets an attached session.
"-x", rpty.id,
// -c is the flag for the config file.
"-c", rpty.configFile,
// -X runs a command in the matching session.
"-X", command,
)
cmd.Env = append(rpty.command.Env, "TERM=xterm-256color")
cmd.Dir = rpty.command.Dir
cmd.Stdout = &stdout
err := cmd.Run()
if err == nil {
return true
}
stdoutStr := stdout.String()
for _, se := range successErrors {
if strings.Contains(stdoutStr, se) {
return true
}
}
// Things like "exit status 1" are imprecise so include stdout as it may
// contain more information ("no screen session found" for example).
if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
lastErr = xerrors.Errorf("`screen -x %s -X %s`: %w: %s", rpty.id, command, err, stdoutStr)
}
return false
}
// Run immediately.
if done := run(); done {
return nil
}
// Then run on an interval.
ticker := time.NewTicker(250 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
if errors.Is(ctx.Err(), context.Canceled) {
return ctx.Err()
}
return errors.Join(ctx.Err(), lastErr)
case <-ticker.C:
if done := run(); done {
return nil
}
}
}
}
func (rpty *screenReconnectingPTY) Wait() {
_, _ = rpty.state.waitForState(StateClosing)
}
func (rpty *screenReconnectingPTY) Close(err error) {
// The closing state change will be handled by the lifecycle.
rpty.state.setState(StateClosing, err)
}