2022-02-19 05:13:32 +00:00
|
|
|
package agent
|
|
|
|
|
|
|
|
import (
|
2023-03-31 20:26:19 +00:00
|
|
|
"bytes"
|
2022-02-19 05:13:32 +00:00
|
|
|
"context"
|
2022-09-01 01:09:44 +00:00
|
|
|
"encoding/binary"
|
2022-04-29 22:30:10 +00:00
|
|
|
"encoding/json"
|
2022-02-19 05:13:32 +00:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"net"
|
2022-10-06 12:38:22 +00:00
|
|
|
"net/http"
|
2022-09-01 01:09:44 +00:00
|
|
|
"net/netip"
|
2022-03-22 19:17:50 +00:00
|
|
|
"os"
|
2023-06-20 11:41:55 +00:00
|
|
|
"os/exec"
|
2022-02-19 05:13:32 +00:00
|
|
|
"os/user"
|
2022-05-02 16:36:51 +00:00
|
|
|
"path/filepath"
|
2023-03-02 14:06:00 +00:00
|
|
|
"sort"
|
2022-04-29 22:30:10 +00:00
|
|
|
"strconv"
|
2022-04-25 19:41:52 +00:00
|
|
|
"strings"
|
2022-02-19 05:13:32 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2023-08-08 17:56:08 +00:00
|
|
|
"github.com/go-chi/chi/v5"
|
2022-04-29 22:30:10 +00:00
|
|
|
"github.com/google/uuid"
|
2023-05-25 10:52:36 +00:00
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
2022-10-25 00:46:24 +00:00
|
|
|
"github.com/spf13/afero"
|
2022-04-25 18:30:39 +00:00
|
|
|
"go.uber.org/atomic"
|
2022-12-13 19:28:07 +00:00
|
|
|
"golang.org/x/exp/slices"
|
2023-07-26 16:21:04 +00:00
|
|
|
"golang.org/x/sync/errgroup"
|
2022-05-24 21:03:42 +00:00
|
|
|
"golang.org/x/xerrors"
|
2022-09-05 22:15:49 +00:00
|
|
|
"tailscale.com/net/speedtest"
|
2022-09-01 01:09:44 +00:00
|
|
|
"tailscale.com/tailcfg"
|
2022-11-18 22:46:53 +00:00
|
|
|
"tailscale.com/types/netlogtype"
|
2022-04-25 18:30:39 +00:00
|
|
|
|
2022-02-19 05:13:32 +00:00
|
|
|
"cdr.dev/slog"
|
2023-08-18 18:55:43 +00:00
|
|
|
"github.com/coder/coder/v2/agent/agentssh"
|
|
|
|
"github.com/coder/coder/v2/agent/reconnectingpty"
|
|
|
|
"github.com/coder/coder/v2/buildinfo"
|
2023-09-01 15:41:22 +00:00
|
|
|
"github.com/coder/coder/v2/cli/gitauth"
|
2023-09-01 16:50:12 +00:00
|
|
|
"github.com/coder/coder/v2/coderd/database/dbtime"
|
2023-08-18 18:55:43 +00:00
|
|
|
"github.com/coder/coder/v2/codersdk"
|
|
|
|
"github.com/coder/coder/v2/codersdk/agentsdk"
|
|
|
|
"github.com/coder/coder/v2/tailnet"
|
2022-02-19 05:13:32 +00:00
|
|
|
"github.com/coder/retry"
|
2022-05-24 21:03:42 +00:00
|
|
|
)
|
2022-02-19 05:13:32 +00:00
|
|
|
|
2022-05-24 21:03:42 +00:00
|
|
|
const (
	// ProtocolReconnectingPTY identifies connections carrying a
	// reconnecting pseudo-terminal session.
	ProtocolReconnectingPTY = "reconnecting-pty"
	// ProtocolSSH identifies SSH connections.
	ProtocolSSH = "ssh"
	// ProtocolDial identifies raw dial (port-forwarding) connections.
	ProtocolDial = "dial"
)
|
|
|
|
|
2022-04-29 22:30:10 +00:00
|
|
|
// Options configures a workspace agent created via New. Zero values are
// replaced with defaults by New where noted below.
type Options struct {
	// Filesystem is the filesystem abstraction the agent operates on.
	// Defaults to the real OS filesystem (afero.NewOsFs).
	Filesystem afero.Fs
	// LogDir is the directory used for agent logs. Defaults to TempDir.
	LogDir string
	// TempDir is the scratch directory. Defaults to os.TempDir().
	TempDir string
	// ExchangeToken refreshes the session token on (re)connect. Defaults
	// to a no-op that returns an empty token.
	ExchangeToken func(ctx context.Context) (string, error)
	// Client is the coderd API client the agent reports to.
	Client Client
	// ReconnectingPTYTimeout bounds idle reconnecting PTY sessions.
	ReconnectingPTYTimeout time.Duration
	// EnvironmentVariables are injected into sessions spawned by the agent.
	EnvironmentVariables map[string]string
	Logger               slog.Logger
	// IgnorePorts maps port numbers to a reason for hiding them from the
	// listening-ports listing (typically ports used by the agent itself).
	IgnorePorts map[int]string
	// SSHMaxTimeout is passed through to the embedded SSH server.
	SSHMaxTimeout time.Duration
	// TailnetListenPort is the fixed port for the tailnet listener
	// (0 lets the implementation choose).
	TailnetListenPort uint16
	// Subsystems are reported to coderd at startup.
	Subsystems []codersdk.AgentSubsystem
	// Addresses are the wireguard addresses to use; when empty the agent
	// derives them (see wireguardAddresses — not visible here, confirm).
	Addresses []netip.Prefix
	// PrometheusRegistry receives agent metrics. Defaults to a fresh
	// registry when nil.
	PrometheusRegistry *prometheus.Registry
	// ReportMetadataInterval is the base tick for metadata collection.
	// Defaults to one second.
	ReportMetadataInterval time.Duration
	// ServiceBannerRefreshInterval controls how often the service banner
	// is re-fetched. Defaults to two minutes.
	ServiceBannerRefreshInterval time.Duration
}
|
|
|
|
|
2022-10-24 03:35:08 +00:00
|
|
|
// Client is the subset of the coderd API the agent consumes.
type Client interface {
	// Manifest fetches the agent manifest (apps, scripts, DERP map, ...).
	Manifest(ctx context.Context) (agentsdk.Manifest, error)
	// Listen opens the long-lived coordination connection to coderd.
	Listen(ctx context.Context) (net.Conn, error)
	// DERPMapUpdates streams DERP map changes; the returned closer stops
	// the stream.
	DERPMapUpdates(ctx context.Context) (<-chan agentsdk.DERPMapUpdate, io.Closer, error)
	// ReportStats consumes connection stats from statsChan; setInterval
	// lets the server adjust the reporting cadence.
	ReportStats(ctx context.Context, log slog.Logger, statsChan <-chan *agentsdk.Stats, setInterval func(time.Duration)) (io.Closer, error)
	// PostLifecycle reports a lifecycle state change.
	PostLifecycle(ctx context.Context, state agentsdk.PostLifecycleRequest) error
	// PostAppHealth reports workspace app health check results.
	PostAppHealth(ctx context.Context, req agentsdk.PostAppHealthsRequest) error
	// PostStartup reports agent version, expanded directory and subsystems.
	PostStartup(ctx context.Context, req agentsdk.PostStartupRequest) error
	// PostMetadata reports one collected metadata result for key.
	PostMetadata(ctx context.Context, key string, req agentsdk.PostMetadataRequest) error
	// PatchLogs appends startup log output.
	PatchLogs(ctx context.Context, req agentsdk.PatchLogs) error
	// GetServiceBanner fetches the current service banner configuration.
	GetServiceBanner(ctx context.Context) (codersdk.ServiceBannerConfig, error)
}
|
2022-09-01 01:09:44 +00:00
|
|
|
|
2023-04-26 18:01:49 +00:00
|
|
|
// Agent is the public surface of a running workspace agent.
type Agent interface {
	// HTTPDebug exposes debug endpoints for the agent.
	HTTPDebug() http.Handler
	// TailnetConn may be nil.
	TailnetConn() *tailnet.Conn
	io.Closer
}
|
|
|
|
|
|
|
|
// New creates and starts a workspace agent. Missing options are filled
// with defaults (see Options), the internal state is initialized, and the
// agent's run loop is started in the background via init.
func New(options Options) Agent {
	if options.Filesystem == nil {
		options.Filesystem = afero.NewOsFs()
	}
	if options.TempDir == "" {
		options.TempDir = os.TempDir()
	}
	// NOTE: LogDir defaults to TempDir, so this must run after the
	// TempDir default above.
	if options.LogDir == "" {
		if options.TempDir != os.TempDir() {
			options.Logger.Debug(context.Background(), "log dir not set, using temp dir", slog.F("temp_dir", options.TempDir))
		}
		options.LogDir = options.TempDir
	}
	if options.ExchangeToken == nil {
		// Default: no token exchange; the agent keeps whatever token it has.
		options.ExchangeToken = func(ctx context.Context) (string, error) {
			return "", nil
		}
	}
	if options.ReportMetadataInterval == 0 {
		options.ReportMetadataInterval = time.Second
	}
	if options.ServiceBannerRefreshInterval == 0 {
		options.ServiceBannerRefreshInterval = 2 * time.Minute
	}

	prometheusRegistry := options.PrometheusRegistry
	if prometheusRegistry == nil {
		prometheusRegistry = prometheus.NewRegistry()
	}

	// cancelFunc stops everything started by init/runLoop when the agent
	// is closed.
	ctx, cancelFunc := context.WithCancel(context.Background())
	a := &agent{
		tailnetListenPort:      options.TailnetListenPort,
		reconnectingPTYTimeout: options.ReconnectingPTYTimeout,
		logger:                 options.Logger,
		closeCancel:            cancelFunc,
		closed:                 make(chan struct{}),
		envVars:                options.EnvironmentVariables,
		client:                 options.Client,
		exchangeToken:          options.ExchangeToken,
		filesystem:             options.Filesystem,
		logDir:                 options.LogDir,
		tempDir:                options.TempDir,
		lifecycleUpdate:        make(chan struct{}, 1),
		lifecycleReported:      make(chan codersdk.WorkspaceAgentLifecycle, 1),
		// Seed with the "created" state; reportLifecycleLoop starts past
		// it so it is never reported.
		lifecycleStates:              []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
		ignorePorts:                  options.IgnorePorts,
		connStatsChan:                make(chan *agentsdk.Stats, 1),
		reportMetadataInterval:       options.ReportMetadataInterval,
		serviceBannerRefreshInterval: options.ServiceBannerRefreshInterval,
		sshMaxTimeout:                options.SSHMaxTimeout,
		subsystems:                   options.Subsystems,
		addresses:                    options.Addresses,

		prometheusRegistry: prometheusRegistry,
		metrics:            newAgentMetrics(prometheusRegistry),
	}
	a.init(ctx)
	return a
}
|
|
|
|
|
2022-03-25 19:48:08 +00:00
|
|
|
// agent is the concrete Agent implementation. It is constructed by New
// and lives until Close.
type agent struct {
	logger            slog.Logger
	client            Client
	exchangeToken     func(ctx context.Context) (string, error)
	tailnetListenPort uint16
	filesystem        afero.Fs
	logDir            string
	tempDir           string
	// ignorePorts tells the api handler which ports to ignore when
	// listing all listening ports. This is helpful to hide ports that
	// are used by the agent, that the user does not care about.
	ignorePorts map[int]string
	subsystems  []codersdk.AgentSubsystem

	// reconnectingPTYs tracks active reconnecting PTY sessions.
	reconnectingPTYs       sync.Map
	reconnectingPTYTimeout time.Duration

	// connCloseWait waits for tracked connection goroutines on close.
	connCloseWait sync.WaitGroup
	closeCancel   context.CancelFunc
	// closeMutex guards close state and the network field during
	// initialization (see run).
	closeMutex sync.Mutex
	closed     chan struct{}

	// envVars are passed to the SSH server for spawned sessions.
	envVars map[string]string

	manifest                     atomic.Pointer[agentsdk.Manifest] // manifest is atomic because values can change after reconnection.
	reportMetadataInterval       time.Duration
	serviceBanner                atomic.Pointer[codersdk.ServiceBannerConfig] // serviceBanner is atomic because it is periodically updated.
	serviceBannerRefreshInterval time.Duration
	sessionToken                 atomic.Pointer[string]
	sshServer                    *agentssh.Server
	sshMaxTimeout                time.Duration

	// lifecycleUpdate signals reportLifecycleLoop that new states exist.
	lifecycleUpdate chan struct{}
	// lifecycleReported publishes each successfully reported state.
	lifecycleReported chan codersdk.WorkspaceAgentLifecycle
	lifecycleMu       sync.RWMutex // Protects following.
	lifecycleStates   []agentsdk.PostLifecycleRequest

	network       *tailnet.Conn
	addresses     []netip.Prefix
	connStatsChan chan *agentsdk.Stats
	latestStat    atomic.Pointer[agentsdk.Stats]

	connCountReconnectingPTY atomic.Int64

	prometheusRegistry *prometheus.Registry
	metrics            *agentMetrics
}
|
|
|
|
|
2023-08-01 15:50:43 +00:00
|
|
|
func (a *agent) TailnetConn() *tailnet.Conn {
|
|
|
|
return a.network
|
|
|
|
}
|
|
|
|
|
2023-04-06 16:39:22 +00:00
|
|
|
// init wires up the embedded SSH server and launches the agent's main
// run loop in the background. It panics if the SSH server cannot be
// constructed, which can only happen during agent startup.
func (a *agent) init(ctx context.Context) {
	sshSrv, err := agentssh.NewServer(ctx, a.logger.Named("ssh-server"), a.prometheusRegistry, a.filesystem, a.sshMaxTimeout, "")
	if err != nil {
		panic(err)
	}
	sshSrv.Env = a.envVars
	// The token is read lazily so it reflects later exchanges (see run).
	sshSrv.AgentToken = func() string { return *a.sessionToken.Load() }
	sshSrv.Manifest = &a.manifest
	sshSrv.ServiceBanner = &a.serviceBanner
	a.sshServer = sshSrv

	go a.runLoop(ctx)
}
|
|
|
|
|
2022-10-24 03:35:08 +00:00
|
|
|
// runLoop attempts to start the agent in a retry loop.
|
|
|
|
// Coder may be offline temporarily, a connection issue
|
|
|
|
// may be happening, but regardless after the intermittent
|
|
|
|
// failure, you'll want the agent to reconnect.
|
|
|
|
func (a *agent) runLoop(ctx context.Context) {
|
2023-01-24 12:24:27 +00:00
|
|
|
go a.reportLifecycleLoop(ctx)
|
2023-03-31 20:26:19 +00:00
|
|
|
go a.reportMetadataLoop(ctx)
|
2023-06-30 18:41:29 +00:00
|
|
|
go a.fetchServiceBannerLoop(ctx)
|
2023-01-24 12:24:27 +00:00
|
|
|
|
2022-10-24 03:35:08 +00:00
|
|
|
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
|
2023-02-27 15:20:24 +00:00
|
|
|
a.logger.Info(ctx, "connecting to coderd")
|
2022-10-24 03:35:08 +00:00
|
|
|
err := a.run(ctx)
|
|
|
|
// Cancel after the run is complete to clean up any leaked resources!
|
|
|
|
if err == nil {
|
2022-03-22 19:17:50 +00:00
|
|
|
continue
|
|
|
|
}
|
2023-08-02 07:25:07 +00:00
|
|
|
if ctx.Err() != nil {
|
|
|
|
// Context canceled errors may come from websocket pings, so we
|
|
|
|
// don't want to use `errors.Is(err, context.Canceled)` here.
|
2022-09-01 01:09:44 +00:00
|
|
|
return
|
|
|
|
}
|
2022-10-24 03:35:08 +00:00
|
|
|
if a.isClosed() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if errors.Is(err, io.EOF) {
|
2023-02-27 15:20:24 +00:00
|
|
|
a.logger.Info(ctx, "disconnected from coderd")
|
2022-10-24 03:35:08 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
a.logger.Warn(ctx, "run exited with error", slog.Error(err))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-20 18:49:44 +00:00
|
|
|
func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentMetadataDescription, now time.Time) *codersdk.WorkspaceAgentMetadataResult {
|
2023-03-31 20:26:19 +00:00
|
|
|
var out bytes.Buffer
|
|
|
|
result := &codersdk.WorkspaceAgentMetadataResult{
|
|
|
|
// CollectedAt is set here for testing purposes and overrode by
|
2023-05-02 10:41:41 +00:00
|
|
|
// coderd to the time of server receipt to solve clock skew.
|
2023-03-31 20:26:19 +00:00
|
|
|
//
|
|
|
|
// In the future, the server may accept the timestamp from the agent
|
2023-05-02 10:41:41 +00:00
|
|
|
// if it can guarantee the clocks are synchronized.
|
2023-07-20 18:49:44 +00:00
|
|
|
CollectedAt: now,
|
2023-03-31 20:26:19 +00:00
|
|
|
}
|
2023-05-03 07:43:05 +00:00
|
|
|
cmdPty, err := a.sshServer.CreateCommand(ctx, md.Script, nil)
|
2023-03-31 20:26:19 +00:00
|
|
|
if err != nil {
|
2023-05-02 10:41:41 +00:00
|
|
|
result.Error = fmt.Sprintf("create cmd: %+v", err)
|
2023-03-31 20:26:19 +00:00
|
|
|
return result
|
|
|
|
}
|
2023-05-03 07:43:05 +00:00
|
|
|
cmd := cmdPty.AsExec()
|
2023-03-31 20:26:19 +00:00
|
|
|
|
|
|
|
cmd.Stdout = &out
|
|
|
|
cmd.Stderr = &out
|
2023-05-02 10:41:41 +00:00
|
|
|
cmd.Stdin = io.LimitReader(nil, 0)
|
2023-03-31 20:26:19 +00:00
|
|
|
|
2023-05-02 10:41:41 +00:00
|
|
|
// We split up Start and Wait instead of calling Run so that we can return a more precise error.
|
|
|
|
err = cmd.Start()
|
|
|
|
if err != nil {
|
|
|
|
result.Error = fmt.Sprintf("start cmd: %+v", err)
|
|
|
|
return result
|
|
|
|
}
|
2023-03-31 20:26:19 +00:00
|
|
|
|
2023-05-02 10:41:41 +00:00
|
|
|
// This error isn't mutually exclusive with useful output.
|
|
|
|
err = cmd.Wait()
|
2023-03-31 20:26:19 +00:00
|
|
|
const bufLimit = 10 << 10
|
|
|
|
if out.Len() > bufLimit {
|
|
|
|
err = errors.Join(
|
|
|
|
err,
|
|
|
|
xerrors.Errorf("output truncated from %v to %v bytes", out.Len(), bufLimit),
|
|
|
|
)
|
|
|
|
out.Truncate(bufLimit)
|
|
|
|
}
|
|
|
|
|
2023-05-02 10:41:41 +00:00
|
|
|
// Important: if the command times out, we may see a misleading error like
|
|
|
|
// "exit status 1", so it's important to include the context error.
|
|
|
|
err = errors.Join(err, ctx.Err())
|
|
|
|
|
2023-03-31 20:26:19 +00:00
|
|
|
if err != nil {
|
2023-05-02 10:41:41 +00:00
|
|
|
result.Error = fmt.Sprintf("run cmd: %+v", err)
|
2023-03-31 20:26:19 +00:00
|
|
|
}
|
|
|
|
result.Value = out.String()
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
// metadataResultAndKey pairs a collected metadata result with the key of
// the metadata description it belongs to, so results from concurrent
// collection goroutines can be funneled through a single channel.
type metadataResultAndKey struct {
	result *codersdk.WorkspaceAgentMetadataResult
	key    string
}
|
|
|
|
|
2023-04-01 21:34:42 +00:00
|
|
|
// trySingleflight is a non-blocking variant of singleflight: Do runs fn
// only when no other call for the same key is currently in flight;
// otherwise it returns immediately instead of waiting.
type trySingleflight struct {
	mu sync.Mutex
	m  map[string]struct{}
}

// Do invokes fn unless another call with the same key is already running,
// in which case it is a no-op.
func (t *trySingleflight) Do(key string, fn func()) {
	t.mu.Lock()
	if _, inFlight := t.m[key]; inFlight {
		// Another goroutine owns this key; skip without blocking.
		t.mu.Unlock()
		return
	}
	t.m[key] = struct{}{}
	t.mu.Unlock()

	// Always release the key, even if fn panics.
	defer func() {
		t.mu.Lock()
		delete(t.m, key)
		t.mu.Unlock()
	}()

	fn()
}
|
|
|
|
|
2023-03-31 20:26:19 +00:00
|
|
|
// reportMetadataLoop periodically collects workspace agent metadata by
// running each manifest metadata script in its own goroutine and posting
// the results to coderd. Collection for a key is skipped while a previous
// run for that key is still in flight, and results are serialized through
// a single channel to avoid overloading the API.
func (a *agent) reportMetadataLoop(ctx context.Context) {
	// Hard cap on the number of metadata entries; also sizes the results
	// channel so every goroutine can deliver without blocking.
	const metadataLimit = 128

	var (
		baseTicker        = time.NewTicker(a.reportMetadataInterval)
		lastCollectedAtMu sync.RWMutex
		lastCollectedAts  = make(map[string]time.Time)
		metadataResults   = make(chan metadataResultAndKey, metadataLimit)
		logger            = a.logger.Named("metadata")
	)
	defer baseTicker.Stop()

	// We use a custom singleflight that immediately returns if there is already
	// a goroutine running for a given key. This is to prevent a build-up of
	// goroutines waiting on Do when the script takes many multiples of
	// baseInterval to run.
	flight := trySingleflight{m: map[string]struct{}{}}

	postMetadata := func(mr metadataResultAndKey) {
		err := a.client.PostMetadata(ctx, mr.key, *mr.result)
		if err != nil {
			a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err))
		}
	}

	for {
		select {
		case <-ctx.Done():
			return
		case mr := <-metadataResults:
			// Drain and post any pending result before considering a new
			// collection round.
			postMetadata(mr)
			continue
		case <-baseTicker.C:
		}

		if len(metadataResults) > 0 {
			// The inner collection loop expects the channel is empty before spinning up
			// all the collection goroutines.
			logger.Debug(ctx, "metadata collection backpressured",
				slog.F("queue_len", len(metadataResults)),
			)
			continue
		}

		manifest := a.manifest.Load()
		if manifest == nil {
			// Manifest not fetched yet; nothing to collect.
			continue
		}

		if len(manifest.Metadata) > metadataLimit {
			logger.Error(
				ctx, "metadata limit exceeded",
				slog.F("limit", metadataLimit), slog.F("got", len(manifest.Metadata)),
			)
			continue
		}

		// If the manifest changes (e.g. on agent reconnect) we need to
		// purge old cache values to prevent lastCollectedAt from growing
		// boundlessly.
		lastCollectedAtMu.Lock()
		for key := range lastCollectedAts {
			if slices.IndexFunc(manifest.Metadata, func(md codersdk.WorkspaceAgentMetadataDescription) bool {
				return md.Key == key
			}) < 0 {
				logger.Debug(ctx, "deleting lastCollected key, missing from manifest",
					slog.F("key", key),
				)
				delete(lastCollectedAts, key)
			}
		}
		lastCollectedAtMu.Unlock()

		// Spawn a goroutine for each metadata collection, and use a
		// channel to synchronize the results and avoid both messy
		// mutex logic and overloading the API.
		for _, md := range manifest.Metadata {
			md := md
			// We send the result to the channel in the goroutine to avoid
			// sending the same result multiple times. So, we don't care about
			// the return values.
			go flight.Do(md.Key, func() {
				ctx := slog.With(ctx, slog.F("key", md.Key))
				lastCollectedAtMu.RLock()
				collectedAt, ok := lastCollectedAts[md.Key]
				lastCollectedAtMu.RUnlock()
				if ok {
					// If the interval is zero, we assume the user just wants
					// a single collection at startup, not a spinning loop.
					if md.Interval == 0 {
						return
					}
					intervalUnit := time.Second
					// reportMetadataInterval is only less than a second in tests,
					// so adjust the interval unit for them.
					if a.reportMetadataInterval < time.Second {
						intervalUnit = 100 * time.Millisecond
					}
					// The last collected value isn't quite stale yet, so we skip it.
					if collectedAt.Add(time.Duration(md.Interval) * intervalUnit).After(time.Now()) {
						return
					}
				}

				// Pick a collection timeout: explicit timeout wins, then the
				// entry's own interval, then a multiple of the report
				// interval, then a hard-coded fallback.
				timeout := md.Timeout
				if timeout == 0 {
					if md.Interval != 0 {
						timeout = md.Interval
					} else if interval := int64(a.reportMetadataInterval.Seconds()); interval != 0 {
						// Fallback to the report interval
						timeout = interval * 3
					} else {
						// If the interval is still 0 (possible if the interval
						// is less than a second), default to 5. This was
						// randomly picked.
						timeout = 5
					}
				}
				ctxTimeout := time.Duration(timeout) * time.Second
				ctx, cancel := context.WithTimeout(ctx, ctxTimeout)
				defer cancel()

				now := time.Now()
				select {
				case <-ctx.Done():
					logger.Warn(ctx, "metadata collection timed out", slog.F("timeout", ctxTimeout))
				case metadataResults <- metadataResultAndKey{
					key:    md.Key,
					result: a.collectMetadata(ctx, md, now),
				}:
					// Record the collection time only after a successful send.
					lastCollectedAtMu.Lock()
					lastCollectedAts[md.Key] = now
					lastCollectedAtMu.Unlock()
				}
			})
		}
	}
}
|
|
|
|
|
2023-06-20 11:41:55 +00:00
|
|
|
// reportLifecycleLoop reports lifecycle state changes to coderd. All state
// changes are reported in order; on transient failure each report is
// retried with backoff, and the loop exits only when the context ends.
func (a *agent) reportLifecycleLoop(ctx context.Context) {
	lastReportedIndex := 0 // Start off with the created state without reporting it.
	for {
		// Wait until setLifecycle signals that new states were appended.
		select {
		case <-a.lifecycleUpdate:
		case <-ctx.Done():
			return
		}

		for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
			a.lifecycleMu.RLock()
			lastIndex := len(a.lifecycleStates) - 1
			// Report the next unreported state, if any; otherwise re-read
			// the last reported one (detected below via index compare).
			report := a.lifecycleStates[lastReportedIndex]
			if len(a.lifecycleStates) > lastReportedIndex+1 {
				report = a.lifecycleStates[lastReportedIndex+1]
			}
			a.lifecycleMu.RUnlock()

			if lastIndex == lastReportedIndex {
				// Nothing new to report; wait for the next signal.
				break
			}

			a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report))

			err := a.client.PostLifecycle(ctx, report)
			if err == nil {
				lastReportedIndex++
				// Publish the reported state, replacing any stale value so
				// this never blocks (channel has capacity 1).
				select {
				case a.lifecycleReported <- report.State:
				case <-a.lifecycleReported:
					a.lifecycleReported <- report.State
				}
				if lastReportedIndex < lastIndex {
					// Keep reporting until we've sent all messages, we can't
					// rely on the channel triggering us before the backlog is
					// consumed.
					continue
				}
				break
			}
			if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
				return
			}
			// If we fail to report the state we probably shouldn't exit, log only.
			a.logger.Error(ctx, "agent failed to report the lifecycle state", slog.Error(err))
		}
	}
}
|
|
|
|
|
2023-03-06 19:34:00 +00:00
|
|
|
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
// The state is only updated if it's a valid state transition.
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
	report := agentsdk.PostLifecycleRequest{
		State:     state,
		ChangedAt: dbtime.Now(),
	}

	a.lifecycleMu.Lock()
	lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
	// Only allow forward transitions per WorkspaceAgentLifecycleOrder;
	// same-or-earlier states are rejected with a warning.
	if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {
		a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))
		a.lifecycleMu.Unlock()
		return
	}
	a.lifecycleStates = append(a.lifecycleStates, report)
	a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))
	a.lifecycleMu.Unlock()

	// Non-blocking nudge: the loop re-reads all pending states, so a
	// single buffered signal is sufficient.
	select {
	case a.lifecycleUpdate <- struct{}{}:
	default:
	}
}
|
|
|
|
|
2023-06-30 18:41:29 +00:00
|
|
|
// fetchServiceBannerLoop fetches the service banner on an interval. It will
|
|
|
|
// not be fetched immediately; the expectation is that it is primed elsewhere
|
|
|
|
// (and must be done before the session actually starts).
|
|
|
|
func (a *agent) fetchServiceBannerLoop(ctx context.Context) {
|
2023-07-14 13:10:26 +00:00
|
|
|
ticker := time.NewTicker(a.serviceBannerRefreshInterval)
|
2023-06-30 18:41:29 +00:00
|
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
case <-ticker.C:
|
|
|
|
serviceBanner, err := a.client.GetServiceBanner(ctx)
|
|
|
|
if err != nil {
|
|
|
|
if ctx.Err() != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
a.logger.Error(ctx, "failed to update service banner", slog.Error(err))
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
a.serviceBanner.Store(&serviceBanner)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-24 03:35:08 +00:00
|
|
|
func (a *agent) run(ctx context.Context) error {
|
|
|
|
// This allows the agent to refresh it's token if necessary.
|
|
|
|
// For instance identity this is required, since the instance
|
|
|
|
// may not have re-provisioned, but a new agent ID was created.
|
2022-11-04 16:44:36 +00:00
|
|
|
sessionToken, err := a.exchangeToken(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("exchange token: %w", err)
|
2022-10-24 03:35:08 +00:00
|
|
|
}
|
2022-11-04 16:44:36 +00:00
|
|
|
a.sessionToken.Store(&sessionToken)
|
2022-09-01 01:09:44 +00:00
|
|
|
|
2023-06-30 18:41:29 +00:00
|
|
|
serviceBanner, err := a.client.GetServiceBanner(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("fetch service banner: %w", err)
|
|
|
|
}
|
|
|
|
a.serviceBanner.Store(&serviceBanner)
|
|
|
|
|
2023-03-31 20:26:19 +00:00
|
|
|
manifest, err := a.client.Manifest(ctx)
|
2022-10-24 03:35:08 +00:00
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("fetch metadata: %w", err)
|
2022-09-23 19:51:04 +00:00
|
|
|
}
|
2023-03-31 20:26:19 +00:00
|
|
|
a.logger.Info(ctx, "fetched manifest", slog.F("manifest", manifest))
|
2023-02-07 21:35:09 +00:00
|
|
|
|
2023-07-12 22:37:31 +00:00
|
|
|
if manifest.AgentID == uuid.Nil {
|
|
|
|
return xerrors.New("nil agentID returned by manifest")
|
|
|
|
}
|
|
|
|
|
2023-02-07 21:35:09 +00:00
|
|
|
// Expand the directory and send it back to coderd so external
|
|
|
|
// applications that rely on the directory can use it.
|
|
|
|
//
|
|
|
|
// An example is VS Code Remote, which must know the directory
|
|
|
|
// before initializing a connection.
|
2023-03-31 20:26:19 +00:00
|
|
|
manifest.Directory, err = expandDirectory(manifest.Directory)
|
2023-02-07 21:35:09 +00:00
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("expand directory: %w", err)
|
|
|
|
}
|
|
|
|
err = a.client.PostStartup(ctx, agentsdk.PostStartupRequest{
|
|
|
|
Version: buildinfo.Version(),
|
2023-03-31 20:26:19 +00:00
|
|
|
ExpandedDirectory: manifest.Directory,
|
2023-08-09 05:10:28 +00:00
|
|
|
Subsystems: a.subsystems,
|
2023-02-07 21:35:09 +00:00
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("update workspace agent version: %w", err)
|
|
|
|
}
|
|
|
|
|
2023-03-31 20:26:19 +00:00
|
|
|
oldManifest := a.manifest.Swap(&manifest)
|
2022-10-24 03:35:08 +00:00
|
|
|
|
|
|
|
// The startup script should only execute on the first run!
|
2023-03-31 20:26:19 +00:00
|
|
|
if oldManifest == nil {
|
2023-01-24 12:24:27 +00:00
|
|
|
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStarting)
|
|
|
|
|
|
|
|
// Perform overrides early so that Git auth can work even if users
|
|
|
|
// connect to a workspace that is not yet ready. We don't run this
|
|
|
|
// concurrently with the startup script to avoid conflicts between
|
|
|
|
// them.
|
2023-03-31 20:26:19 +00:00
|
|
|
if manifest.GitAuthConfigs > 0 {
|
2023-01-24 12:24:27 +00:00
|
|
|
// If this fails, we should consider surfacing the error in the
|
|
|
|
// startup log and setting the lifecycle state to be "start_error"
|
|
|
|
// (after startup script completion), but for now we'll just log it.
|
|
|
|
err := gitauth.OverrideVSCodeConfigs(a.filesystem)
|
|
|
|
if err != nil {
|
|
|
|
a.logger.Warn(ctx, "failed to override vscode git auth configs", slog.Error(err))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-06 19:34:00 +00:00
|
|
|
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
|
2023-01-24 12:24:27 +00:00
|
|
|
scriptDone := make(chan error, 1)
|
2023-03-06 19:34:00 +00:00
|
|
|
err = a.trackConnGoroutine(func() {
|
2023-01-24 12:24:27 +00:00
|
|
|
defer close(scriptDone)
|
2023-03-31 20:26:19 +00:00
|
|
|
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
|
2023-02-03 18:25:11 +00:00
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("track startup script: %w", err)
|
|
|
|
}
|
2023-01-24 12:24:27 +00:00
|
|
|
go func() {
|
|
|
|
var timeout <-chan time.Time
|
|
|
|
// If timeout is zero, an older version of the coder
|
|
|
|
// provider was used. Otherwise a timeout is always > 0.
|
2023-03-31 20:26:19 +00:00
|
|
|
if manifest.StartupScriptTimeout > 0 {
|
|
|
|
t := time.NewTimer(manifest.StartupScriptTimeout)
|
2023-01-24 12:24:27 +00:00
|
|
|
defer t.Stop()
|
|
|
|
timeout = t.C
|
|
|
|
}
|
|
|
|
|
|
|
|
var err error
|
|
|
|
select {
|
|
|
|
case err = <-scriptDone:
|
|
|
|
case <-timeout:
|
2023-08-18 17:35:49 +00:00
|
|
|
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.StartupScriptTimeout))
|
2023-01-24 12:24:27 +00:00
|
|
|
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
|
|
|
|
err = <-scriptDone // The script can still complete after a timeout.
|
|
|
|
}
|
2023-06-20 11:41:55 +00:00
|
|
|
if err != nil {
|
|
|
|
if errors.Is(err, context.Canceled) {
|
|
|
|
return
|
2023-03-06 19:34:00 +00:00
|
|
|
}
|
2023-06-20 11:41:55 +00:00
|
|
|
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
|
2022-10-24 03:35:08 +00:00
|
|
|
}
|
2023-03-06 19:34:00 +00:00
|
|
|
a.setLifecycle(ctx, lifecycleState)
|
2023-01-24 12:24:27 +00:00
|
|
|
}()
|
2022-10-25 00:46:24 +00:00
|
|
|
}
|
|
|
|
|
2022-10-24 03:35:08 +00:00
|
|
|
// This automatically closes when the context ends!
|
|
|
|
appReporterCtx, appReporterCtxCancel := context.WithCancel(ctx)
|
|
|
|
defer appReporterCtxCancel()
|
|
|
|
go NewWorkspaceAppHealthReporter(
|
2023-03-31 20:26:19 +00:00
|
|
|
a.logger, manifest.Apps, a.client.PostAppHealth)(appReporterCtx)
|
2022-10-24 03:35:08 +00:00
|
|
|
|
2022-09-01 01:09:44 +00:00
|
|
|
a.closeMutex.Lock()
|
2022-10-24 03:35:08 +00:00
|
|
|
network := a.network
|
|
|
|
a.closeMutex.Unlock()
|
2022-11-14 11:48:44 +00:00
|
|
|
if network == nil {
|
2023-08-24 17:22:31 +00:00
|
|
|
network, err = a.createTailnet(ctx, manifest.AgentID, manifest.DERPMap, manifest.DERPForceWebSockets, manifest.DisableDirectConnections)
|
2022-10-24 03:35:08 +00:00
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("create tailnet: %w", err)
|
|
|
|
}
|
|
|
|
a.closeMutex.Lock()
|
2022-12-12 11:26:49 +00:00
|
|
|
// Re-check if agent was closed while initializing the network.
|
|
|
|
closed := a.isClosed()
|
|
|
|
if !closed {
|
|
|
|
a.network = network
|
|
|
|
}
|
2022-10-24 03:35:08 +00:00
|
|
|
a.closeMutex.Unlock()
|
2022-12-12 11:26:49 +00:00
|
|
|
if closed {
|
|
|
|
_ = network.Close()
|
|
|
|
return xerrors.New("agent is closed")
|
|
|
|
}
|
2022-12-14 16:45:46 +00:00
|
|
|
|
2023-03-02 14:06:00 +00:00
|
|
|
a.startReportingConnectionStats(ctx)
|
2022-10-24 03:35:08 +00:00
|
|
|
} else {
|
2023-07-12 22:37:31 +00:00
|
|
|
// Update the wireguard IPs if the agent ID changed.
|
|
|
|
err := network.SetAddresses(a.wireguardAddresses(manifest.AgentID))
|
|
|
|
if err != nil {
|
|
|
|
a.logger.Error(ctx, "update tailnet addresses", slog.Error(err))
|
|
|
|
}
|
2023-08-24 17:22:31 +00:00
|
|
|
// Update the DERP map, force WebSocket setting and allow/disallow
|
|
|
|
// direct connections.
|
2023-03-31 20:26:19 +00:00
|
|
|
network.SetDERPMap(manifest.DERPMap)
|
2023-08-24 17:22:31 +00:00
|
|
|
network.SetDERPForceWebSockets(manifest.DERPForceWebSockets)
|
2023-06-21 22:02:05 +00:00
|
|
|
network.SetBlockEndpoints(manifest.DisableDirectConnections)
|
2022-09-01 01:09:44 +00:00
|
|
|
}
|
2022-10-24 03:35:08 +00:00
|
|
|
|
2023-07-26 16:21:04 +00:00
|
|
|
eg, egCtx := errgroup.WithContext(ctx)
|
|
|
|
eg.Go(func() error {
|
|
|
|
a.logger.Debug(egCtx, "running tailnet connection coordinator")
|
|
|
|
err := a.runCoordinator(egCtx, network)
|
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("run coordinator: %w", err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
|
|
|
|
eg.Go(func() error {
|
|
|
|
a.logger.Debug(egCtx, "running derp map subscriber")
|
|
|
|
err := a.runDERPMapSubscriber(egCtx, network)
|
|
|
|
if err != nil {
|
|
|
|
return xerrors.Errorf("run derp map subscriber: %w", err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
|
|
|
|
return eg.Wait()
|
2022-10-24 03:35:08 +00:00
|
|
|
}
|
|
|
|
|
2023-07-12 22:37:31 +00:00
|
|
|
func (a *agent) wireguardAddresses(agentID uuid.UUID) []netip.Prefix {
|
|
|
|
if len(a.addresses) == 0 {
|
|
|
|
return []netip.Prefix{
|
|
|
|
// This is the IP that should be used primarily.
|
|
|
|
netip.PrefixFrom(tailnet.IPFromUUID(agentID), 128),
|
|
|
|
// We also listen on the legacy codersdk.WorkspaceAgentIP. This
|
|
|
|
// allows for a transition away from wsconncache.
|
|
|
|
netip.PrefixFrom(codersdk.WorkspaceAgentIP, 128),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return a.addresses
|
|
|
|
}
|
|
|
|
|
2022-12-02 14:24:40 +00:00
|
|
|
// trackConnGoroutine runs fn in a goroutine that is tracked by the
// agent's connCloseWait group so Close can wait for it to finish. It
// refuses to start new work once the agent is closed. The closeMutex
// serializes the closed check with Close itself, guaranteeing no
// goroutine is added after shutdown has started waiting.
func (a *agent) trackConnGoroutine(fn func()) error {
	a.closeMutex.Lock()
	defer a.closeMutex.Unlock()
	if a.isClosed() {
		return xerrors.New("track conn goroutine: agent is closed")
	}
	a.connCloseWait.Add(1)
	go func() {
		// Done is deferred so the wait group is released even if fn panics.
		defer a.connCloseWait.Done()
		fn()
	}()
	return nil
}
|
|
|
|
|
2023-08-24 17:22:31 +00:00
|
|
|
// createTailnet constructs the agent's tailnet connection and starts the
// services exposed over it: the SSH server, the reconnecting PTY
// listener, the speedtest listener, and the HTTP API server. Each setup
// step registers a deferred cleanup keyed on the named return err, so a
// failure in a later step tears down everything created before it.
func (a *agent) createTailnet(ctx context.Context, agentID uuid.UUID, derpMap *tailcfg.DERPMap, derpForceWebSockets, disableDirectConnections bool) (_ *tailnet.Conn, err error) {
	network, err := tailnet.NewConn(&tailnet.Options{
		ID:                  agentID,
		Addresses:           a.wireguardAddresses(agentID),
		DERPMap:             derpMap,
		DERPForceWebSockets: derpForceWebSockets,
		Logger:              a.logger.Named("net.tailnet"),
		ListenPort:          a.tailnetListenPort,
		BlockEndpoints:      disableDirectConnections,
	})
	if err != nil {
		return nil, xerrors.Errorf("create tailnet: %w", err)
	}
	defer func() {
		if err != nil {
			network.Close()
		}
	}()

	// SSH listener: connections are served by the agent's SSH server.
	sshListener, err := network.Listen("tcp", ":"+strconv.Itoa(codersdk.WorkspaceAgentSSHPort))
	if err != nil {
		return nil, xerrors.Errorf("listen on the ssh port: %w", err)
	}
	defer func() {
		if err != nil {
			_ = sshListener.Close()
		}
	}()
	if err = a.trackConnGoroutine(func() {
		_ = a.sshServer.Serve(sshListener)
	}); err != nil {
		return nil, err
	}

	// Reconnecting PTY listener: each accepted connection sends a small
	// length-prefixed JSON init message before the PTY byte stream.
	reconnectingPTYListener, err := network.Listen("tcp", ":"+strconv.Itoa(codersdk.WorkspaceAgentReconnectingPTYPort))
	if err != nil {
		return nil, xerrors.Errorf("listen for reconnecting pty: %w", err)
	}
	defer func() {
		if err != nil {
			_ = reconnectingPTYListener.Close()
		}
	}()
	if err = a.trackConnGoroutine(func() {
		logger := a.logger.Named("reconnecting-pty")
		var wg sync.WaitGroup
		for {
			conn, err := reconnectingPTYListener.Accept()
			if err != nil {
				if !a.isClosed() {
					logger.Debug(ctx, "accept pty failed", slog.Error(err))
				}
				break
			}
			logger.Debug(ctx, "accepted conn", slog.F("remote", conn.RemoteAddr().String()))
			wg.Add(1)
			// The closed channel ties the connection's lifetime to the
			// agent: when the agent closes first, the conn is closed too.
			closed := make(chan struct{})
			go func() {
				select {
				case <-closed:
				case <-a.closed:
					_ = conn.Close()
				}
				wg.Done()
			}()
			go func() {
				defer close(closed)
				// This cannot use a JSON decoder, since that can
				// buffer additional data that is required for the PTY.
				rawLen := make([]byte, 2)
				_, err = conn.Read(rawLen)
				if err != nil {
					return
				}
				length := binary.LittleEndian.Uint16(rawLen)
				data := make([]byte, length)
				_, err = conn.Read(data)
				if err != nil {
					return
				}
				var msg codersdk.WorkspaceAgentReconnectingPTYInit
				err = json.Unmarshal(data, &msg)
				if err != nil {
					logger.Warn(ctx, "failed to unmarshal init", slog.F("raw", data))
					return
				}
				_ = a.handleReconnectingPTY(ctx, logger, msg, conn)
			}()
		}
		wg.Wait()
	}); err != nil {
		return nil, err
	}

	// Speedtest listener: each connection is handed to tailscale's
	// speedtest server implementation.
	speedtestListener, err := network.Listen("tcp", ":"+strconv.Itoa(codersdk.WorkspaceAgentSpeedtestPort))
	if err != nil {
		return nil, xerrors.Errorf("listen for speedtest: %w", err)
	}
	defer func() {
		if err != nil {
			_ = speedtestListener.Close()
		}
	}()
	if err = a.trackConnGoroutine(func() {
		var wg sync.WaitGroup
		for {
			conn, err := speedtestListener.Accept()
			if err != nil {
				if !a.isClosed() {
					a.logger.Debug(ctx, "speedtest listener failed", slog.Error(err))
				}
				break
			}
			wg.Add(1)
			closed := make(chan struct{})
			go func() {
				select {
				case <-closed:
				case <-a.closed:
					_ = conn.Close()
				}
				wg.Done()
			}()
			go func() {
				defer close(closed)
				_ = speedtest.ServeConn(conn)
			}()
		}
		wg.Wait()
	}); err != nil {
		return nil, err
	}

	// HTTP API listener: serves the agent's internal HTTP API over the
	// tailnet with conservative timeouts.
	apiListener, err := network.Listen("tcp", ":"+strconv.Itoa(codersdk.WorkspaceAgentHTTPAPIServerPort))
	if err != nil {
		return nil, xerrors.Errorf("api listener: %w", err)
	}
	defer func() {
		if err != nil {
			_ = apiListener.Close()
		}
	}()
	if err = a.trackConnGoroutine(func() {
		defer apiListener.Close()
		server := &http.Server{
			Handler:           a.apiHandler(),
			ReadTimeout:       20 * time.Second,
			ReadHeaderTimeout: 20 * time.Second,
			WriteTimeout:      20 * time.Second,
			ErrorLog:          slog.Stdlib(ctx, a.logger.Named("http_api_server"), slog.LevelInfo),
		}
		go func() {
			// Close the server when either the context ends or the
			// agent shuts down.
			select {
			case <-ctx.Done():
			case <-a.closed:
			}
			_ = server.Close()
		}()

		err := server.Serve(apiListener)
		if err != nil && !xerrors.Is(err, http.ErrServerClosed) && !strings.Contains(err.Error(), "use of closed network connection") {
			a.logger.Critical(ctx, "serve HTTP API server", slog.Error(err))
		}
	}); err != nil {
		return nil, err
	}

	return network, nil
}
|
|
|
|
|
2022-10-24 03:35:08 +00:00
|
|
|
// runCoordinator runs a coordinator and returns whether a reconnect
|
|
|
|
// should occur.
|
|
|
|
func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error {
|
2023-01-23 20:05:29 +00:00
|
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
|
|
defer cancel()
|
|
|
|
|
2023-01-29 21:47:24 +00:00
|
|
|
coordinator, err := a.client.Listen(ctx)
|
2022-10-24 03:35:08 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
2022-09-01 01:09:44 +00:00
|
|
|
}
|
2022-10-24 03:35:08 +00:00
|
|
|
defer coordinator.Close()
|
2023-02-27 15:20:24 +00:00
|
|
|
a.logger.Info(ctx, "connected to coordination endpoint")
|
2023-02-24 16:16:29 +00:00
|
|
|
sendNodes, errChan := tailnet.ServeCoordinator(coordinator, func(nodes []*tailnet.Node) error {
|
|
|
|
return network.UpdateNodes(nodes, false)
|
|
|
|
})
|
2022-10-24 03:35:08 +00:00
|
|
|
network.SetNodeCallback(sendNodes)
|
2022-09-01 01:09:44 +00:00
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
2022-10-24 03:35:08 +00:00
|
|
|
return ctx.Err()
|
2022-09-01 01:09:44 +00:00
|
|
|
case err := <-errChan:
|
2022-10-24 03:35:08 +00:00
|
|
|
return err
|
2022-04-25 18:30:39 +00:00
|
|
|
}
|
2022-09-01 01:09:44 +00:00
|
|
|
}
|
2022-03-22 19:17:50 +00:00
|
|
|
|
2023-07-26 16:21:04 +00:00
|
|
|
// runDERPMapSubscriber runs a coordinator and returns if a reconnect should occur.
|
|
|
|
func (a *agent) runDERPMapSubscriber(ctx context.Context, network *tailnet.Conn) error {
|
|
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
updates, closer, err := a.client.DERPMapUpdates(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer closer.Close()
|
|
|
|
|
|
|
|
a.logger.Info(ctx, "connected to derp map endpoint")
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return ctx.Err()
|
|
|
|
case update := <-updates:
|
|
|
|
if update.Err != nil {
|
|
|
|
return update.Err
|
|
|
|
}
|
|
|
|
if update.DERPMap != nil && !tailnet.CompareDERPMaps(network.DERPMap(), update.DERPMap) {
|
|
|
|
a.logger.Info(ctx, "updating derp map due to detected changes")
|
|
|
|
network.SetDERPMap(update.DERPMap)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-06 19:20:25 +00:00
|
|
|
// runStartupScript executes the workspace startup script (if any),
// labeling the run as the "startup" lifecycle for logging and for
// streaming output to the server.
func (a *agent) runStartupScript(ctx context.Context, script string) error {
	return a.runScript(ctx, "startup", script)
}
|
|
|
|
|
|
|
|
// runShutdownScript executes the workspace shutdown script (if any),
// labeling the run as the "shutdown" lifecycle for logging.
func (a *agent) runShutdownScript(ctx context.Context, script string) error {
	return a.runScript(ctx, "shutdown", script)
}
|
|
|
|
|
2023-06-20 11:41:55 +00:00
|
|
|
// runScript executes the given lifecycle script ("startup" or
// "shutdown") through the agent's SSH server command environment.
// Output is always mirrored to a per-lifecycle log file in the agent's
// log directory; for the startup lifecycle it is additionally streamed
// to the server via PatchLogs. Returns ctx.Err() when the context is
// canceled during the run, nil for an empty script, and a wrapped error
// otherwise. Execution time and exit code are logged on completion.
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
	if script == "" {
		return nil
	}

	logger := a.logger.With(slog.F("lifecycle", lifecycle))

	logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
	fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
	if err != nil {
		return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
	}
	defer func() {
		err := fileWriter.Close()
		if err != nil {
			logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
		}
	}()

	cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
	if err != nil {
		return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
	}
	cmd := cmdPty.AsExec()

	// By default both streams go only to the local log file; the startup
	// lifecycle additionally tees them to the server below.
	var stdout, stderr io.Writer = fileWriter, fileWriter
	if lifecycle == "startup" {
		send, flushAndClose := agentsdk.LogsSender(a.client.PatchLogs, logger)
		// If ctx is canceled here (or in a writer below), we may be
		// discarding logs, but that's okay because we're shutting down
		// anyway. We could consider creating a new context here if we
		// want better control over flush during shutdown.
		defer func() {
			if err := flushAndClose(ctx); err != nil {
				logger.Warn(ctx, "flush startup logs failed", slog.Error(err))
			}
		}()

		infoW := agentsdk.StartupLogsWriter(ctx, send, codersdk.WorkspaceAgentLogSourceStartupScript, codersdk.LogLevelInfo)
		defer infoW.Close()
		errW := agentsdk.StartupLogsWriter(ctx, send, codersdk.WorkspaceAgentLogSourceStartupScript, codersdk.LogLevelError)
		defer errW.Close()

		stdout = io.MultiWriter(fileWriter, infoW)
		stderr = io.MultiWriter(fileWriter, errW)
	}

	cmd.Stdout = stdout
	cmd.Stderr = stderr

	// Record execution time and exit code for observability. This defer
	// reads the named return err, so it must run after cmd.Run below.
	start := time.Now()
	defer func() {
		end := time.Now()
		execTime := end.Sub(start)
		exitCode := 0
		if err != nil {
			exitCode = 255 // Unknown status.
			var exitError *exec.ExitError
			if xerrors.As(err, &exitError) {
				exitCode = exitError.ExitCode()
			}
			logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
		} else {
			logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
		}
	}()

	err = cmd.Run()
	if err != nil {
		// cmd.Run does not return a context canceled error, it returns "signal: killed".
		if ctx.Err() != nil {
			return ctx.Err()
		}

		return xerrors.Errorf("%s script: run: %w", lifecycle, err)
	}
	return nil
}
|
|
|
|
|
2023-01-29 21:47:24 +00:00
|
|
|
// handleReconnectingPTY attaches conn to the reconnecting PTY session
// identified by msg.ID, creating the session first if it does not exist.
// Creation is serialized via the reconnectingPTYs map: the first caller
// to store a channel for an ID creates the PTY and publishes it on that
// channel; concurrent callers wait on it. The call blocks until the
// attachment ends.
func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) {
	defer conn.Close()
	a.metrics.connectionsTotal.Add(1)

	a.connCountReconnectingPTY.Add(1)
	defer a.connCountReconnectingPTY.Add(-1)

	connectionID := uuid.NewString()
	connLogger := logger.With(slog.F("message_id", msg.ID), slog.F("connection_id", connectionID))
	connLogger.Debug(ctx, "starting handler")

	defer func() {
		if err := retErr; err != nil {
			a.closeMutex.Lock()
			closed := a.isClosed()
			a.closeMutex.Unlock()

			// If the agent is closed, we don't want to
			// log this as an error since it's expected.
			if closed {
				connLogger.Debug(ctx, "reconnecting pty failed with attach error (agent closed)", slog.Error(err))
			} else {
				connLogger.Error(ctx, "reconnecting pty failed with attach error", slog.Error(err))
			}
		}
		connLogger.Debug(ctx, "reconnecting pty connection closed")
	}()

	var rpty reconnectingpty.ReconnectingPTY
	sendConnected := make(chan reconnectingpty.ReconnectingPTY, 1)
	// On store, reserve this ID to prevent multiple concurrent new connections.
	waitReady, ok := a.reconnectingPTYs.LoadOrStore(msg.ID, sendConnected)
	if ok {
		// Another handler owns creation; wait for it to publish the PTY.
		close(sendConnected) // Unused.
		connLogger.Debug(ctx, "connecting to existing reconnecting pty")
		c, ok := waitReady.(chan reconnectingpty.ReconnectingPTY)
		if !ok {
			return xerrors.Errorf("found invalid type in reconnecting pty map: %T", waitReady)
		}
		rpty, ok = <-c
		if !ok || rpty == nil {
			return xerrors.Errorf("reconnecting pty closed before connection")
		}
		c <- rpty // Put it back for the next reconnect.
	} else {
		connLogger.Debug(ctx, "creating new reconnecting pty")

		// If creation fails before the PTY is published, release the
		// map reservation so a later attempt can retry.
		connected := false
		defer func() {
			if !connected && retErr != nil {
				a.reconnectingPTYs.Delete(msg.ID)
				close(sendConnected)
			}
		}()

		// Empty command will default to the users shell!
		cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil)
		if err != nil {
			a.metrics.reconnectingPTYErrors.WithLabelValues("create_command").Add(1)
			return xerrors.Errorf("create command: %w", err)
		}

		rpty = reconnectingpty.New(ctx, cmd, &reconnectingpty.Options{
			Timeout: a.reconnectingPTYTimeout,
			Metrics: a.metrics.reconnectingPTYErrors,
		}, logger.With(slog.F("message_id", msg.ID)))

		// Remove the map entry once the PTY exits, so a fresh one can
		// be created for the same ID afterwards.
		if err = a.trackConnGoroutine(func() {
			rpty.Wait()
			a.reconnectingPTYs.Delete(msg.ID)
		}); err != nil {
			rpty.Close(err)
			return xerrors.Errorf("start routine: %w", err)
		}

		connected = true
		sendConnected <- rpty
	}
	return rpty.Attach(ctx, connectionID, conn, msg.Height, msg.Width, connLogger)
}
|
|
|
|
|
2023-03-02 14:06:00 +00:00
|
|
|
// startReportingConnectionStats runs the connection stats reporting
// goroutine. It registers a tailnet connection-stats callback that
// aggregates network counters, session counts, median peer latency
// (measured by pinging each active peer), and collected agent metrics
// into an agentsdk.Stats, which is pushed to connStatsChan for the
// reporting loop created by a.client.ReportStats.
func (a *agent) startReportingConnectionStats(ctx context.Context) {
	reportStats := func(networkStats map[netlogtype.Connection]netlogtype.Counts) {
		stats := &agentsdk.Stats{
			ConnectionCount:    int64(len(networkStats)),
			ConnectionsByProto: map[string]int64{},
		}
		for conn, counts := range networkStats {
			stats.ConnectionsByProto[conn.Proto.String()]++
			stats.RxBytes += int64(counts.RxBytes)
			stats.RxPackets += int64(counts.RxPackets)
			stats.TxBytes += int64(counts.TxBytes)
			stats.TxPackets += int64(counts.TxPackets)
		}

		// The count of active sessions.
		sshStats := a.sshServer.ConnStats()
		stats.SessionCountSSH = sshStats.Sessions
		stats.SessionCountVSCode = sshStats.VSCode
		stats.SessionCountJetBrains = sshStats.JetBrains

		stats.SessionCountReconnectingPTY = a.connCountReconnectingPTY.Load()

		// Compute the median connection latency!
		// Peers are pinged concurrently; results are collected under mu.
		var wg sync.WaitGroup
		var mu sync.Mutex
		status := a.network.Status()
		durations := []float64{}
		pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
		defer cancelFunc()
		for nodeID, peer := range status.Peer {
			if !peer.Active {
				continue
			}
			addresses, found := a.network.NodeAddresses(nodeID)
			if !found {
				continue
			}
			if len(addresses) == 0 {
				continue
			}
			wg.Add(1)
			go func() {
				defer wg.Done()
				// Failed pings are simply omitted from the median.
				duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
				if err != nil {
					return
				}
				mu.Lock()
				durations = append(durations, float64(duration.Microseconds()))
				mu.Unlock()
			}()
		}
		wg.Wait()
		sort.Float64s(durations)
		durationsLength := len(durations)
		// -1 signals "no latency measured"; otherwise take the median
		// (mean of the two middle values for an even count).
		if durationsLength == 0 {
			stats.ConnectionMedianLatencyMS = -1
		} else if durationsLength%2 == 0 {
			stats.ConnectionMedianLatencyMS = (durations[durationsLength/2-1] + durations[durationsLength/2]) / 2
		} else {
			stats.ConnectionMedianLatencyMS = durations[durationsLength/2]
		}
		// Convert from microseconds to milliseconds.
		stats.ConnectionMedianLatencyMS /= 1000

		// Collect agent metrics.
		// Agent metrics are changing all the time, so there is no need to perform
		// reflect.DeepEqual to see if stats should be transferred.

		metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
		defer cancelFunc()
		stats.Metrics = a.collectMetrics(metricsCtx)

		a.latestStat.Store(stats)

		// Don't block forever if the agent is closing.
		select {
		case a.connStatsChan <- stats:
		case <-a.closed:
		}
	}

	// Report statistics from the created network.
	cl, err := a.client.ReportStats(ctx, a.logger, a.connStatsChan, func(d time.Duration) {
		a.network.SetConnStatsCallback(d, 2048,
			func(_, _ time.Time, virtual, _ map[netlogtype.Connection]netlogtype.Counts) {
				reportStats(virtual)
			},
		)
	})
	if err != nil {
		a.logger.Error(ctx, "agent failed to report stats", slog.Error(err))
	} else {
		if err = a.trackConnGoroutine(func() {
			// This is OK because the agent never re-creates the tailnet
			// and the only shutdown indicator is agent.Close().
			<-a.closed
			_ = cl.Close()
		}); err != nil {
			a.logger.Debug(ctx, "report stats goroutine", slog.Error(err))
			_ = cl.Close()
		}
	}
}
|
|
|
|
|
2022-02-19 05:13:32 +00:00
|
|
|
// isClosed returns whether the API is closed or not.
|
2022-03-30 22:59:54 +00:00
|
|
|
func (a *agent) isClosed() bool {
|
2022-02-19 05:13:32 +00:00
|
|
|
select {
|
2022-03-30 22:59:54 +00:00
|
|
|
case <-a.closed:
|
2022-02-19 05:13:32 +00:00
|
|
|
return true
|
|
|
|
default:
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-26 18:01:49 +00:00
|
|
|
// HTTPDebug returns an http.Handler exposing debug endpoints for the
// agent's tailnet connection: magicsock internal state and a toggle for
// magicsock debug logging. Endpoints respond 404 until the network has
// been initialized.
func (a *agent) HTTPDebug() http.Handler {
	r := chi.NewRouter()

	// requireNetwork snapshots the current tailnet connection under the
	// close mutex, writing a 404 and returning false when it is not
	// ready yet.
	requireNetwork := func(w http.ResponseWriter) (*tailnet.Conn, bool) {
		a.closeMutex.Lock()
		network := a.network
		a.closeMutex.Unlock()

		if network == nil {
			w.WriteHeader(http.StatusNotFound)
			_, _ = w.Write([]byte("network is not ready yet"))
			return nil, false
		}

		return network, true
	}

	r.Get("/debug/magicsock", func(w http.ResponseWriter, r *http.Request) {
		network, ok := requireNetwork(w)
		if !ok {
			return
		}
		network.MagicsockServeHTTPDebug(w, r)
	})

	r.Get("/debug/magicsock/debug-logging/{state}", func(w http.ResponseWriter, r *http.Request) {
		// {state} must parse as a boolean (e.g. "true", "0", "t").
		state := chi.URLParam(r, "state")
		stateBool, err := strconv.ParseBool(state)
		if err != nil {
			w.WriteHeader(http.StatusBadRequest)
			_, _ = fmt.Fprintf(w, "invalid state %q, must be a boolean", state)
			return
		}

		network, ok := requireNetwork(w)
		if !ok {
			return
		}

		network.MagicsockSetDebugLoggingEnabled(stateBool)
		a.logger.Info(r.Context(), "updated magicsock debug logging due to debug request", slog.F("new_state", stateBool))

		w.WriteHeader(http.StatusOK)
		_, _ = fmt.Fprintf(w, "updated magicsock debug logging to %v", stateBool)
	})

	r.NotFound(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNotFound)
		_, _ = w.Write([]byte("404 not found"))
	})

	return r
}
|
|
|
|
|
2022-03-30 22:59:54 +00:00
|
|
|
// Close gracefully shuts down the agent: it stops the SSH server, runs
// the shutdown script (bounded by its configured timeout), reports the
// final lifecycle state, then tears down the network and waits for all
// tracked connection goroutines to exit. Calling Close on an already
// closed agent is a no-op.
func (a *agent) Close() error {
	a.closeMutex.Lock()
	defer a.closeMutex.Unlock()
	if a.isClosed() {
		return nil
	}

	// A fresh background context is used so shutdown work is not cut
	// short by the agent's own (soon to be canceled) context.
	ctx := context.Background()
	a.logger.Info(ctx, "shutting down agent")
	a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShuttingDown)

	// Attempt to gracefully shut down all active SSH connections and
	// stop accepting new ones.
	err := a.sshServer.Shutdown(ctx)
	if err != nil {
		a.logger.Error(ctx, "ssh server shutdown", slog.Error(err))
	}

	lifecycleState := codersdk.WorkspaceAgentLifecycleOff
	if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
		scriptDone := make(chan error, 1)
		go func() {
			defer close(scriptDone)
			scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
		}()

		var timeout <-chan time.Time
		// If timeout is zero, an older version of the coder
		// provider was used. Otherwise a timeout is always > 0.
		if manifest.ShutdownScriptTimeout > 0 {
			t := time.NewTimer(manifest.ShutdownScriptTimeout)
			defer t.Stop()
			timeout = t.C
		}

		var err error
		select {
		case err = <-scriptDone:
		case <-timeout:
			a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
			a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
			err = <-scriptDone // The script can still complete after a timeout.
		}
		if err != nil {
			lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
		}
	}

	// Set final state and wait for it to be reported because context
	// cancellation will stop the report loop.
	a.setLifecycle(ctx, lifecycleState)

	// Wait for the lifecycle to be reported, but don't wait forever so
	// that we don't break user expectations.
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
lifecycleWaitLoop:
	for {
		select {
		case <-ctx.Done():
			break lifecycleWaitLoop
		case s := <-a.lifecycleReported:
			if s == lifecycleState {
				break lifecycleWaitLoop
			}
		}
	}

	// Signal closure, then tear down remaining resources and wait for
	// every tracked goroutine to finish.
	close(a.closed)
	a.closeCancel()
	_ = a.sshServer.Close()
	if a.network != nil {
		_ = a.network.Close()
	}
	a.connCloseWait.Wait()

	return nil
}
|
2022-04-29 22:30:10 +00:00
|
|
|
|
2022-11-24 12:22:20 +00:00
|
|
|
// userHomeDir returns the home directory of the current user, giving
|
|
|
|
// priority to the $HOME environment variable.
|
|
|
|
func userHomeDir() (string, error) {
|
|
|
|
// First we check the environment.
|
|
|
|
homedir, err := os.UserHomeDir()
|
|
|
|
if err == nil {
|
|
|
|
return homedir, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// As a fallback, we try the user information.
|
|
|
|
u, err := user.Current()
|
|
|
|
if err != nil {
|
|
|
|
return "", xerrors.Errorf("current user: %w", err)
|
|
|
|
}
|
|
|
|
return u.HomeDir, nil
|
2022-05-18 14:10:40 +00:00
|
|
|
}
|
2023-02-07 21:35:09 +00:00
|
|
|
|
|
|
|
// expandDirectory converts a directory path to an absolute path.
|
|
|
|
// It primarily resolves the home directory and any environment
|
|
|
|
// variables that may be set
|
|
|
|
func expandDirectory(dir string) (string, error) {
|
|
|
|
if dir == "" {
|
|
|
|
return "", nil
|
|
|
|
}
|
|
|
|
if dir[0] == '~' {
|
|
|
|
home, err := userHomeDir()
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
dir = filepath.Join(home, dir[1:])
|
|
|
|
}
|
2023-04-14 14:32:18 +00:00
|
|
|
dir = os.ExpandEnv(dir)
|
|
|
|
|
|
|
|
if !filepath.IsAbs(dir) {
|
|
|
|
home, err := userHomeDir()
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
dir = filepath.Join(home, dir)
|
|
|
|
}
|
|
|
|
return dir, nil
|
2023-02-07 21:35:09 +00:00
|
|
|
}
|
2023-05-18 03:49:25 +00:00
|
|
|
|
|
|
|
// EnvAgentSubsystem is the name of the environment variable used to
// denote the specialized environment in which the agent is running
// (e.g. envbox, envbuilder).
const EnvAgentSubsystem = "CODER_AGENT_SUBSYSTEM"
|