package cli

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/gen2brain/beeep"
	"github.com/gofrs/flock"
	"github.com/google/uuid"
	"github.com/mattn/go-isatty"
	gossh "golang.org/x/crypto/ssh"
	gosshagent "golang.org/x/crypto/ssh/agent"
	"golang.org/x/term"
	"golang.org/x/xerrors"
	"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"

	"cdr.dev/slog"
	"cdr.dev/slog/sloggers/sloghuman"

	"github.com/coder/coder/v2/cli/cliui"
	"github.com/coder/coder/v2/cli/cliutil"
	"github.com/coder/coder/v2/coderd/autobuild/notify"
	"github.com/coder/coder/v2/coderd/util/ptr"
	"github.com/coder/coder/v2/codersdk"
	"github.com/coder/coder/v2/codersdk/workspacesdk"
	"github.com/coder/coder/v2/cryptorand"
	"github.com/coder/coder/v2/pty"
	"github.com/coder/retry"
	"github.com/coder/serpent"
)

var (
	workspacePollInterval   = time.Minute
	autostopNotifyCountdown = []time.Duration{30 * time.Minute}
)

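// ssh builds the `coder ssh <workspace>` command. It opens an interactive
// shell in the workspace or, with --stdio, emits the SSH protocol over
// stdin/stdout so it can act as an OpenSSH proxy command.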
func (r *RootCmd) ssh() *serpent.Command {
	var (
		stdio            bool
		forwardAgent     bool
		forwardGPG       bool
		identityAgent    string
		wsPollInterval   time.Duration
		waitEnum         string
		noWait           bool
		logDirPath       string
		remoteForwards   []string
		env              []string
		disableAutostart bool
	)
	client := new(codersdk.Client)
	cmd := &serpent.Command{
		Annotations: workspaceCommand,
		Use:         "ssh <workspace>",
		Short:       "Start a shell into a workspace",
		Middleware: serpent.Chain(
			serpent.RequireNArgs(1),
			r.InitClient(client),
		),
		Handler: func(inv *serpent.Invocation) (retErr error) {
			// Before dialing the SSH server over TCP, capture Interrupt signals
			// so that if we are interrupted, we have a chance to tear down the
			// TCP session cleanly before exiting. If we don't, then the TCP
			// session can persist for up to 72 hours, since we set a long
			// timeout on the Agent side of the connection. In particular,
			// OpenSSH sends SIGHUP to terminate a proxy command.
			ctx, stop := inv.SignalNotifyContext(inv.Context(), StopSignals...)
			defer stop()
			ctx, cancel := context.WithCancel(ctx)
			defer cancel()

			// Prevent unnecessary logs from the stdlib from messing up the TTY.
			// See: https://github.com/coder/coder/issues/13144
			log.SetOutput(io.Discard)

			logger := inv.Logger
			defer func() {
				if retErr != nil {
					// catch and log all returned errors so we see them in the
					// log file (if there is one)
					logger.Error(ctx, "command exit", slog.Error(retErr))
				}
			}()

			// In stdio mode, we can't allow any writes to stdin or stdout
			// because they are used by the SSH protocol.
			stdioReader, stdioWriter := inv.Stdin, inv.Stdout
			if stdio {
				inv.Stdin = stdioErrLogReader{inv.Logger}
				inv.Stdout = inv.Stderr
			}
			// This WaitGroup solves for a race condition where we were logging
			// while closing the log file in a defer. It probably solves
			// others too.
			var wg sync.WaitGroup
			wg.Add(1)
			defer wg.Done()

			if logDirPath != "" {
				nonce, err := cryptorand.StringCharset(cryptorand.Lower, 5)
				if err != nil {
					return xerrors.Errorf("generate nonce: %w", err)
				}
				logFilePath := filepath.Join(
					logDirPath,
					fmt.Sprintf(
						"coder-ssh-%s-%s.log",
						// The time portion makes it easier to find the right
						// log file.
						time.Now().Format("20060102-150405"),
						// The nonce prevents collisions, as SSH invocations
						// frequently happen in parallel.
						nonce,
					),
				)
				logFile, err := os.OpenFile(
					logFilePath,
					os.O_CREATE|os.O_APPEND|os.O_WRONLY|os.O_EXCL,
					0o600,
				)
				if err != nil {
					return xerrors.Errorf("error opening %s for logging: %w", logDirPath, err)
				}
				dc := cliutil.DiscardAfterClose(logFile)
				go func() {
					wg.Wait()
					_ = dc.Close()
				}()

				logger = logger.AppendSinks(sloghuman.Sink(dc))
				if r.verbose {
					logger = logger.Leveled(slog.LevelDebug)
				}

				// log HTTP requests
				client.SetLogger(logger)
			}
			stack := newCloserStack(ctx, logger)
			defer stack.close(nil)

			for _, remoteForward := range remoteForwards {
				isValid := validateRemoteForward(remoteForward)
				if !isValid {
					return xerrors.Errorf(`invalid format of remote-forward, expected: remote_port:local_address:local_port`)
				}
				if isValid && stdio {
					return xerrors.Errorf(`remote-forward can't be enabled in the stdio mode`)
				}
			}

			var parsedEnv [][2]string
			for _, e := range env {
				k, v, ok := strings.Cut(e, "=")
				if !ok {
					return xerrors.Errorf("invalid environment variable setting %q", e)
				}
				parsedEnv = append(parsedEnv, [2]string{k, v})
			}

			workspace, workspaceAgent, err := getWorkspaceAndAgent(ctx, inv, client, !disableAutostart, inv.Args[0])
			if err != nil {
				return err
			}
			// Select the startup script behavior based on template configuration or flags.
			var wait bool
			switch waitEnum {
			case "yes":
				wait = true
			case "no":
				wait = false
			case "auto":
				for _, script := range workspaceAgent.Scripts {
					if script.StartBlocksLogin {
						wait = true
						break
					}
				}
			default:
				return xerrors.Errorf("unknown wait value %q", waitEnum)
			}
			// The `--no-wait` flag is deprecated, but for now, check it.
			if noWait {
				wait = false
			}
			templateVersion, err := client.TemplateVersion(ctx, workspace.LatestBuild.TemplateVersionID)
			if err != nil {
				return err
			}
			var unsupportedWorkspace bool
			for _, warning := range templateVersion.Warnings {
				if warning == codersdk.TemplateVersionWarningUnsupportedWorkspaces {
					unsupportedWorkspace = true
					break
				}
			}
			if unsupportedWorkspace && isTTYErr(inv) {
				_, _ = fmt.Fprintln(inv.Stderr, "👋 Your workspace uses legacy parameters which are not supported anymore. Contact your administrator for assistance.")
			}

			updateWorkspaceBanner, outdated := verifyWorkspaceOutdated(client, workspace)
			if outdated && isTTYErr(inv) {
				_, _ = fmt.Fprintln(inv.Stderr, updateWorkspaceBanner)
			}
			// OpenSSH passes stderr directly to the calling TTY.
			// This is required in "stdio" mode so a connecting indicator can be displayed.
			err = cliui.Agent(ctx, inv.Stderr, workspaceAgent.ID, cliui.AgentOptions{
				Fetch:     client.WorkspaceAgent,
				FetchLogs: client.WorkspaceAgentLogsAfter,
				Wait:      wait,
			})
			if err != nil {
				if xerrors.Is(err, context.Canceled) {
					return cliui.Canceled
				}
				return err
			}

			if r.disableDirect {
				_, _ = fmt.Fprintln(inv.Stderr, "Direct connections disabled.")
			}
			conn, err := workspacesdk.New(client).
				DialAgent(ctx, workspaceAgent.ID, &workspacesdk.DialAgentOptions{
					Logger:         logger,
					BlockEndpoints: r.disableDirect,
				})
			if err != nil {
				return xerrors.Errorf("dial agent: %w", err)
			}
			if err = stack.push("agent conn", conn); err != nil {
				return err
			}
			conn.AwaitReachable(ctx)

			stopPolling := tryPollWorkspaceAutostop(ctx, client, workspace)
			defer stopPolling()
			if stdio {
				rawSSH, err := conn.SSH(ctx)
				if err != nil {
					return xerrors.Errorf("connect SSH: %w", err)
				}
				copier := newRawSSHCopier(logger, rawSSH, stdioReader, stdioWriter)
				if err = stack.push("rawSSHCopier", copier); err != nil {
					return err
				}

				wg.Add(1)
				go func() {
					defer wg.Done()
					watchAndClose(ctx, func() error {
						stack.close(xerrors.New("watchAndClose"))
						return nil
					}, logger, client, workspace)
				}()
				copier.copy(&wg)
				return nil
			}
			sshClient, err := conn.SSHClient(ctx)
			if err != nil {
				return xerrors.Errorf("ssh client: %w", err)
			}
			if err = stack.push("ssh client", sshClient); err != nil {
				return err
			}

			sshSession, err := sshClient.NewSession()
			if err != nil {
				return xerrors.Errorf("ssh session: %w", err)
			}
			if err = stack.push("sshSession", sshSession); err != nil {
				return err
			}
			wg.Add(1)
			go func() {
				defer wg.Done()
				watchAndClose(
					ctx,
					func() error {
						stack.close(xerrors.New("watchAndClose"))
						return nil
					},
					logger,
					client,
					workspace,
				)
			}()
			if identityAgent == "" {
				identityAgent = os.Getenv("SSH_AUTH_SOCK")
			}
			if forwardAgent && identityAgent != "" {
				err = gosshagent.ForwardToRemote(sshClient, identityAgent)
				if err != nil {
					return xerrors.Errorf("forward agent: %w", err)
				}
				err = gosshagent.RequestAgentForwarding(sshSession)
				if err != nil {
					return xerrors.Errorf("request agent forwarding failed: %w", err)
				}
			}
			if forwardGPG {
				if workspaceAgent.OperatingSystem == "windows" {
					return xerrors.New("GPG forwarding is not supported for Windows workspaces")
				}

				err = uploadGPGKeys(ctx, sshClient)
				if err != nil {
					return xerrors.Errorf("upload GPG public keys and ownertrust to workspace: %w", err)
				}
				closer, err := forwardGPGAgent(ctx, inv.Stderr, sshClient)
				if err != nil {
					return xerrors.Errorf("forward GPG socket: %w", err)
				}
				if err = stack.push("forwardGPGAgent", closer); err != nil {
					return err
				}
			}
			if len(remoteForwards) > 0 {
				for _, remoteForward := range remoteForwards {
					localAddr, remoteAddr, err := parseRemoteForward(remoteForward)
					if err != nil {
						return err
					}

					closer, err := sshRemoteForward(ctx, inv.Stderr, sshClient, localAddr, remoteAddr)
					if err != nil {
						return xerrors.Errorf("ssh remote forward: %w", err)
					}
					if err = stack.push("sshRemoteForward", closer); err != nil {
						return err
					}
				}
			}
			stdinFile, validIn := inv.Stdin.(*os.File)
			stdoutFile, validOut := inv.Stdout.(*os.File)
			if validIn && validOut && isatty.IsTerminal(stdinFile.Fd()) && isatty.IsTerminal(stdoutFile.Fd()) {
				inState, err := pty.MakeInputRaw(stdinFile.Fd())
				if err != nil {
					return err
				}
				defer func() {
					_ = pty.RestoreTerminal(stdinFile.Fd(), inState)
				}()
				outState, err := pty.MakeOutputRaw(stdoutFile.Fd())
				if err != nil {
					return err
				}
				defer func() {
					_ = pty.RestoreTerminal(stdoutFile.Fd(), outState)
				}()

				windowChange := listenWindowSize(ctx)
				go func() {
					for {
						select {
						case <-ctx.Done():
							return
						case <-windowChange:
						}
						width, height, err := term.GetSize(int(stdoutFile.Fd()))
						if err != nil {
							continue
						}
						_ = sshSession.WindowChange(height, width)
					}
				}()
			}
			for _, kv := range parsedEnv {
				if err := sshSession.Setenv(kv[0], kv[1]); err != nil {
					return xerrors.Errorf("setenv: %w", err)
				}
			}

			err = sshSession.RequestPty("xterm-256color", 128, 128, gossh.TerminalModes{})
			if err != nil {
				return xerrors.Errorf("request pty: %w", err)
			}

			sshSession.Stdin = inv.Stdin
			sshSession.Stdout = inv.Stdout
			sshSession.Stderr = inv.Stderr

			err = sshSession.Shell()
			if err != nil {
				return xerrors.Errorf("start shell: %w", err)
			}

			// Put cancel at the top of the defer stack to initiate
			// shutdown of services.
			defer cancel()

			if validOut {
				// Set initial window size.
				width, height, err := term.GetSize(int(stdoutFile.Fd()))
				if err == nil {
					_ = sshSession.WindowChange(height, width)
				}
			}
			err = sshSession.Wait()
			if err != nil {
				if exitErr := (&gossh.ExitError{}); errors.As(err, &exitErr) {
					// Clear the error since it's not useful beyond
					// reporting status.
					return ExitError(exitErr.ExitStatus(), nil)
				}
				// If the connection drops unexpectedly, we get an
				// ExitMissingError but no other error details, so try to at
				// least give the user a better message
				if errors.Is(err, &gossh.ExitMissingError{}) {
					return ExitError(255, xerrors.New("SSH connection ended unexpectedly"))
				}
				return xerrors.Errorf("session ended: %w", err)
			}
			return nil
		},
	}

	waitOption := serpent.Option{
		Flag:        "wait",
		Env:         "CODER_SSH_WAIT",
		Description: "Specifies whether or not to wait for the startup script to finish executing. Auto means that the agent startup script behavior configured in the workspace template is used.",
		Default:     "auto",
		Value:       serpent.EnumOf(&waitEnum, "yes", "no", "auto"),
	}
	cmd.Options = serpent.OptionSet{
		{
			Flag:        "stdio",
			Env:         "CODER_SSH_STDIO",
			Description: "Specifies whether to emit SSH output over stdin/stdout.",
			Value:       serpent.BoolOf(&stdio),
		},
		{
			Flag:          "forward-agent",
			FlagShorthand: "A",
			Env:           "CODER_SSH_FORWARD_AGENT",
			Description:   "Specifies whether to forward the SSH agent specified in $SSH_AUTH_SOCK.",
			Value:         serpent.BoolOf(&forwardAgent),
		},
		{
			Flag:          "forward-gpg",
			FlagShorthand: "G",
			Env:           "CODER_SSH_FORWARD_GPG",
			Description:   "Specifies whether to forward the GPG agent. Unsupported on Windows workspaces, but supports all clients. Requires gnupg (gpg, gpgconf) on both the client and workspace. The GPG agent must already be running locally and will not be started for you. If a GPG agent is already running in the workspace, it will be attempted to be killed.",
			Value:         serpent.BoolOf(&forwardGPG),
		},
		{
			Flag:        "identity-agent",
			Env:         "CODER_SSH_IDENTITY_AGENT",
			Description: "Specifies which identity agent to use (overrides $SSH_AUTH_SOCK), forward agent must also be enabled.",
			Value:       serpent.StringOf(&identityAgent),
		},
		{
			Flag:        "workspace-poll-interval",
			Env:         "CODER_WORKSPACE_POLL_INTERVAL",
			Description: "Specifies how often to poll for workspace automated shutdown.",
			Default:     "1m",
			Value:       serpent.DurationOf(&wsPollInterval),
		},
		waitOption,
		{
			Flag:        "no-wait",
			Env:         "CODER_SSH_NO_WAIT",
			Description: "Enter workspace immediately after the agent has connected. This is the default if the template has configured the agent startup script behavior as non-blocking.",
			Value:       serpent.BoolOf(&noWait),
			UseInstead:  []serpent.Option{waitOption},
		},
		{
			Flag:          "log-dir",
			Description:   "Specify the directory containing SSH diagnostic log files.",
			Env:           "CODER_SSH_LOG_DIR",
			FlagShorthand: "l",
			Value:         serpent.StringOf(&logDirPath),
		},
		{
			Flag:          "remote-forward",
			Description:   "Enable remote port forwarding (remote_port:local_address:local_port).",
			Env:           "CODER_SSH_REMOTE_FORWARD",
			FlagShorthand: "R",
			Value:         serpent.StringArrayOf(&remoteForwards),
		},
		{
			Flag:          "env",
			Description:   "Set environment variable(s) for session (key1=value1,key2=value2,...).",
			Env:           "CODER_SSH_ENV",
			FlagShorthand: "e",
			Value:         serpent.StringArrayOf(&env),
		},
		sshDisableAutostartOption(serpent.BoolOf(&disableAutostart)),
	}
	return cmd
}

// watchAndClose ensures closer is called if the context is canceled or
// the workspace reaches the stopped state.
//
// Watching the stopped state is a work-around for cases
// where the agent is not gracefully shut down and the
// connection is left open. If, for instance, the networking
// is stopped before the agent is shut down, the disconnect
// will usually not propagate.
//
// See: https://github.com/coder/coder/issues/6180
func watchAndClose(ctx context.Context, closer func() error, logger slog.Logger, client *codersdk.Client, workspace codersdk.Workspace) {
	// Ensure session is ended on both context cancellation
	// and workspace stop.
	defer func() {
		err := closer()
		if err != nil {
			logger.Error(ctx, "error closing session", slog.Error(err))
		}
	}()

startWatchLoop:
	for {
		logger.Debug(ctx, "connecting to the coder server to watch workspace events")
		var wsWatch <-chan codersdk.Workspace
		var err error
		for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
			wsWatch, err = client.WatchWorkspace(ctx, workspace.ID)
			if err == nil {
				break
			}
			if ctx.Err() != nil {
				logger.Debug(ctx, "context expired", slog.Error(ctx.Err()))
				return
			}
		}

		for {
			select {
			case <-ctx.Done():
				logger.Debug(ctx, "context expired", slog.Error(ctx.Err()))
				return
			case w, ok := <-wsWatch:
				if !ok {
					continue startWatchLoop
				}

				// Transitioning to stop or delete could mean that
				// the agent will still gracefully stop. If a new
				// build is starting, there's no reason to wait for
				// the agent, it should be long gone.
				if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart {
					logger.Info(ctx, "new build started")
					return
				}
				// Note, we only react to the stopped state here because we
				// want to give the agent a chance to gracefully shut down
				// during "stopping".
				if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped {
					logger.Info(ctx, "workspace stopped")
					return
				}
			}
		}
	}
}

// getWorkspaceAndAgent returns the workspace and agent selected using the
// `<workspace>[.<agent>]` syntax via `input`.
// If autostart is true, the workspace will be started if it is not already running.
func getWorkspaceAndAgent(ctx context.Context, inv *serpent.Invocation, client *codersdk.Client, autostart bool, input string) (codersdk.Workspace, codersdk.WorkspaceAgent, error) { //nolint:revive
	var (
		workspace codersdk.Workspace
		// The input will be `owner/name.agent`.
		// The agent is optional.
		workspaceParts = strings.Split(input, ".")
		err            error
	)

	workspace, err = namedWorkspace(ctx, client, workspaceParts[0])
	if err != nil {
		return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, err
	}

	if workspace.LatestBuild.Transition != codersdk.WorkspaceTransitionStart {
		if !autostart {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.New("workspace must be started")
		}
		// Autostart the workspace for the user.
		// For some failure modes, return a better message.
		if workspace.LatestBuild.Transition == codersdk.WorkspaceTransitionDelete {
			// Any sort of deleting status, we should reject with a nicer error.
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.Errorf("workspace %q is deleted", workspace.Name)
		}
		if workspace.LatestBuild.Job.Status == codersdk.ProvisionerJobFailed {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{},
				xerrors.Errorf("workspace %q is in failed state, unable to autostart the workspace", workspace.Name)
		}
		// The workspace needs to be stopped before we can start it.
		// It cannot be in any pending or failed state.
		if workspace.LatestBuild.Status != codersdk.WorkspaceStatusStopped {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{},
				xerrors.Errorf("workspace must be started; was unable to autostart as the last build job is %q, expected %q",
					workspace.LatestBuild.Status,
					codersdk.WorkspaceStatusStopped,
				)
		}
		// Start workspace based on the last build parameters.
		// It's possible for a workspace build to fail due to the template requiring starting
		// workspaces with the active version.
		_, _ = fmt.Fprintf(inv.Stderr, "Workspace was stopped, starting workspace to allow connecting to %q...\n", workspace.Name)
		_, err = startWorkspace(inv, client, workspace, workspaceParameterFlags{}, WorkspaceStart)
		if cerr, ok := codersdk.AsError(err); ok && cerr.StatusCode() == http.StatusForbidden {
			_, err = startWorkspace(inv, client, workspace, workspaceParameterFlags{}, WorkspaceUpdate)
			if err != nil {
				return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.Errorf("start workspace with active template version: %w", err)
			}
			_, _ = fmt.Fprintln(inv.Stdout, "Unable to start the workspace with template version from last build. Your workspace has been updated to the current active template version.")
		} else if err != nil {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.Errorf("start workspace with current template version: %w", err)
		}

		// Refresh workspace state so that `outdated`, `build`, `template_*` fields are up-to-date.
		workspace, err = namedWorkspace(ctx, client, workspaceParts[0])
		if err != nil {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, err
		}
	}
	if workspace.LatestBuild.Job.CompletedAt == nil {
		err := cliui.WorkspaceBuild(ctx, inv.Stderr, client, workspace.LatestBuild.ID)
		if err != nil {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, err
		}
		// Fetch up-to-date build information after completion.
		workspace.LatestBuild, err = client.WorkspaceBuild(ctx, workspace.LatestBuild.ID)
		if err != nil {
			return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, err
		}
	}
	if workspace.LatestBuild.Transition == codersdk.WorkspaceTransitionDelete {
		return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.Errorf("workspace %q is being deleted", workspace.Name)
	}

	var agentName string
	if len(workspaceParts) >= 2 {
		agentName = workspaceParts[1]
	}
	workspaceAgent, err := getWorkspaceAgent(workspace, agentName)
	if err != nil {
		return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, err
	}

	return workspace, workspaceAgent, nil
}

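// getWorkspaceAgent returns the agent from the workspace's latest build whose
// name matches agentName. If agentName is empty, the sole agent is returned,
// or one is chosen at random when the workspace has more than one.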
func getWorkspaceAgent(workspace codersdk.Workspace, agentName string) (workspaceAgent codersdk.WorkspaceAgent, err error) {
	resources := workspace.LatestBuild.Resources

	agents := make([]codersdk.WorkspaceAgent, 0)
	for _, resource := range resources {
		agents = append(agents, resource.Agents...)
	}
	if len(agents) == 0 {
		return codersdk.WorkspaceAgent{}, xerrors.Errorf("workspace %q has no agents", workspace.Name)
	}
	if agentName != "" {
		for _, otherAgent := range agents {
			if otherAgent.Name != agentName {
				continue
			}
			workspaceAgent = otherAgent
			break
		}
		if workspaceAgent.ID == uuid.Nil {
			return codersdk.WorkspaceAgent{}, xerrors.Errorf("agent not found by name %q", agentName)
		}
	}
	if workspaceAgent.ID == uuid.Nil {
		if len(agents) > 1 {
			workspaceAgent, err = cryptorand.Element(agents)
			if err != nil {
				return codersdk.WorkspaceAgent{}, err
			}
		} else {
			workspaceAgent = agents[0]
		}
	}
	return workspaceAgent, nil
}

// Attempt to poll workspace autostop. We write a per-workspace lockfile to
// avoid spamming the user with notifications in case of multiple instances
// of the CLI running simultaneously.
func tryPollWorkspaceAutostop(ctx context.Context, client *codersdk.Client, workspace codersdk.Workspace) (stop func()) {
	lock := flock.New(filepath.Join(os.TempDir(), "coder-autostop-notify-"+workspace.ID.String()))
	conditionCtx, cancelCondition := context.WithCancel(ctx)
	condition := notifyCondition(conditionCtx, client, workspace.ID, lock)
	stopFunc := notify.Notify(condition, workspacePollInterval, autostopNotifyCountdown...)
	return func() {
		// With many "ssh" processes running, `lock.TryLockContext` can hang until the context is canceled.
		// Without this cancellation, a CLI process with a failed remote-forward could hang indefinitely.
		cancelCondition()
		stopFunc()
	}
}

// Notify the user if the workspace is due to shutdown.
func notifyCondition(ctx context.Context, client *codersdk.Client, workspaceID uuid.UUID, lock *flock.Flock) notify.Condition {
	return func(now time.Time) (deadline time.Time, callback func()) {
		// Keep trying to regain the lock.
		locked, err := lock.TryLockContext(ctx, workspacePollInterval)
		if err != nil || !locked {
			return time.Time{}, nil
		}

		ws, err := client.Workspace(ctx, workspaceID)
		if err != nil {
			return time.Time{}, nil
		}

		if ptr.NilOrZero(ws.TTLMillis) {
			return time.Time{}, nil
		}

		deadline = ws.LatestBuild.Deadline.Time
		callback = func() {
			ttl := deadline.Sub(now)
			var title, body string
			if ttl > time.Minute {
				title = fmt.Sprintf(`Workspace %s stopping soon`, ws.Name)
				body = fmt.Sprintf(
					`Your Coder workspace %s is scheduled to stop in %.0f mins`, ws.Name, ttl.Minutes())
			} else {
				title = fmt.Sprintf("Workspace %s stopping!", ws.Name)
				body = fmt.Sprintf("Your Coder workspace %s is stopping any time now!", ws.Name)
			}
			// notify user with a native system notification (best effort)
			_ = beeep.Notify(title, body, "")
		}
		return deadline.Truncate(time.Minute), callback
	}
}

// Verify if the user workspace is outdated and prepare an actionable message for the user.
func verifyWorkspaceOutdated(client *codersdk.Client, workspace codersdk.Workspace) (string, bool) {
	if !workspace.Outdated {
		return "", false // workspace is up-to-date
	}

	workspaceLink := buildWorkspaceLink(client.URL, workspace)
	return fmt.Sprintf("👋 Your workspace is outdated! Update it here: %s\n", workspaceLink), true
}

// Build the user workspace link which navigates to the Coder web UI.
func buildWorkspaceLink(serverURL *url.URL, workspace codersdk.Workspace) *url.URL {
	return serverURL.ResolveReference(&url.URL{Path: fmt.Sprintf("@%s/%s", workspace.OwnerName, workspace.Name)})
}

// runLocal runs a command on the local machine.
func runLocal(ctx context.Context, stdin io.Reader, name string, args ...string) ([]byte, error) {
	cmd := exec.CommandContext(ctx, name, args...)
	cmd.Stdin = stdin
	out, err := cmd.Output()
	if err != nil {
		var stderr []byte
		if exitErr := new(exec.ExitError); errors.As(err, &exitErr) {
			stderr = exitErr.Stderr
		}
		return out, xerrors.Errorf(
			"`%s %s` failed: stderr: %s\n\nstdout: %s\n\n%w",
			name,
			strings.Join(args, " "),
			bytes.TrimSpace(stderr),
			bytes.TrimSpace(out),
			err,
		)
	}
	return out, nil
}

// runRemoteSSH runs a command on a remote machine/workspace via SSH.
func runRemoteSSH(sshClient *gossh.Client, stdin io.Reader, cmd string) ([]byte, error) {
	sess, err := sshClient.NewSession()
	if err != nil {
		return nil, xerrors.Errorf("create SSH session: %w", err)
	}
	defer sess.Close()

	stderr := bytes.NewBuffer(nil)
	sess.Stdin = stdin
	// On fish, this was outputting to stderr instead of stdout.
	// The tests pass differently on different Linux machines,
	// so it's best we capture the output of both.
	out, err := sess.CombinedOutput(cmd)
	if err != nil {
		return out, xerrors.Errorf(
			"`%s` failed: stderr: %s\n\nstdout: %s\n\n%w",
			cmd,
			bytes.TrimSpace(stderr.Bytes()),
			bytes.TrimSpace(out),
			err,
		)
	}
	return out, nil
}

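// uploadGPGKeys exports the local user's GPG public keys and ownertrust and
// imports them into the workspace, killing any GPG agent already running
// there (and removing its socket) so that agent forwarding can take over.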
func uploadGPGKeys(ctx context.Context, sshClient *gossh.Client) error {
	// Check if the agent is running in the workspace already.
	//
	// Note: we don't support windows in the workspace for GPG forwarding so
	// using shell commands is fine.
	//
	// Note: we sleep after killing the agent because it doesn't always die
	// immediately.
	agentSocketBytes, err := runRemoteSSH(sshClient, nil, `sh -c '
set -eux
agent_socket=$(gpgconf --list-dir agent-socket)
echo "$agent_socket"
if [ -S "$agent_socket" ]; then
  echo "agent socket exists, attempting to kill it" >&2
  gpgconf --kill gpg-agent
  rm -f "$agent_socket"
  sleep 1
fi
test ! -S "$agent_socket"
'`)
	agentSocket := strings.TrimSpace(string(agentSocketBytes))
	if err != nil {
		return xerrors.Errorf("check if agent socket is running (check if %q exists): %w", agentSocket, err)
	}
	if agentSocket == "" {
		return xerrors.Errorf("agent socket path is empty, check the output of `gpgconf --list-dir agent-socket`")
	}

	// Read the user's public keys and ownertrust from GPG.
	pubKeyExport, err := runLocal(ctx, nil, "gpg", "--armor", "--export")
	if err != nil {
		return xerrors.Errorf("export local public keys from GPG: %w", err)
	}
	ownerTrustExport, err := runLocal(ctx, nil, "gpg", "--export-ownertrust")
	if err != nil {
		return xerrors.Errorf("export local ownertrust from GPG: %w", err)
	}

	// Import the public keys and ownertrust into the workspace.
	_, err = runRemoteSSH(sshClient, bytes.NewReader(pubKeyExport), "gpg --import")
	if err != nil {
		return xerrors.Errorf("import public keys into workspace: %w", err)
	}
	_, err = runRemoteSSH(sshClient, bytes.NewReader(ownerTrustExport), "gpg --import-ownertrust")
	if err != nil {
		return xerrors.Errorf("import ownertrust into workspace: %w", err)
	}

	// Kill the agent in the workspace if it was started by one of the above
	// commands.
	_, err = runRemoteSSH(sshClient, nil, fmt.Sprintf("gpgconf --kill gpg-agent && rm -f %q", agentSocket))
	if err != nil {
		return xerrors.Errorf("kill existing agent in workspace: %w", err)
	}

	return nil
}

func localGPGExtraSocket(ctx context.Context) (string, error) {
	localSocket, err := runLocal(ctx, nil, "gpgconf", "--list-dir", "agent-extra-socket")
	if err != nil {
		return "", xerrors.Errorf("get local GPG agent socket: %w", err)
	}
	return string(bytes.TrimSpace(localSocket)), nil
}

func remoteGPGAgentSocket(sshClient *gossh.Client) (string, error) {
	remoteSocket, err := runRemoteSSH(sshClient, nil, "gpgconf --list-dir agent-socket")
	if err != nil {
		return "", xerrors.Errorf("get remote GPG agent socket: %w", err)
	}
	return string(bytes.TrimSpace(remoteSocket)), nil
}

type closerWithName struct {
	name   string
	closer io.Closer
}

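// closerStack is a LIFO stack of named io.Closers: close() runs them in the
// reverse order they were pushed, and the stack closes itself once the
// context passed to newCloserStack is done.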
type closerStack struct {
	sync.Mutex
	closers []closerWithName
	closed  bool
	logger  slog.Logger
	err     error
	wg      sync.WaitGroup
}

func newCloserStack(ctx context.Context, logger slog.Logger) *closerStack {
	cs := &closerStack{logger: logger}
	go cs.closeAfterContext(ctx)
	return cs
}

func ( c * closerStack ) closeAfterContext ( ctx context . Context ) {
<- ctx . Done ( )
c . close ( ctx . Err ( ) )
}
func ( c * closerStack ) close ( err error ) {
c . Lock ( )
if c . closed {
c . Unlock ( )
2024-03-07 13:26:49 +00:00
c . wg . Wait ( )
		return
	}
	c.closed = true
	c.err = err
	c.wg.Add(1)
	defer c.wg.Done()
	c.Unlock()

	for i := len(c.closers) - 1; i >= 0; i-- {
		cwn := c.closers[i]
		cErr := cwn.closer.Close()
		c.logger.Debug(context.Background(),
			"closed item from stack", slog.F("name", cwn.name), slog.Error(cErr))
	}
}
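
// push adds a named closer to the stack. If the stack is already closed, the
// closer is closed immediately and an error wrapping the original close
// reason is returned.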
func (c *closerStack) push(name string, closer io.Closer) error {
	c.Lock()
	if c.closed {
		c.Unlock()
		// since we're refusing to push it on the stack, close it now
		err := closer.Close()
		c.logger.Error(context.Background(),
			"closed item rejected push", slog.F("name", name), slog.Error(err))
		return xerrors.Errorf("already closed: %w", c.err)
	}
	c.closers = append(c.closers, closerWithName{name: name, closer: closer})
	c.Unlock()
	return nil
}

// rawSSHCopier handles copying raw SSH data between the conn and the pair (r, w).
type rawSSHCopier struct {
	conn   *gonet.TCPConn
	logger slog.Logger
	r      io.Reader
	w      io.Writer

	done chan struct{}
}

func newRawSSHCopier(logger slog.Logger, conn *gonet.TCPConn, r io.Reader, w io.Writer) *rawSSHCopier {
	return &rawSSHCopier{conn: conn, logger: logger, r: r, w: w, done: make(chan struct{})}
}
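
// copy copies data in both directions between the local reader/writer pair
// and the SSH connection. It closes c.done once the server-to-client copy
// has finished; the client-to-server copy runs in a goroutine tracked by wg.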
func (c *rawSSHCopier) copy(wg *sync.WaitGroup) {
	defer close(c.done)
	logCtx := context.Background()
	wg.Add(1)
	go func() {
		defer wg.Done()
		// We close connections using CloseWrite instead of Close, so that the SSH server sees the
		// closed connection while reading, and shuts down cleanly. This will trigger the io.Copy
		// in the server-to-client direction to also be closed and the copy() routine will exit.
		// This ensures that we don't leave any state in the server, like forwarded ports, if
		// copy() were to return and the underlying tailnet connection were torn down before the
		// TCP session exits. This is a bit of a hack to block shutdown at the application layer,
		// since we can't serialize the TCP and tailnet layers shutting down.
		//
		// Of course, if the underlying transport is broken, io.Copy will still return.
		defer func() {
			cwErr := c.conn.CloseWrite()
			c.logger.Debug(logCtx, "closed raw SSH connection for writing", slog.Error(cwErr))
		}()

		_, err := io.Copy(c.conn, c.r)
		if err != nil {
			c.logger.Error(logCtx, "copy stdin error", slog.Error(err))
		} else {
			c.logger.Debug(logCtx, "copy stdin complete")
		}
	}()
	_, err := io.Copy(c.w, c.conn)
	if err != nil {
		c.logger.Error(logCtx, "copy stdout error", slog.Error(err))
	} else {
		c.logger.Debug(logCtx, "copy stdout complete")
	}
}
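
// Close half-closes the write side of the SSH connection so the server can
// shut down cleanly, then waits up to five seconds for copy() to finish so
// the underlying netstack isn't torn down mid-session.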
func (c *rawSSHCopier) Close() error {
	err := c.conn.CloseWrite()

	// give the copy() call a chance to return on a timeout, so that we don't
	// continue tearing down and close the underlying netstack before the SSH
	// session has a chance to gracefully shut down.
	t := time.NewTimer(5 * time.Second)
	defer t.Stop()
	select {
	case <-c.done:
	case <-t.C:
	}
	return err
}
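
// sshDisableAutostartOption returns the --disable-autostart flag definition,
// storing its value in src.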
func sshDisableAutostartOption(src *serpent.Bool) serpent.Option {
	return serpent.Option{
		Flag:        "disable-autostart",
		Description: "Disable starting the workspace automatically when connecting via SSH.",
		Env:         "CODER_SSH_DISABLE_AUTOSTART",
		Value:       src,
		Default:     "false",
	}
}
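
// stdioErrLogReader is an io.Reader that logs an error and returns io.EOF on
// every Read; it stands in for stdin where stdin must not be read in stdio
// mode.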
type stdioErrLogReader struct {
	l slog.Logger
}

func (r stdioErrLogReader) Read(_ []byte) (int, error) {
	r.l.Error(context.Background(), "reading from stdin in stdio mode is not allowed")
	return 0, io.EOF
}