feat: show tailnet peer diagnostics after coder ping (#12314)

Beginnings of a solution to #12297 

It doesn't cover disco or definitively show whether we successfully connected to DERP, but it does show a checklist of diagnostics for connecting to an agent.

For this first PR, I just added it to `coder ping` to see how we like it, but it could be incorporated into `coder ssh` _et al._ after a timeout.

```
$ coder ping dogfood2
p2p connection established in 147ms
pong from dogfood2 p2p via  95.217.xxx.yyy:42631  in 147ms
pong from dogfood2 p2p via  95.217.xxx.yyy:42631  in 140ms
pong from dogfood2 p2p via  95.217.xxx.yyy:42631  in 140ms
✔ preferred DERP region: 999 (Council Bluffs, Iowa)
✔ sent local data to Coder networking coordinator
✔ received remote agent data from Coder networking coordinator
    preferred DERP region: 10013 (Europe Fly.io (Paris))
    endpoints: 95.217.xxx.yyy:42631, 95.217.xxx.yyy:37576, 172.17.0.1:37576, 172.20.0.10:37576
✔ Wireguard handshake 11s ago
```
Spike Curtis 2024-02-27 22:04:46 +04:00 committed by GitHub
parent 32691e67e6
commit 4e7beee102
11 changed files with 503 additions and 8 deletions

View File

@@ -2,13 +2,17 @@ package cliui
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet"
)
var errAgentShuttingDown = xerrors.New("agent is shutting down")
@@ -281,3 +285,55 @@ type closeFunc func() error
func (c closeFunc) Close() error {
return c()
}
func PeerDiagnostics(w io.Writer, d tailnet.PeerDiagnostics) {
if d.PreferredDERP > 0 {
rn, ok := d.DERPRegionNames[d.PreferredDERP]
if !ok {
rn = "unknown"
}
_, _ = fmt.Fprintf(w, "✔ preferred DERP region: %d (%s)\n", d.PreferredDERP, rn)
} else {
_, _ = fmt.Fprint(w, "✘ not connected to DERP\n")
}
if d.SentNode {
_, _ = fmt.Fprint(w, "✔ sent local data to Coder networking coodinator\n")
} else {
_, _ = fmt.Fprint(w, "✘ have not sent local data to Coder networking coordinator\n")
}
if d.ReceivedNode != nil {
dp := d.ReceivedNode.DERP
dn := ""
// should be 127.3.3.40:N where N is the DERP region
ap := strings.Split(dp, ":")
if len(ap) == 2 {
dp = ap[1]
di, err := strconv.Atoi(dp)
if err == nil {
var ok bool
dn, ok = d.DERPRegionNames[di]
if ok {
dn = fmt.Sprintf("(%s)", dn)
} else {
dn = "(unknown)"
}
}
}
_, _ = fmt.Fprintf(w,
"✔ received remote agent data from Coder networking coordinator\n preferred DERP region: %s %s\n endpoints: %s\n",
dp, dn, strings.Join(d.ReceivedNode.Endpoints, ", "))
} else {
_, _ = fmt.Fprint(w, "✘ have not received remote agent data from Coder networking coordinator\n")
}
if !d.LastWireguardHandshake.IsZero() {
ago := time.Since(d.LastWireguardHandshake)
symbol := "✔"
// wireguard is supposed to refresh the handshake at 5-minute intervals
if ago > 5*time.Minute {
symbol = "⚠"
}
_, _ = fmt.Fprintf(w, "%s Wireguard handshake %s ago\n", symbol, ago.Round(time.Second))
} else {
_, _ = fmt.Fprint(w, "✘ Wireguard is not connected\n")
}
}

View File

@@ -6,6 +6,7 @@ import (
"context"
"io"
"os"
"regexp"
"strings"
"sync/atomic"
"testing"
@@ -15,12 +16,14 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"
"github.com/coder/coder/v2/cli/clibase"
"github.com/coder/coder/v2/cli/clitest"
"github.com/coder/coder/v2/cli/cliui"
"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet"
"github.com/coder/coder/v2/testutil"
)
@@ -476,3 +479,191 @@ func TestAgent(t *testing.T) {
require.NoError(t, cmd.Invoke().Run())
})
}
func TestPeerDiagnostics(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
diags tailnet.PeerDiagnostics
want []*regexp.Regexp // must be ordered, can omit lines
}{
{
name: "noPreferredDERP",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: make(map[int]string),
SentNode: true,
ReceivedNode: &tailcfg.Node{DERP: "127.3.3.40:999"},
LastWireguardHandshake: time.Now(),
},
want: []*regexp.Regexp{
regexp.MustCompile("^✘ not connected to DERP$"),
},
},
{
name: "preferredDERP",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 23,
DERPRegionNames: map[int]string{
23: "testo",
},
SentNode: true,
ReceivedNode: &tailcfg.Node{DERP: "127.3.3.40:999"},
LastWireguardHandshake: time.Now(),
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✔ preferred DERP region: 23 \(testo\)$`),
},
},
{
name: "sentNode",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: true,
ReceivedNode: &tailcfg.Node{DERP: "127.3.3.40:999"},
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✔ sent local data to Coder networking coordinator$`),
},
},
{
name: "didntSendNode",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: false,
ReceivedNode: &tailcfg.Node{DERP: "127.3.3.40:999"},
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✘ have not sent local data to Coder networking coordinator$`),
},
},
{
name: "receivedNodeDERPOKNoEndpoints",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{999: "Embedded"},
SentNode: true,
ReceivedNode: &tailcfg.Node{DERP: "127.3.3.40:999"},
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✔ received remote agent data from Coder networking coordinator$`),
regexp.MustCompile(`preferred DERP region: 999 \(Embedded\)$`),
regexp.MustCompile(`endpoints: $`),
},
},
{
name: "receivedNodeDERPUnknownNoEndpoints",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: true,
ReceivedNode: &tailcfg.Node{DERP: "127.3.3.40:999"},
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✔ received remote agent data from Coder networking coordinator$`),
regexp.MustCompile(`preferred DERP region: 999 \(unknown\)$`),
regexp.MustCompile(`endpoints: $`),
},
},
{
name: "receivedNodeEndpointsNoDERP",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{999: "Embedded"},
SentNode: true,
ReceivedNode: &tailcfg.Node{Endpoints: []string{"99.88.77.66:4555", "33.22.11.0:3444"}},
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✔ received remote agent data from Coder networking coordinator$`),
regexp.MustCompile(`preferred DERP region:\s*$`),
regexp.MustCompile(`endpoints: 99\.88\.77\.66:4555, 33\.22\.11\.0:3444$`),
},
},
{
name: "didntReceiveNode",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: false,
ReceivedNode: nil,
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✘ have not received remote agent data from Coder networking coordinator$`),
},
},
{
name: "noWireguardHandshake",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: false,
ReceivedNode: nil,
LastWireguardHandshake: time.Time{},
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✘ Wireguard is not connected$`),
},
},
{
name: "wireguardHandshakeRecent",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: false,
ReceivedNode: nil,
LastWireguardHandshake: time.Now().Add(-5 * time.Second),
},
want: []*regexp.Regexp{
regexp.MustCompile(`^✔ Wireguard handshake \d+s ago$`),
},
},
{
name: "wireguardHandshakeOld",
diags: tailnet.PeerDiagnostics{
PreferredDERP: 0,
DERPRegionNames: map[int]string{},
SentNode: false,
ReceivedNode: nil,
LastWireguardHandshake: time.Now().Add(-450 * time.Second), // 7m30s
},
want: []*regexp.Regexp{
regexp.MustCompile(`^⚠ Wireguard handshake 7m\d+s ago$`),
},
},
}
for _, tc := range testCases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
r, w := io.Pipe()
go func() {
defer w.Close()
cliui.PeerDiagnostics(w, tc.diags)
}()
s := bufio.NewScanner(r)
i := 0
got := make([]string, 0)
for s.Scan() {
got = append(got, s.Text())
if i < len(tc.want) {
reg := tc.want[i]
if reg.Match(s.Bytes()) {
i++
}
}
}
if i < len(tc.want) {
t.Logf("failed to match regexp: %s\ngot:\n%s", tc.want[i].String(), strings.Join(got, "\n"))
t.FailNow()
}
})
}
}

View File

@@ -135,6 +135,8 @@ func (r *RootCmd) ping() *clibase.Cmd {
)
if n == int(pingNum) {
diags := conn.GetPeerDiagnostics()
cliui.PeerDiagnostics(inv.Stdout, diags)
return nil
}
}

View File

@@ -42,4 +42,32 @@ func TestPing(t *testing.T) {
cancel()
<-cmdDone
})
t.Run("1Ping", func(t *testing.T) {
t.Parallel()
client, workspace, agentToken := setupWorkspaceForAgent(t)
inv, root := clitest.New(t, "ping", "-n", "1", workspace.Name)
clitest.SetupConfig(t, client, root)
pty := ptytest.New(t)
inv.Stdin = pty.Input()
inv.Stderr = pty.Output()
inv.Stdout = pty.Output()
_ = agenttest.New(t, client.URL, agentToken)
_ = coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
cmdDone := tGo(t, func() {
err := inv.WithContext(ctx).Run()
assert.NoError(t, err)
})
pty.ExpectMatch("pong from " + workspace.Name)
pty.ExpectMatch("✔ received remote agent data from Coder networking coordinator")
cancel()
<-cmdDone
})
}

View File

@@ -563,7 +563,7 @@ func getWorkspaceAndAgent(ctx context.Context, inv *clibase.Invocation, client *
if workspace.LatestBuild.Transition != codersdk.WorkspaceTransitionStart {
if !autostart {
return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.New("workspace must be in start transition to ssh")
return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, xerrors.New("workspace must be started")
}
// Autostart the workspace for the user.
// For some failure modes, return a better message.
@@ -579,7 +579,7 @@ func getWorkspaceAndAgent(ctx context.Context, inv *clibase.Invocation, client *
// It cannot be in any pending or failed state.
if workspace.LatestBuild.Status != codersdk.WorkspaceStatusStopped {
return codersdk.Workspace{}, codersdk.WorkspaceAgent{},
xerrors.Errorf("workspace must be in start transition to ssh, was unable to autostart as the last build job is %q, expected %q",
xerrors.Errorf("workspace must be started; was unable to autostart as the last build job is %q, expected %q",
workspace.LatestBuild.Status,
codersdk.WorkspaceStatusStopped,
)

View File

@@ -414,3 +414,7 @@ func (c *WorkspaceAgentConn) apiClient() *http.Client {
},
}
}
func (c *WorkspaceAgentConn) GetPeerDiagnostics() tailnet.PeerDiagnostics {
return c.Conn.GetPeerDiagnostics(c.opts.AgentID)
}

View File

@@ -521,6 +521,27 @@ func (c *configMaps) nodeAddresses(publicKey key.NodePublic) ([]netip.Prefix, bo
return nil, false
}
func (c *configMaps) fillPeerDiagnostics(d *PeerDiagnostics, peerID uuid.UUID) {
status := c.status()
c.L.Lock()
defer c.L.Unlock()
if c.derpMap != nil {
for j, r := range c.derpMap.Regions {
d.DERPRegionNames[j] = r.RegionName
}
}
lc, ok := c.peers[peerID]
if !ok {
return
}
d.ReceivedNode = lc.node
ps, ok := status.Peer[lc.node.Key]
if !ok {
return
}
d.LastWireguardHandshake = ps.LastHandshake
}
type peerLifecycle struct {
peerID uuid.UUID
node *tailcfg.Node

View File

@@ -773,6 +773,78 @@ func TestConfigMaps_setDERPMap_same(t *testing.T) {
_ = testutil.RequireRecvCtx(ctx, t, done)
}
func TestConfigMaps_fillPeerDiagnostics(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
fEng := newFakeEngineConfigurable()
nodePrivateKey := key.NewNode()
nodeID := tailcfg.NodeID(5)
discoKey := key.NewDisco()
uut := newConfigMaps(logger, fEng, nodeID, nodePrivateKey, discoKey.Public())
defer uut.close()
// Given: DERP Map and peer already set
derpMap := &tailcfg.DERPMap{
HomeParams: &tailcfg.DERPHomeParams{RegionScore: map[int]float64{
1: 0.025,
1001: 0.111,
}},
Regions: map[int]*tailcfg.DERPRegion{
1: {
RegionCode: "AUH",
RegionName: "AUH",
Nodes: []*tailcfg.DERPNode{
{Name: "AUH0"},
},
},
1001: {
RegionCode: "DXB",
RegionName: "DXB",
Nodes: []*tailcfg.DERPNode{
{Name: "DXB0"},
},
},
},
}
p1ID := uuid.UUID{1}
p1Node := newTestNode(1)
p1n, err := NodeToProto(p1Node)
require.NoError(t, err)
p1tcn, err := uut.protoNodeToTailcfg(p1n)
require.NoError(t, err)
p1tcn.KeepAlive = true
hst := time.Date(2024, 1, 7, 12, 0, 10, 0, time.UTC)
uut.L.Lock()
uut.derpMap = derpMap
uut.peers[p1ID] = &peerLifecycle{
peerID: p1ID,
node: p1tcn,
lastHandshake: hst,
}
uut.L.Unlock()
s0 := expectStatusWithHandshake(ctx, t, fEng, p1Node.Key, hst)
// When: call fillPeerDiagnostics
d := PeerDiagnostics{DERPRegionNames: make(map[int]string)}
uut.fillPeerDiagnostics(&d, p1ID)
testutil.RequireRecvCtx(ctx, t, s0)
// Then:
require.Equal(t, map[int]string{1: "AUH", 1001: "DXB"}, d.DERPRegionNames)
require.Equal(t, p1tcn, d.ReceivedNode)
require.Equal(t, hst, d.LastWireguardHandshake)
done := make(chan struct{})
go func() {
defer close(done)
uut.close()
}()
_ = testutil.RequireRecvCtx(ctx, t, done)
}
func expectStatusWithHandshake(
ctx context.Context, t testing.TB, fEng *fakeEngineConfigurable, k key.NodePublic, lastHandshake time.Time,
) <-chan struct{} {

View File

@@ -645,6 +645,30 @@ func (c *Conn) MagicsockServeHTTPDebug(w http.ResponseWriter, r *http.Request) {
c.magicConn.ServeHTTPDebug(w, r)
}
// PeerDiagnostics is a checklist of human-readable conditions necessary to establish an encrypted
// tunnel to a peer via a Conn
type PeerDiagnostics struct {
// PreferredDERP is 0 if we are not connected to a DERP region. If non-zero, we are connected to
// the given region as our home or "preferred" DERP.
PreferredDERP int
DERPRegionNames map[int]string
// SentNode is true if we have successfully transmitted our local Node via the most recently set
// NodeCallback.
SentNode bool
// ReceivedNode is the last Node we received for the peer, or nil if we haven't received the node.
ReceivedNode *tailcfg.Node
// LastWireguardHandshake is the last time we completed a wireguard handshake
LastWireguardHandshake time.Time
// TODO: surface Discovery (disco) protocol problems
}
func (c *Conn) GetPeerDiagnostics(peerID uuid.UUID) PeerDiagnostics {
d := PeerDiagnostics{DERPRegionNames: make(map[int]string)}
c.nodeUpdater.fillPeerDiagnostics(&d)
c.configMaps.fillPeerDiagnostics(&d, peerID)
return d
}
type listenKey struct {
network string
host string

View File

@@ -36,6 +36,7 @@ type nodeUpdater struct {
addresses []netip.Prefix
lastStatus time.Time
blockEndpoints bool
sentNode bool // for PeerDiagnostics
}
// updateLoop waits until the config is dirty and then calls the callback with the newest node.
@@ -79,6 +80,7 @@ func (u *nodeUpdater) updateLoop() {
u.logger.Debug(context.Background(), "calling nodeUpdater callback", slog.F("node", node))
callback(node)
u.L.Lock()
u.sentNode = true
}
}
@@ -212,6 +214,7 @@ func (u *nodeUpdater) setCallback(callback func(node *Node)) {
defer u.L.Unlock()
u.callback = callback
u.dirty = true
u.sentNode = false
u.Broadcast()
}
@@ -228,3 +231,11 @@ func (u *nodeUpdater) setBlockEndpoints(blockEndpoints bool) {
u.blockEndpoints = blockEndpoints
u.Broadcast()
}
// fillPeerDiagnostics fills out the PeerDiagnostics with PreferredDERP and SentNode
func (u *nodeUpdater) fillPeerDiagnostics(d *PeerDiagnostics) {
u.L.Lock()
defer u.L.Unlock()
d.PreferredDERP = u.preferredDERP
d.SentNode = u.sentNode
}

View File

@@ -5,16 +5,14 @@ import (
"testing"
"time"
"golang.org/x/xerrors"
"golang.org/x/exp/slices"
"tailscale.com/wgengine"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/wgengine"
"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"
@@ -569,3 +567,91 @@ func TestNodeUpdater_setBlockEndpoints_same(t *testing.T) {
}()
_ = testutil.RequireRecvCtx(ctx, t, done)
}
func TestNodeUpdater_fillPeerDiagnostics(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
id := tailcfg.NodeID(1)
nodeKey := key.NewNode().Public()
discoKey := key.NewDisco().Public()
uut := newNodeUpdater(
logger,
func(n *Node) {},
id, nodeKey, discoKey,
)
defer uut.close()
// at start of day, filling diagnostics will not have derp and sentNode is false
d := PeerDiagnostics{}
uut.fillPeerDiagnostics(&d)
require.Equal(t, 0, d.PreferredDERP)
require.False(t, d.SentNode)
dl := map[string]float64{"1": 0.025}
uut.setNetInfo(&tailcfg.NetInfo{
PreferredDERP: 1,
DERPLatency: dl,
})
// after the node callback fires, we should get the DERP and SentNode is true.
// Use Eventually since there is a race between the callback completing
// and the test checking.
require.Eventually(t, func() bool {
d := PeerDiagnostics{}
uut.fillPeerDiagnostics(&d)
// preferred DERP should be set right away, even if the callback is not
// complete.
if !assert.Equal(t, 1, d.PreferredDERP) {
return false
}
return d.SentNode
}, testutil.WaitShort, testutil.IntervalFast)
done := make(chan struct{})
go func() {
defer close(done)
uut.close()
}()
_ = testutil.RequireRecvCtx(ctx, t, done)
}
func TestNodeUpdater_fillPeerDiagnostics_noCallback(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
id := tailcfg.NodeID(1)
nodeKey := key.NewNode().Public()
discoKey := key.NewDisco().Public()
uut := newNodeUpdater(
logger,
nil,
id, nodeKey, discoKey,
)
defer uut.close()
// at start of day, filling diagnostics will not have derp and sentNode is false
d := PeerDiagnostics{}
uut.fillPeerDiagnostics(&d)
require.Equal(t, 0, d.PreferredDERP)
require.False(t, d.SentNode)
dl := map[string]float64{"1": 0.025}
uut.setNetInfo(&tailcfg.NetInfo{
PreferredDERP: 1,
DERPLatency: dl,
})
// since there is no callback, SentNode should not be true, but we get
// the preferred DERP
uut.fillPeerDiagnostics(&d)
require.Equal(t, 1, d.PreferredDERP)
require.False(t, d.SentNode)
done := make(chan struct{})
go func() {
defer close(done)
uut.close()
}()
_ = testutil.RequireRecvCtx(ctx, t, done)
}