From 520c3a8ff71649944d148d3af1c5c3308350ae8c Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Tue, 2 Jan 2024 15:53:52 +0400 Subject: [PATCH] fix: use TSMP for pings and checking reachability (#11306) We're seeing some flaky tests related to agent connectivity - https://github.com/coder/coder/actions/runs/7286675441/job/19856270998 I'm pretty sure what happened in this one is that the client opened a connection while the wgengine was in the process of reconfiguring the wireguard device, so the fact that the peer became "active" as a result of traffic being sent was not noticed. The test calls `AwaitReachable()` but this only tests the disco layer, so it doesn't wait for wireguard to come up. I think we should be using TSMP for pinging and reachability, since this operates at the IP layer, and therefore requires that wireguard comes up before being successful. This should also help with the problems we have seen where a TCP connection starts before wireguard is up and the initial round trip has to wait for the 5 second wireguard handshake retry. fixes: #11294 --- agent/agent_test.go | 25 +++++++++++++------------ tailnet/conn.go | 4 ++-- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 9017240738..1e62ddef3e 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -174,10 +174,10 @@ func TestAgent_Stats_Magic(t *testing.T) { require.NoError(t, err) err = session.Shell() require.NoError(t, err) - var s *agentsdk.Stats require.Eventuallyf(t, func() bool { - var ok bool - s, ok = <-stats + s, ok := <-stats + t.Logf("got stats: ok=%t, ConnectionCount=%d, RxBytes=%d, TxBytes=%d, SessionCountVSCode=%d, ConnectionMedianLatencyMS=%f", + ok, s.ConnectionCount, s.RxBytes, s.TxBytes, s.SessionCountVSCode, s.ConnectionMedianLatencyMS) return ok && s.ConnectionCount > 0 && s.RxBytes > 0 && s.TxBytes > 0 && // Ensure that the connection didn't count as a "normal" SSH session. // This was a special one, so it should be labeled specially in the stats! @@ -186,7 +186,7 @@ func TestAgent_Stats_Magic(t *testing.T) { // If it isn't, it's set to -1. s.ConnectionMedianLatencyMS >= 0 }, testutil.WaitLong, testutil.IntervalFast, - "never saw stats: %+v", s, + "never saw stats", ) // The shell will automatically exit if there is no stdin! _ = stdin.Close() @@ -240,14 +240,14 @@ func TestAgent_Stats_Magic(t *testing.T) { _ = tunneledConn.Close() }) - var s *agentsdk.Stats require.Eventuallyf(t, func() bool { - var ok bool - s, ok = <-stats + s, ok := <-stats + t.Logf("got stats with conn open: ok=%t, ConnectionCount=%d, SessionCountJetBrains=%d", + ok, s.ConnectionCount, s.SessionCountJetBrains) return ok && s.ConnectionCount > 0 && s.SessionCountJetBrains == 1 }, testutil.WaitLong, testutil.IntervalFast, - "never saw stats with conn open: %+v", s, + "never saw stats with conn open", ) // Kill the server and connection after checking for the echo. @@ -256,12 +256,13 @@ func TestAgent_Stats_Magic(t *testing.T) { _ = tunneledConn.Close() require.Eventuallyf(t, func() bool { - var ok bool - s, ok = <-stats - return ok && s.ConnectionCount == 0 && + s, ok := <-stats + t.Logf("got stats after disconnect %t, %d", + ok, s.SessionCountJetBrains) + return ok && s.SessionCountJetBrains == 0 }, testutil.WaitLong, testutil.IntervalFast, - "never saw stats after conn closes: %+v", s, + "never saw stats after conn closes", ) }) } diff --git a/tailnet/conn.go b/tailnet/conn.go index c785e7fabb..3620cc5244 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -670,12 +670,12 @@ func (c *Conn) Status() *ipnstate.Status { return sb.Status() } -// Ping sends a Disco ping to the Wireguard engine. +// Ping sends a ping to the Wireguard engine. // The bool returned is true if the ping was performed P2P. func (c *Conn) Ping(ctx context.Context, ip netip.Addr) (time.Duration, bool, *ipnstate.PingResult, error) { errCh := make(chan error, 1) prChan := make(chan *ipnstate.PingResult, 1) - go c.wireguardEngine.Ping(ip, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { + go c.wireguardEngine.Ping(ip, tailcfg.PingTSMP, func(pr *ipnstate.PingResult) { if pr.Err != "" { errCh <- xerrors.New(pr.Err) return