fix: use TSMP for pings and checking reachability (#11306)

We're seeing some flaky tests related to agent connectivity - https://github.com/coder/coder/actions/runs/7286675441/job/19856270998

I'm pretty sure what happened in this one is that the client opened a connection while the wgengine was in the process of reconfiguring the wireguard device, so the fact that the peer became "active" as a result of traffic being sent was not noticed.

The test calls `AwaitReachable()` but this only tests the disco layer, so it doesn't wait for wireguard to come up.

I think we should be using TSMP for pinging and reachability, since this operates at the IP layer, and therefore requires that wireguard comes up before being successful.

This should also help with the problems we have seen where a TCP connection starts before wireguard is up and the initial round trip has to wait for the 5 second wireguard handshake retry.

fixes: #11294
This commit is contained in:
Spike Curtis 2024-01-02 15:53:52 +04:00 committed by GitHub
parent 58e40f6cd6
commit 520c3a8ff7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 15 additions and 14 deletions

View File

@ -174,10 +174,10 @@ func TestAgent_Stats_Magic(t *testing.T) {
require.NoError(t, err)
err = session.Shell()
require.NoError(t, err)
var s *agentsdk.Stats
require.Eventuallyf(t, func() bool {
var ok bool
s, ok = <-stats
s, ok := <-stats
t.Logf("got stats: ok=%t, ConnectionCount=%d, RxBytes=%d, TxBytes=%d, SessionCountVSCode=%d, ConnectionMedianLatencyMS=%f",
ok, s.ConnectionCount, s.RxBytes, s.TxBytes, s.SessionCountVSCode, s.ConnectionMedianLatencyMS)
return ok && s.ConnectionCount > 0 && s.RxBytes > 0 && s.TxBytes > 0 &&
// Ensure that the connection didn't count as a "normal" SSH session.
// This was a special one, so it should be labeled specially in the stats!
@ -186,7 +186,7 @@ func TestAgent_Stats_Magic(t *testing.T) {
// If it isn't, it's set to -1.
s.ConnectionMedianLatencyMS >= 0
}, testutil.WaitLong, testutil.IntervalFast,
"never saw stats: %+v", s,
"never saw stats",
)
// The shell will automatically exit if there is no stdin!
_ = stdin.Close()
@ -240,14 +240,14 @@ func TestAgent_Stats_Magic(t *testing.T) {
_ = tunneledConn.Close()
})
var s *agentsdk.Stats
require.Eventuallyf(t, func() bool {
var ok bool
s, ok = <-stats
s, ok := <-stats
t.Logf("got stats with conn open: ok=%t, ConnectionCount=%d, SessionCountJetBrains=%d",
ok, s.ConnectionCount, s.SessionCountJetBrains)
return ok && s.ConnectionCount > 0 &&
s.SessionCountJetBrains == 1
}, testutil.WaitLong, testutil.IntervalFast,
"never saw stats with conn open: %+v", s,
"never saw stats with conn open",
)
// Kill the server and connection after checking for the echo.
@ -256,12 +256,13 @@ func TestAgent_Stats_Magic(t *testing.T) {
_ = tunneledConn.Close()
require.Eventuallyf(t, func() bool {
var ok bool
s, ok = <-stats
return ok && s.ConnectionCount == 0 &&
s, ok := <-stats
t.Logf("got stats after disconnect %t, %d",
ok, s.SessionCountJetBrains)
return ok &&
s.SessionCountJetBrains == 0
}, testutil.WaitLong, testutil.IntervalFast,
"never saw stats after conn closes: %+v", s,
"never saw stats after conn closes",
)
})
}

View File

@ -670,12 +670,12 @@ func (c *Conn) Status() *ipnstate.Status {
return sb.Status()
}
// Ping sends a Disco ping to the Wireguard engine.
// Ping sends a ping to the Wireguard engine.
// The bool returned is true if the ping was performed P2P.
func (c *Conn) Ping(ctx context.Context, ip netip.Addr) (time.Duration, bool, *ipnstate.PingResult, error) {
errCh := make(chan error, 1)
prChan := make(chan *ipnstate.PingResult, 1)
go c.wireguardEngine.Ping(ip, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
go c.wireguardEngine.Ping(ip, tailcfg.PingTSMP, func(pr *ipnstate.PingResult) {
if pr.Err != "" {
errCh <- xerrors.New(pr.Err)
return