feat: automatically use websockets if DERP upgrade is unavailable (#6381)

* feat: automatically use websockets if DERP upgrade is unavailable

This might be our biggest hangup for deployments at the moment...

Load balancers by default do not support the DERP protocol, so many
of our prospects and customers run into failing workspace connections.
This automatically swaps to use WebSockets, and reports the reason to
coderd.

In a future contribution, a warning will appear by the agent if it was
forced to use WebSockets instead of DERP.

* Fix nil pointer type in Tailscale dep

* Fix requested changes
This commit is contained in:
Kyle Carberry 2023-03-01 16:18:14 -06:00 committed by GitHub
parent ce11400b56
commit 1724cbf872
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 300 additions and 29 deletions

View File

@ -284,6 +284,9 @@ func New(options *Options) *API {
// replicas or instances of this middleware.
apiRateLimiter := httpmw.RateLimit(options.APIRateLimit, time.Minute)
derpHandler := derphttp.Handler(api.DERPServer)
derpHandler, api.derpCloseFunc = tailnet.WithWebsocketSupport(api.DERPServer, derpHandler)
r.Use(
httpmw.Recover(api.Logger),
tracing.StatusWriterMiddleware,
@ -363,7 +366,7 @@ func New(options *Options) *API {
r.Route("/%40{user}/{workspace_and_agent}/apps/{workspaceapp}", apps)
r.Route("/@{user}/{workspace_and_agent}/apps/{workspaceapp}", apps)
r.Route("/derp", func(r chi.Router) {
r.Get("/", derphttp.Handler(api.DERPServer).ServeHTTP)
r.Get("/", derpHandler.ServeHTTP)
// This is used when UDP is blocked, and latency must be checked via HTTP(s).
r.Get("/latency-check", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
@ -726,6 +729,7 @@ type API struct {
WebsocketWaitMutex sync.Mutex
WebsocketWaitGroup sync.WaitGroup
derpCloseFunc func()
metricsCache *metricscache.Cache
workspaceAgentCache *wsconncache.Cache
@ -739,6 +743,7 @@ type API struct {
// Close waits for all WebSocket connections to drain before returning.
func (api *API) Close() error {
api.cancel()
api.derpCloseFunc()
api.WebsocketWaitMutex.Lock()
api.WebsocketWaitGroup.Wait()

2
go.mod
View File

@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0
// There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here:
// https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230210022917-446fc10e755a
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230301203426-fb16ae7c5bba
// Switch to our fork that imports fixes from http://github.com/tailscale/ssh.
// See: https://github.com/coder/coder/issues/3371

28
go.sum
View File

@ -379,6 +379,34 @@ github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHui
github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914=
github.com/coder/tailscale v1.1.1-0.20230210022917-446fc10e755a h1:fTXoK+Yikz8Rl0v0nHxO+lTV0y8Q7wdmGFq1CqLgznE=
github.com/coder/tailscale v1.1.1-0.20230210022917-446fc10e755a/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228183907-71829ca8456b h1:i8R1RwDqLihavxsaTovKCax45AJLy2dtXI0PcLm+BnM=
github.com/coder/tailscale v1.1.1-0.20230228183907-71829ca8456b/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228184228-8e7815556b85 h1:tg89NZBDe91wWTeII4jFThHdB6USE3mBNXzK0/CWODM=
github.com/coder/tailscale v1.1.1-0.20230228184228-8e7815556b85/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228184330-c2cd7d4f5a70 h1:gGevbQLqz4fWk5HytFodNe/Bl8rqgHb2+YsRoaZPf1g=
github.com/coder/tailscale v1.1.1-0.20230228184330-c2cd7d4f5a70/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228184508-acb075f559ce h1:oAYRHRM8/rPrQhcPiXBkLw/YU/NyPhhN+HypRJWSq2k=
github.com/coder/tailscale v1.1.1-0.20230228184508-acb075f559ce/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228184618-0e3e4965c416 h1:25uoFv5YNvve9r0BEGJAJHVWulyPmPPvXp6S3b0085o=
github.com/coder/tailscale v1.1.1-0.20230228184618-0e3e4965c416/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228185337-873afd546fb2 h1:yrKQPYAadmtrj5J33Fs/Ldx3FgXCfd0gUbcFqjpsq+I=
github.com/coder/tailscale v1.1.1-0.20230228185337-873afd546fb2/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228185607-b6042b55609a h1:aqp8CUtVBcCLDI369ORzbVt/pfOt0mTZBxIAejgZkgY=
github.com/coder/tailscale v1.1.1-0.20230228185607-b6042b55609a/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228194335-31045cf53c85 h1:MHJDOehTwolaeT8FQrBQ5pNEdWXO+8pWTy5ODIYOZUU=
github.com/coder/tailscale v1.1.1-0.20230228194335-31045cf53c85/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228194714-66eac1a43e5c h1:WylIWZ+8UpiW2NPTu6z2sgFHwqo9/XmT/QTNWT47qic=
github.com/coder/tailscale v1.1.1-0.20230228194714-66eac1a43e5c/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228200853-ddedcafad3a1 h1:XgOpzrEiDR4GzoKE4dhqopZAg9MrG1cXPMc9/PXA6kk=
github.com/coder/tailscale v1.1.1-0.20230228200853-ddedcafad3a1/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228204054-70dbfd7be018 h1:pv80tzqexcjrP/4CxFpalrk6bS2XJ2xOGA00u1WUy+g=
github.com/coder/tailscale v1.1.1-0.20230228204054-70dbfd7be018/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228215233-ba13d5ceee2a h1:eXE/5hMkcHfa3dcJJ4WkseSbJNU4thMfC8cUMIAgi2s=
github.com/coder/tailscale v1.1.1-0.20230228215233-ba13d5ceee2a/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230228220447-d27e5d3056e9 h1:bFcFXLUUi+cwgOqrjbXN+XmI7QvB1up/UZoNF1+9nuM=
github.com/coder/tailscale v1.1.1-0.20230228220447-d27e5d3056e9/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230301203426-fb16ae7c5bba h1:JOD5pqNtiz9lkSX74PY2BJOyNqsBmvGUjFGIuECtG+o=
github.com/coder/tailscale v1.1.1-0.20230301203426-fb16ae7c5bba/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/terraform-provider-coder v0.6.14 h1:NsJ1mo0MN1x/VyNLYmsgPUYn2JgzdVNZBqnj9OSIDgY=
github.com/coder/terraform-provider-coder v0.6.14/go.mod h1:UIfU3bYNeSzJJvHyJ30tEKjD6Z9utloI+HUM/7n94CY=
github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE=

View File

@ -192,19 +192,20 @@ func NewConn(options *Options) (conn *Conn, err error) {
wireguardEngine.SetFilter(filter.New(netMap.PacketFilter, localIPs, logIPs, nil, Logger(options.Logger.Named("packet-filter"))))
dialContext, dialCancel := context.WithCancel(context.Background())
server := &Conn{
blockEndpoints: options.BlockEndpoints,
dialContext: dialContext,
dialCancel: dialCancel,
closed: make(chan struct{}),
logger: options.Logger,
magicConn: magicConn,
dialer: dialer,
listeners: map[listenKey]*listener{},
peerMap: map[tailcfg.NodeID]*tailcfg.Node{},
tunDevice: tunDevice,
netMap: netMap,
netStack: netStack,
wireguardMonitor: wireguardMonitor,
blockEndpoints: options.BlockEndpoints,
dialContext: dialContext,
dialCancel: dialCancel,
closed: make(chan struct{}),
logger: options.Logger,
magicConn: magicConn,
dialer: dialer,
listeners: map[listenKey]*listener{},
peerMap: map[tailcfg.NodeID]*tailcfg.Node{},
lastDERPForcedWebsockets: map[int]string{},
tunDevice: tunDevice,
netMap: netMap,
netStack: netStack,
wireguardMonitor: wireguardMonitor,
wireguardRouter: &router.Config{
LocalAddrs: netMap.Addresses,
},
@ -247,6 +248,17 @@ func NewConn(options *Options) (conn *Conn, err error) {
server.lastMutex.Unlock()
server.sendNode()
})
magicConn.SetDERPForcedWebsocketCallback(func(region int, reason string) {
server.logger.Debug(context.Background(), "derp forced websocket", slog.F("region", region), slog.F("reason", reason))
server.lastMutex.Lock()
if server.lastDERPForcedWebsockets[region] == reason {
server.lastMutex.Unlock()
return
}
server.lastDERPForcedWebsockets[region] = reason
server.lastMutex.Unlock()
server.sendNode()
})
netStack.ForwardTCPIn = server.forwardTCP
err = netStack.Start(nil)
@ -299,10 +311,11 @@ type Conn struct {
nodeChanged bool
// It's only possible to store these values via status functions,
// so the values must be stored for retrieval later on.
lastStatus time.Time
lastEndpoints []tailcfg.Endpoint
lastNetInfo *tailcfg.NetInfo
nodeCallback func(node *Node)
lastStatus time.Time
lastEndpoints []tailcfg.Endpoint
lastDERPForcedWebsockets map[int]string
lastNetInfo *tailcfg.NetInfo
nodeCallback func(node *Node)
trafficStats *connstats.Statistics
}
@ -632,21 +645,24 @@ func (c *Conn) selfNode() *Node {
}
var preferredDERP int
var derpLatency map[string]float64
var derpForcedWebsocket map[int]string
if c.lastNetInfo != nil {
preferredDERP = c.lastNetInfo.PreferredDERP
derpLatency = c.lastNetInfo.DERPLatency
derpForcedWebsocket = c.lastDERPForcedWebsockets
}
node := &Node{
ID: c.netMap.SelfNode.ID,
AsOf: database.Now(),
Key: c.netMap.SelfNode.Key,
Addresses: c.netMap.SelfNode.Addresses,
AllowedIPs: c.netMap.SelfNode.AllowedIPs,
DiscoKey: c.magicConn.DiscoPublicKey(),
Endpoints: endpoints,
PreferredDERP: preferredDERP,
DERPLatency: derpLatency,
ID: c.netMap.SelfNode.ID,
AsOf: database.Now(),
Key: c.netMap.SelfNode.Key,
Addresses: c.netMap.SelfNode.Addresses,
AllowedIPs: c.netMap.SelfNode.AllowedIPs,
DiscoKey: c.magicConn.DiscoPublicKey(),
Endpoints: endpoints,
PreferredDERP: preferredDERP,
DERPLatency: derpLatency,
DERPForcedWebsocket: derpForcedWebsocket,
}
if c.blockEndpoints {
node.Endpoints = nil

View File

@ -13,6 +13,7 @@ import (
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/tailnet"
"github.com/coder/coder/tailnet/tailnettest"
"github.com/coder/coder/testutil"
)
func TestMain(m *testing.M) {
@ -63,7 +64,7 @@ func TestTailnet(t *testing.T) {
assert.NoError(t, err)
})
require.True(t, w2.AwaitReachable(context.Background(), w1IP))
conn := make(chan struct{})
conn := make(chan struct{}, 1)
go func() {
listener, err := w1.Listen("tcp", ":35565")
assert.NoError(t, err)
@ -81,6 +82,86 @@ func TestTailnet(t *testing.T) {
_ = nc.Close()
<-conn
nodes := make(chan *tailnet.Node, 1)
w2.SetNodeCallback(func(node *tailnet.Node) {
select {
case nodes <- node:
default:
}
})
node := <-nodes
// Ensure this connected over DERP!
require.Len(t, node.DERPForcedWebsocket, 0)
w1.Close()
w2.Close()
})
t.Run("ForcesWebSockets", func(t *testing.T) {
t.Parallel()
ctx, cancelFunc := testutil.Context(t)
defer cancelFunc()
w1IP := tailnet.IP()
derpMap := tailnettest.RunDERPOnlyWebSockets(t)
w1, err := tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(w1IP, 128)},
Logger: logger.Named("w1"),
DERPMap: derpMap,
BlockEndpoints: true,
})
require.NoError(t, err)
w2, err := tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)},
Logger: logger.Named("w2"),
DERPMap: derpMap,
BlockEndpoints: true,
})
require.NoError(t, err)
t.Cleanup(func() {
_ = w1.Close()
_ = w2.Close()
})
w1.SetNodeCallback(func(node *tailnet.Node) {
err := w2.UpdateNodes([]*tailnet.Node{node}, false)
assert.NoError(t, err)
})
w2.SetNodeCallback(func(node *tailnet.Node) {
err := w1.UpdateNodes([]*tailnet.Node{node}, false)
assert.NoError(t, err)
})
require.True(t, w2.AwaitReachable(ctx, w1IP))
conn := make(chan struct{}, 1)
go func() {
listener, err := w1.Listen("tcp", ":35565")
assert.NoError(t, err)
defer listener.Close()
nc, err := listener.Accept()
if !assert.NoError(t, err) {
return
}
_ = nc.Close()
conn <- struct{}{}
}()
nc, err := w2.DialContextTCP(ctx, netip.AddrPortFrom(w1IP, 35565))
require.NoError(t, err)
_ = nc.Close()
<-conn
nodes := make(chan *tailnet.Node, 1)
w2.SetNodeCallback(func(node *tailnet.Node) {
select {
case nodes <- node:
default:
}
})
node := <-nodes
require.Len(t, node.DERPForcedWebsocket, 1)
// Ensure the reason is valid!
require.Equal(t, "GET failed with status code 400: Invalid \"Upgrade\" header: DERP", node.DERPForcedWebsocket[derpMap.RegionIDs()[0]])
w1.Close()
w2.Close()
})

View File

@ -58,6 +58,11 @@ type Node struct {
PreferredDERP int `json:"preferred_derp"`
// DERPLatency is the latency in seconds to each DERP server.
DERPLatency map[string]float64 `json:"derp_latency"`
// DERPForcedWebsocket contains a mapping of DERP regions to
// error messages that caused the connection to be forced to
// use WebSockets. We don't use WebSockets by default because
// they are less performant.
DERPForcedWebsocket map[int]string `json:"derp_forced_websockets"`
// Addresses are the IP address ranges this connection exposes.
Addresses []netip.Prefix `json:"addresses"`
// AllowedIPs specify what addresses can dial the connection.

72
tailnet/derp.go Normal file
View File

@ -0,0 +1,72 @@
package tailnet
import (
"bufio"
"context"
"log"
"net/http"
"strings"
"sync"
"nhooyr.io/websocket"
"tailscale.com/derp"
"tailscale.com/net/wsconn"
)
// WithWebsocketSupport returns an http.Handler that upgrades
// connections to the "derp" subprotocol to WebSockets and
// passes them to the DERP server.
// Taken from: https://github.com/tailscale/tailscale/blob/e3211ff88ba85435f70984cf67d9b353f3d650d8/cmd/derper/websocket.go#L21
func WithWebsocketSupport(s *derp.Server, base http.Handler) (http.Handler, func()) {
var mu sync.Mutex
var waitGroup sync.WaitGroup
ctx, cancelFunc := context.WithCancel(context.Background())
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
up := strings.ToLower(r.Header.Get("Upgrade"))
// Very early versions of Tailscale set "Upgrade: WebSocket" but didn't actually
// speak WebSockets (they still assumed DERP's binary framing). So to distinguish
// clients that actually want WebSockets, look for an explicit "derp" subprotocol.
if up != "websocket" || !strings.Contains(r.Header.Get("Sec-Websocket-Protocol"), "derp") {
base.ServeHTTP(w, r)
return
}
mu.Lock()
if ctx.Err() != nil {
mu.Unlock()
return
}
waitGroup.Add(1)
mu.Unlock()
defer waitGroup.Done()
c, err := websocket.Accept(w, r, &websocket.AcceptOptions{
Subprotocols: []string{"derp"},
OriginPatterns: []string{"*"},
// Disable compression because we transmit WireGuard messages that
// are not compressible.
// Additionally, Safari has a broken implementation of compression
// (see https://github.com/nhooyr/websocket/issues/218) that makes
// enabling it actively harmful.
CompressionMode: websocket.CompressionDisabled,
})
if err != nil {
log.Printf("websocket.Accept: %v", err)
return
}
defer c.Close(websocket.StatusInternalError, "closing")
if c.Subprotocol() != "derp" {
c.Close(websocket.StatusPolicyViolation, "client must speak the derp subprotocol")
return
}
wc := wsconn.NetConn(ctx, c, websocket.MessageBinary)
brw := bufio.NewReadWriter(bufio.NewReader(wc), bufio.NewWriter(wc))
s.Accept(ctx, wc, brw, r.RemoteAddr)
}), func() {
cancelFunc()
mu.Lock()
waitGroup.Wait()
mu.Unlock()
}
}

View File

@ -2,6 +2,8 @@ package tailnettest
import (
"crypto/tls"
"fmt"
"html"
"net"
"net/http"
"net/http/httptest"
@ -61,3 +63,60 @@ func RunDERPAndSTUN(t *testing.T) *tailcfg.DERPMap {
},
}
}
// RunDERPOnlyWebSockets creates a DERP mapping for tests that
// only allows WebSockets through it. Many proxies do not support
// upgrading DERP, so this is a good fallback.
func RunDERPOnlyWebSockets(t *testing.T) *tailcfg.DERPMap {
logf := tailnet.Logger(slogtest.Make(t, nil))
d := derp.NewServer(key.NewNode(), logf)
handler := derphttp.Handler(d)
var closeFunc func()
handler, closeFunc = tailnet.WithWebsocketSupport(d, handler)
server := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/derp" {
handler.ServeHTTP(w, r)
return
}
if r.Header.Get("Upgrade") != "websocket" {
w.WriteHeader(http.StatusBadRequest)
_, _ = w.Write([]byte(fmt.Sprintf(`Invalid "Upgrade" header: %s`, html.EscapeString(r.Header.Get("Upgrade")))))
return
}
handler.ServeHTTP(w, r)
}))
server.Config.ErrorLog = tslogger.StdLogger(logf)
server.Config.TLSNextProto = make(map[string]func(*http.Server, *tls.Conn, http.Handler))
server.StartTLS()
t.Cleanup(func() {
server.CloseClientConnections()
server.Close()
closeFunc()
d.Close()
})
tcpAddr, ok := server.Listener.Addr().(*net.TCPAddr)
if !ok {
t.FailNow()
}
return &tailcfg.DERPMap{
Regions: map[int]*tailcfg.DERPRegion{
1: {
RegionID: 1,
RegionCode: "test",
RegionName: "Test",
Nodes: []*tailcfg.DERPNode{
{
Name: "t1",
RegionID: 1,
IPv4: "127.0.0.1",
IPv6: "none",
DERPPort: tcpAddr.Port,
InsecureForTests: true,
},
},
},
},
}
}

View File

@ -16,3 +16,8 @@ func TestRunDERPAndSTUN(t *testing.T) {
t.Parallel()
_ = tailnettest.RunDERPAndSTUN(t)
}
func TestRunDERPOnlyWebSockets(t *testing.T) {
t.Parallel()
_ = tailnettest.RunDERPOnlyWebSockets(t)
}