feat: add debug server for tailnet coordinators (#5861)

Implements a Tailscale-like debug server for our in-memory coordinator. This should provide some visibility into why connections could be failing.
Resolves: https://github.com/coder/coder/issues/5845

![image](https://user-images.githubusercontent.com/6332295/214680832-2724d633-2d54-44d6-a7ce-5841e5824ee5.png)
This commit is contained in:
Colin Adler 2023-01-25 15:27:36 -06:00 committed by GitHub
parent 8830ddfd56
commit 1cd5f38cb0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 261 additions and 34 deletions

View File

@ -1193,7 +1193,7 @@ func (c *client) ListenWorkspaceAgent(_ context.Context) (net.Conn, error) {
}
c.t.Cleanup(c.lastWorkspaceAgent)
go func() {
_ = c.coordinator.ServeAgent(serverConn, c.agentID)
_ = c.coordinator.ServeAgent(serverConn, c.agentID, "")
close(closed)
}()
return clientConn, nil

22
coderd/apidoc/docs.go generated
View File

@ -362,6 +362,28 @@ const docTemplate = `{
}
}
},
"/debug/coordinator": {
"get": {
"security": [
{
"CoderSessionToken": []
}
],
"produces": [
"text/html"
],
"tags": [
"Debug"
],
"summary": "Debug Info Wireguard Coordinator",
"operationId": "debug-info-wireguard-coordinator",
"responses": {
"200": {
"description": "OK"
}
}
}
},
"/entitlements": {
"get": {
"security": [

View File

@ -308,6 +308,24 @@
}
}
},
"/debug/coordinator": {
"get": {
"security": [
{
"CoderSessionToken": []
}
],
"produces": ["text/html"],
"tags": ["Debug"],
"summary": "Debug Info Wireguard Coordinator",
"operationId": "debug-info-wireguard-coordinator",
"responses": {
"200": {
"description": "OK"
}
}
}
},
"/entitlements": {
"get": {
"security": [

View File

@ -613,6 +613,25 @@ func New(options *Options) *API {
r.Get("/", api.workspaceApplicationAuth)
})
})
r.Route("/debug", func(r chi.Router) {
r.Use(
apiKeyMiddleware,
// Ensure only owners can access debug endpoints.
func(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
if !api.Authorize(r, rbac.ActionRead, rbac.ResourceDebugInfo) {
httpapi.ResourceNotFound(rw)
return
}
next.ServeHTTP(rw, r)
})
},
)
r.Get("/coordinator", api.debugCoordinator)
})
})
if options.SwaggerEndpoint {

View File

@ -272,6 +272,11 @@ func AGPLRoutes(a *AuthTester) (map[string]string, map[string]RouteCheck) {
AssertAction: rbac.ActionRead,
AssertObject: rbac.ResourceTemplate,
},
"GET:/api/v2/debug/coordinator": {
AssertAction: rbac.ActionRead,
AssertObject: rbac.ResourceDebugInfo,
},
}
// Routes like proxy routes support all HTTP methods. A helper func to expand

View File

@ -327,7 +327,7 @@ func assertAccept(t *testing.T, comment SwaggerComment) {
}
}
var allowedProduceTypes = []string{"json", "text/event-stream"}
var allowedProduceTypes = []string{"json", "text/event-stream", "text/html"}
func assertProduce(t *testing.T, comment SwaggerComment) {
var hasResponseModel bool
@ -344,7 +344,8 @@ func assertProduce(t *testing.T, comment SwaggerComment) {
} else {
if (comment.router == "/workspaceagents/me/app-health" && comment.method == "post") ||
(comment.router == "/workspaceagents/me/version" && comment.method == "post") ||
(comment.router == "/licenses/{id}" && comment.method == "delete") {
(comment.router == "/licenses/{id}" && comment.method == "delete") ||
(comment.router == "/debug/coordinator" && comment.method == "get") {
return // Exception: HTTP 200 is returned without response entity
}

14
coderd/debug.go Normal file
View File

@ -0,0 +1,14 @@
package coderd
import "net/http"
// @Summary Debug Info Wireguard Coordinator
// @ID debug-info-wireguard-coordinator
// @Security CoderSessionToken
// @Produce text/html
// @Tags Debug
// @Success 200
// @Router /debug/coordinator [get]
func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
(*api.TailnetCoordinator.Load()).ServeHTTPDebug(rw, r)
}

View File

@ -150,6 +150,11 @@ var (
ResourceReplicas = Object{
Type: "replicas",
}
// ResourceDebugInfo controls access to the debug routes `/api/v2/debug/*`.
ResourceDebugInfo = Object{
Type: "debug_info",
}
)
// Object is used to create objects for authz checks when you have none in

View File

@ -521,6 +521,16 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
})
return
}
workspace, err := api.Database.GetWorkspaceByID(ctx, build.WorkspaceID)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Internal error fetching workspace.",
Detail: err.Error(),
})
return
}
// Ensure the resource is still valid!
// We only accept agents for resources on the latest build.
ensureLatestBuild := func() error {
@ -618,7 +628,7 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
closeChan := make(chan struct{})
go func() {
defer close(closeChan)
err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID)
err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID, fmt.Sprintf("%s-%s", workspace.Name, workspaceAgent.Name))
if err != nil {
api.Logger.Warn(ctx, "tailnet coordinator agent error", slog.Error(err))
_ = conn.Close(websocket.StatusInternalError, err.Error())

View File

@ -207,7 +207,7 @@ func (c *client) ListenWorkspaceAgent(_ context.Context) (net.Conn, error) {
<-closed
})
go func() {
_ = c.coordinator.ServeAgent(serverConn, c.agentID)
_ = c.coordinator.ServeAgent(serverConn, c.agentID, "")
close(closed)
}()
return clientConn, nil

21
docs/api/debug.md Normal file
View File

@ -0,0 +1,21 @@
# Debug
## Debug Info Wireguard Coordinator
### Code samples
```shell
# Example request using curl
curl -X GET http://coder-server:8080/api/v2/debug/coordinator \
-H 'Coder-Session-Token: API_KEY'
```
`GET /debug/coordinator`
### Responses
| Status | Meaning | Description | Schema |
| ------ | ------------------------------------------------------- | ----------- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | OK | |
To perform this operation, you must be authenticated. [Learn more](authentication.md).

View File

@ -364,6 +364,10 @@
"title": "Builds",
"path": "./api/builds.md"
},
{
"title": "Debug",
"path": "./api/debug.md"
},
{
"title": "Enterprise",
"path": "./api/enterprise.md"

View File

@ -5,8 +5,10 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"sync"
"time"
@ -174,7 +176,7 @@ func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *js
// ServeAgent accepts a WebSocket connection to an agent that listens to
// incoming connections and publishes node updates.
func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID, _ string) error {
// Tell clients on other instances to send a callmemaybe to us.
err := c.publishAgentHello(id)
if err != nil {
@ -573,3 +575,9 @@ func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte
return buf.Bytes(), nil
}
func (*haCoordinator) ServeHTTPDebug(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
fmt.Fprintf(w, "<h1>coordinator</h1>")
fmt.Fprintf(w, "<h2>ha debug coming soon</h2>")
}

View File

@ -60,7 +60,7 @@ func TestCoordinatorSingle(t *testing.T) {
id := uuid.New()
closeChan := make(chan struct{})
go func() {
err := coordinator.ServeAgent(server, id)
err := coordinator.ServeAgent(server, id, "")
assert.NoError(t, err)
close(closeChan)
}()
@ -91,7 +91,7 @@ func TestCoordinatorSingle(t *testing.T) {
agentID := uuid.New()
closeAgentChan := make(chan struct{})
go func() {
err := coordinator.ServeAgent(agentServerWS, agentID)
err := coordinator.ServeAgent(agentServerWS, agentID, "")
assert.NoError(t, err)
close(closeAgentChan)
}()
@ -142,7 +142,7 @@ func TestCoordinatorSingle(t *testing.T) {
})
closeAgentChan = make(chan struct{})
go func() {
err := coordinator.ServeAgent(agentServerWS, agentID)
err := coordinator.ServeAgent(agentServerWS, agentID, "")
assert.NoError(t, err)
close(closeAgentChan)
}()
@ -184,7 +184,7 @@ func TestCoordinatorHA(t *testing.T) {
agentID := uuid.New()
closeAgentChan := make(chan struct{})
go func() {
err := coordinator1.ServeAgent(agentServerWS, agentID)
err := coordinator1.ServeAgent(agentServerWS, agentID, "")
assert.NoError(t, err)
close(closeAgentChan)
}()
@ -240,7 +240,7 @@ func TestCoordinatorHA(t *testing.T) {
})
closeAgentChan = make(chan struct{})
go func() {
err := coordinator1.ServeAgent(agentServerWS, agentID)
err := coordinator1.ServeAgent(agentServerWS, agentID, "")
assert.NoError(t, err)
close(closeAgentChan)
}()

View File

@ -4,10 +4,13 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
@ -22,6 +25,9 @@ import (
// └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘
// Coordinators have different guarantees for HA support.
type Coordinator interface {
// ServeHTTPDebug serves a debug webpage that shows the internal state of
// the coordinator.
ServeHTTPDebug(w http.ResponseWriter, r *http.Request)
// Node returns an in-memory node by ID.
Node(id uuid.UUID) *Node
// ServeClient accepts a WebSocket connection that wants to connect to an agent
@ -29,7 +35,8 @@ type Coordinator interface {
ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error
// ServeAgent accepts a WebSocket connection to an agent that listens to
// incoming connections and publishes node updates.
ServeAgent(conn net.Conn, id uuid.UUID) error
// Name is just used for debug information. It can be left blank.
ServeAgent(conn net.Conn, id uuid.UUID, name string) error
// Close closes the coordinator.
Close() error
}
@ -104,8 +111,8 @@ func NewCoordinator() Coordinator {
return &coordinator{
closed: false,
nodes: map[uuid.UUID]*Node{},
agentSockets: map[uuid.UUID]idConn{},
agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{},
agentSockets: map[uuid.UUID]*trackedConn{},
agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]*trackedConn{},
}
}
@ -117,23 +124,34 @@ func NewCoordinator() Coordinator {
// This coordinator is incompatible with multiple Coder
// replicas as all node data is in-memory.
type coordinator struct {
mutex sync.Mutex
mutex sync.RWMutex
closed bool
// nodes maps agent and connection IDs their respective node.
nodes map[uuid.UUID]*Node
// agentSockets maps agent IDs to their open websocket.
agentSockets map[uuid.UUID]idConn
agentSockets map[uuid.UUID]*trackedConn
// agentToConnectionSockets maps agent IDs to connection IDs of conns that
// are subscribed to updates for that agent.
agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn
agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]*trackedConn
}
type idConn struct {
type trackedConn struct {
net.Conn
// id is an ephemeral UUID used to uniquely identify the owner of the
// connection.
id uuid.UUID
conn net.Conn
id uuid.UUID
name string
start int64
lastWrite int64
overwrites int64
}
func (t *trackedConn) Write(b []byte) (n int, err error) {
atomic.StoreInt64(&t.lastWrite, time.Now().Unix())
return t.Conn.Write(b)
}
// Node returns an in-memory node by ID.
@ -182,12 +200,18 @@ func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID)
c.mutex.Lock()
connectionSockets, ok := c.agentToConnectionSockets[agent]
if !ok {
connectionSockets = map[uuid.UUID]net.Conn{}
connectionSockets = map[uuid.UUID]*trackedConn{}
c.agentToConnectionSockets[agent] = connectionSockets
}
now := time.Now().Unix()
// Insert this connection into a map so the agent
// can publish node updates.
connectionSockets[id] = conn
connectionSockets[id] = &trackedConn{
Conn: conn,
start: now,
lastWrite: now,
}
c.mutex.Unlock()
defer func() {
c.mutex.Lock()
@ -243,7 +267,7 @@ func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json
return xerrors.Errorf("marshal nodes: %w", err)
}
_, err = agentSocket.conn.Write(data)
_, err = agentSocket.Write(data)
if err != nil {
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) || errors.Is(err, context.Canceled) {
return nil
@ -256,7 +280,7 @@ func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json
// ServeAgent accepts a WebSocket connection to an agent that
// listens to incoming connections and publishes node updates.
func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID, name string) error {
c.mutex.Lock()
if c.closed {
c.mutex.Unlock()
@ -289,6 +313,8 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
// This uniquely identifies a connection that belongs to this goroutine.
unique := uuid.New()
now := time.Now().Unix()
overwrites := int64(0)
// If an old agent socket is connected, we close it to avoid any leaks. This
// shouldn't ever occur because we expect one agent to be running, but it's
@ -297,11 +323,17 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
// dead.
oldAgentSocket, ok := c.agentSockets[id]
if ok {
_ = oldAgentSocket.conn.Close()
overwrites = oldAgentSocket.overwrites + 1
_ = oldAgentSocket.Close()
}
c.agentSockets[id] = idConn{
c.agentSockets[id] = &trackedConn{
id: unique,
conn: conn,
Conn: conn,
name: name,
start: now,
lastWrite: now,
overwrites: overwrites,
}
c.mutex.Unlock()
@ -311,7 +343,7 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
// Only delete the connection if it's ours. It could have been
// overwritten.
if idConn := c.agentSockets[id]; idConn.id == unique {
if idConn, ok := c.agentSockets[id]; ok && idConn.id == unique {
delete(c.agentSockets, id)
delete(c.nodes, id)
}
@ -382,7 +414,7 @@ func (c *coordinator) Close() error {
for _, socket := range c.agentSockets {
socket := socket
go func() {
_ = socket.conn.Close()
_ = socket.Close()
wg.Done()
}()
}
@ -403,3 +435,71 @@ func (c *coordinator) Close() error {
wg.Wait()
return nil
}
func (c *coordinator) ServeHTTPDebug(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
now := time.Now()
c.mutex.RLock()
defer c.mutex.RUnlock()
fmt.Fprintln(w, "<h1>in-memory wireguard coordinator debug</h1>")
fmt.Fprintf(w, "<h2 id=agents><a href=#agents>#</a> agents: total %d</h2>\n", len(c.agentSockets))
fmt.Fprintln(w, "<ul>")
for id, conn := range c.agentSockets {
fmt.Fprintf(w, "<li><b>%s</b> (%s): created %v ago, write %v ago, overwrites %d </li>\n",
conn.name,
id.String(),
now.Sub(time.Unix(conn.start, 0)).Round(time.Second),
now.Sub(time.Unix(conn.lastWrite, 0)).Round(time.Second),
conn.overwrites,
)
if connCount := len(c.agentToConnectionSockets[id]); connCount > 0 {
fmt.Fprintf(w, "<h3>connections: total %d</h3>\n", connCount)
fmt.Fprintln(w, "<ul>")
for id, conn := range c.agentToConnectionSockets[id] {
fmt.Fprintf(w, "<li><b>%s</b> (%s): created %v ago, write %v ago </li>\n",
conn.name,
id.String(),
now.Sub(time.Unix(conn.start, 0)).Round(time.Second),
now.Sub(time.Unix(conn.lastWrite, 0)).Round(time.Second),
)
}
fmt.Fprintln(w, "</ul>")
}
}
fmt.Fprintln(w, "</ul>")
missingAgents := map[uuid.UUID]map[uuid.UUID]*trackedConn{}
for agentID, conns := range c.agentToConnectionSockets {
if len(conns) == 0 {
continue
}
if _, ok := c.agentSockets[agentID]; !ok {
missingAgents[agentID] = conns
}
}
fmt.Fprintf(w, "<h2 id=missing-agents><a href=#missing-agents>#</a> missing agents: total %d</h2>\n", len(missingAgents))
fmt.Fprintln(w, "<ul>")
for agentID, conns := range missingAgents {
fmt.Fprintf(w, "<li><b>unknown</b> (%s): created ? ago, write ? ago, overwrites ? </li>\n",
agentID.String(),
)
fmt.Fprintf(w, "<h3>connections: total %d</h3>\n", len(conns))
fmt.Fprintln(w, "<ul>")
for id, conn := range conns {
fmt.Fprintf(w, "<li><b>%s</b> (%s): created %v ago, write %v ago </li>\n",
conn.name,
id.String(),
now.Sub(time.Unix(conn.start, 0)).Round(time.Second),
now.Sub(time.Unix(conn.lastWrite, 0)).Round(time.Second),
)
}
fmt.Fprintln(w, "</ul>")
}
fmt.Fprintln(w, "</ul>")
}

View File

@ -48,7 +48,7 @@ func TestCoordinator(t *testing.T) {
id := uuid.New()
closeChan := make(chan struct{})
go func() {
err := coordinator.ServeAgent(server, id)
err := coordinator.ServeAgent(server, id, "")
assert.NoError(t, err)
close(closeChan)
}()
@ -76,7 +76,7 @@ func TestCoordinator(t *testing.T) {
agentID := uuid.New()
closeAgentChan := make(chan struct{})
go func() {
err := coordinator.ServeAgent(agentServerWS, agentID)
err := coordinator.ServeAgent(agentServerWS, agentID, "")
assert.NoError(t, err)
close(closeAgentChan)
}()
@ -127,7 +127,7 @@ func TestCoordinator(t *testing.T) {
})
closeAgentChan = make(chan struct{})
go func() {
err := coordinator.ServeAgent(agentServerWS, agentID)
err := coordinator.ServeAgent(agentServerWS, agentID, "")
assert.NoError(t, err)
close(closeAgentChan)
}()
@ -160,7 +160,7 @@ func TestCoordinator(t *testing.T) {
agentID := uuid.New()
closeAgentChan1 := make(chan struct{})
go func() {
err := coordinator.ServeAgent(agentServerWS1, agentID)
err := coordinator.ServeAgent(agentServerWS1, agentID, "")
assert.NoError(t, err)
close(closeAgentChan1)
}()
@ -205,7 +205,7 @@ func TestCoordinator(t *testing.T) {
})
closeAgentChan2 := make(chan struct{})
go func() {
err := coordinator.ServeAgent(agentServerWS2, agentID)
err := coordinator.ServeAgent(agentServerWS2, agentID, "")
assert.NoError(t, err)
close(closeAgentChan2)
}()