feat(support): add client magicsock and agent prometheus metrics to support bundle (#12604)

* feat(codersdk): add ability to fetch prometheus metrics directly from agent
* feat(support): add client magicsock and agent prometheus metrics to support bundle
* refactor(support): simplify AgentInfo control flow

Co-authored-by: Mathias Fredriksson <mafredri@gmail.com>
This commit is contained in:
Cian Johnston 2024-03-15 15:33:49 +00:00 committed by GitHub
parent 4d9e6c0134
commit b0c4e7504c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 187 additions and 108 deletions

View File

@ -25,6 +25,7 @@ import (
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/expfmt"
"github.com/spf13/afero"
"go.uber.org/atomic"
"golang.org/x/exp/slices"
@ -34,6 +35,7 @@ import (
"tailscale.com/net/speedtest"
"tailscale.com/tailcfg"
"tailscale.com/types/netlogtype"
"tailscale.com/util/clientmetric"
"cdr.dev/slog"
"github.com/coder/retry"
@ -1980,3 +1982,26 @@ func (a *apiConnRoutineManager) start(name string, b gracefulShutdownBehavior, f
func (a *apiConnRoutineManager) wait() error {
return a.eg.Wait()
}
// PrometheusMetricsHandler returns an http.Handler that responds with the
// tailscale client metrics followed by every metric family gathered from
// prometheusRegistry, rendered in the Prometheus text format.
//
// Failures while gathering or encoding are logged to logger and end the
// response early. Because output may already have been written to the
// response by that point, no error status is sent to the client.
func PrometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger slog.Logger) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Log with the request's context rather than a detached background
		// context so request-scoped values travel with the log entries.
		ctx := r.Context()

		w.Header().Set("Content-Type", "text/plain")

		// Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489
		clientmetric.WritePrometheusExpositionFormat(w)

		metricFamilies, err := prometheusRegistry.Gather()
		if err != nil {
			logger.Error(ctx, "prometheus handler failed to gather metric families", slog.Error(err))
			return
		}

		for _, metricFamily := range metricFamilies {
			if _, err := expfmt.MetricFamilyToText(w, metricFamily); err != nil {
				logger.Error(ctx, "expfmt.MetricFamilyToText failed", slog.Error(err))
				return
			}
		}
	})
}

View File

@ -35,11 +35,13 @@ func (a *agent) apiHandler() http.Handler {
ignorePorts: cpy,
cacheDuration: cacheDuration,
}
promHandler := PrometheusMetricsHandler(a.prometheusRegistry, a.logger)
r.Get("/api/v0/listening-ports", lp.handler)
r.Get("/debug/logs", a.HandleHTTPDebugLogs)
r.Get("/debug/magicsock", a.HandleHTTPDebugMagicsock)
r.Get("/debug/magicsock/debug-logging/{state}", a.HandleHTTPMagicsockDebugLoggingState)
r.Get("/debug/manifest", a.HandleHTTPDebugManifest)
r.Get("/debug/prometheus", promHandler.ServeHTTP)
return r
}

View File

@ -18,10 +18,8 @@ import (
"cloud.google.com/go/compute/metadata"
"golang.org/x/xerrors"
"gopkg.in/natefinch/lumberjack.v2"
"tailscale.com/util/clientmetric"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/expfmt"
"cdr.dev/slog"
"cdr.dev/slog/sloggers/sloghuman"
@ -315,7 +313,8 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
ModifiedProcesses: nil,
})
prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(prometheusRegistry, logger), prometheusAddress, "prometheus")
promHandler := agent.PrometheusMetricsHandler(prometheusRegistry, logger)
prometheusSrvClose := ServeHandler(ctx, logger, promHandler, prometheusAddress, "prometheus")
defer prometheusSrvClose()
debugSrvClose := ServeHandler(ctx, logger, agnt.HTTPDebug(), debugAddress, "debug")
@ -501,26 +500,3 @@ func urlPort(u string) (int, error) {
}
return -1, xerrors.Errorf("invalid port: %s", u)
}
// prometheusMetricsHandler returns an http.Handler that writes the tailscale
// client metrics followed by each metric family gathered from
// prometheusRegistry in text form. Gather and encode errors are logged to
// logger and stop the response early.
func prometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger slog.Logger) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
// Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489
clientmetric.WritePrometheusExpositionFormat(w)
metricFamilies, err := prometheusRegistry.Gather()
if err != nil {
logger.Error(context.Background(), "Prometheus handler can't gather metric families", slog.Error(err))
return
}
// Append every gathered family after the client metrics already written.
for _, metricFamily := range metricFamilies {
_, err = expfmt.MetricFamilyToText(w, metricFamily)
if err != nil {
logger.Error(context.Background(), "expfmt.MetricFamilyToText failed", slog.Error(err))
return
}
}
})
}

View File

@ -176,8 +176,10 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
"network/tailnet_debug.html": src.Network.TailnetDebug,
"workspace/build_logs.txt": humanizeBuildLogs(src.Workspace.BuildLogs),
"agent/logs.txt": string(src.Agent.Logs),
"agent/magicsock.html": string(src.Agent.MagicsockHTML),
"agent/agent_magicsock.html": string(src.Agent.AgentMagicsockHTML),
"agent/client_magicsock.html": string(src.Agent.ClientMagicsockHTML),
"agent/startup_logs.txt": humanizeAgentLogs(src.Agent.StartupLogs),
"agent/prometheus.txt": string(src.Agent.Prometheus),
"workspace/template_file.zip": string(templateVersionBytes),
"logs.txt": strings.Join(src.Logs, "\n"),
} {

View File

@ -177,9 +177,12 @@ func assertBundleContents(t *testing.T, path string) {
case "agent/logs.txt":
bs := readBytesFromZip(t, f)
require.NotEmpty(t, bs, "logs should not be empty")
case "agent/magicsock.html":
case "agent/agent_magicsock.html":
bs := readBytesFromZip(t, f)
require.NotEmpty(t, bs, "agent magicsock should not be empty")
case "agent/client_magicsock.html":
bs := readBytesFromZip(t, f)
require.NotEmpty(t, bs, "client magicsock should not be empty")
case "agent/manifest.json":
var v agentsdk.Manifest
decodeJSONFromZip(t, f, &v)
@ -192,6 +195,9 @@ func assertBundleContents(t *testing.T, path string) {
var v *ipnstate.PingResult
decodeJSONFromZip(t, f, &v)
require.NotEmpty(t, v, "ping result should not be empty")
case "agent/prometheus.txt":
bs := readBytesFromZip(t, f)
require.NotEmpty(t, bs, "agent prometheus metrics should not be empty")
case "agent/startup_logs.txt":
bs := readBytesFromZip(t, f)
require.Contains(t, string(bs), "started up")

View File

@ -364,6 +364,9 @@ func (c *WorkspaceAgentConn) DebugMagicsock(ctx context.Context) ([]byte, error)
if err != nil {
return nil, xerrors.Errorf("do request: %w", err)
}
if res.StatusCode != http.StatusOK {
return nil, ReadBodyAsError(res)
}
defer res.Body.Close()
bs, err := io.ReadAll(res.Body)
if err != nil {
@ -382,6 +385,9 @@ func (c *WorkspaceAgentConn) DebugManifest(ctx context.Context) ([]byte, error)
return nil, xerrors.Errorf("do request: %w", err)
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, ReadBodyAsError(res)
}
bs, err := io.ReadAll(res.Body)
if err != nil {
return nil, xerrors.Errorf("read response body: %w", err)
@ -398,6 +404,28 @@ func (c *WorkspaceAgentConn) DebugLogs(ctx context.Context) ([]byte, error) {
return nil, xerrors.Errorf("do request: %w", err)
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, ReadBodyAsError(res)
}
bs, err := io.ReadAll(res.Body)
if err != nil {
return nil, xerrors.Errorf("read response body: %w", err)
}
return bs, nil
}
// PrometheusMetrics returns a response from the agent's prometheus metrics endpoint
func (c *WorkspaceAgentConn) PrometheusMetrics(ctx context.Context) ([]byte, error) {
ctx, span := tracing.StartSpan(ctx)
defer span.End()
res, err := c.apiRequest(ctx, http.MethodGet, "/debug/prometheus", nil)
if err != nil {
return nil, xerrors.Errorf("do request: %w", err)
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, ReadBodyAsError(res)
}
bs, err := io.ReadAll(res.Body)
if err != nil {
return nil, xerrors.Errorf("read response body: %w", err)

View File

@ -7,6 +7,7 @@ import (
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"strings"
"golang.org/x/sync/errgroup"
@ -57,14 +58,16 @@ type Workspace struct {
}
type Agent struct {
Agent *codersdk.WorkspaceAgent `json:"agent"`
ListeningPorts *codersdk.WorkspaceAgentListeningPortsResponse `json:"listening_ports"`
Logs []byte `json:"logs"`
MagicsockHTML []byte `json:"magicsock_html"`
Manifest *agentsdk.Manifest `json:"manifest"`
PeerDiagnostics *tailnet.PeerDiagnostics `json:"peer_diagnostics"`
PingResult *ipnstate.PingResult `json:"ping_result"`
StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"`
Agent *codersdk.WorkspaceAgent `json:"agent"`
ListeningPorts *codersdk.WorkspaceAgentListeningPortsResponse `json:"listening_ports"`
Logs []byte `json:"logs"`
ClientMagicsockHTML []byte `json:"client_magicsock_html"`
AgentMagicsockHTML []byte `json:"agent_magicsock_html"`
Manifest *agentsdk.Manifest `json:"manifest"`
PeerDiagnostics *tailnet.PeerDiagnostics `json:"peer_diagnostics"`
PingResult *ipnstate.PingResult `json:"ping_result"`
Prometheus []byte `json:"prometheus"`
StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"`
}
// Deps is a set of dependencies for discovering information
@ -313,77 +316,10 @@ func AgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, ag
return nil
})
conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{
Logger: log.Named("dial-agent"),
BlockEndpoints: false,
})
if err != nil {
log.Error(ctx, "dial agent", slog.Error(err))
} else {
defer func() {
if err := conn.Close(); err != nil {
log.Error(ctx, "failed to close agent connection", slog.Error(err))
}
<-conn.Closed()
}()
if !conn.AwaitReachable(ctx) {
log.Error(ctx, "timed out waiting for agent")
} else {
eg.Go(func() error {
_, _, pingRes, err := conn.Ping(ctx)
if err != nil {
return xerrors.Errorf("ping agent: %w", err)
}
a.PingResult = pingRes
return nil
})
eg.Go(func() error {
pds := conn.GetPeerDiagnostics()
a.PeerDiagnostics = &pds
return nil
})
eg.Go(func() error {
msBytes, err := conn.DebugMagicsock(ctx)
if err != nil {
return xerrors.Errorf("get agent magicsock page: %w", err)
}
a.MagicsockHTML = msBytes
return nil
})
eg.Go(func() error {
manifestRes, err := conn.DebugManifest(ctx)
if err != nil {
return xerrors.Errorf("fetch manifest: %w", err)
}
if err := json.NewDecoder(bytes.NewReader(manifestRes)).Decode(&a.Manifest); err != nil {
return xerrors.Errorf("decode agent manifest: %w", err)
}
return nil
})
eg.Go(func() error {
logBytes, err := conn.DebugLogs(ctx)
if err != nil {
return xerrors.Errorf("fetch coder agent logs: %w", err)
}
a.Logs = logBytes
return nil
})
eg.Go(func() error {
lps, err := conn.ListeningPorts(ctx)
if err != nil {
return xerrors.Errorf("get listening ports: %w", err)
}
a.ListeningPorts = &lps
return nil
})
}
}
// to simplify control flow, fetching information directly from
// the agent is handled in a separate function
closer := connectedAgentInfo(ctx, client, log, agentID, &eg, &a)
defer closer()
if err := eg.Wait(); err != nil {
log.Error(ctx, "fetch agent information", slog.Error(err))
@ -392,6 +328,108 @@ func AgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, ag
return a
}
// connectedAgentInfo dials the workspace agent identified by agentID and, once
// it is reachable, schedules goroutines on eg that collect diagnostics over
// that connection and store them in the corresponding fields of a.
//
// The returned closer is never nil. If dialing fails it is a no-op; otherwise
// it closes the connection and blocks until the connection reports closed.
// The scheduled goroutines use the connection, so callers should wait on eg
// before invoking closer.
func connectedAgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, agentID uuid.UUID, eg *errgroup.Group, a *Agent) (closer func()) {
	conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{
		Logger:         log.Named("dial-agent"),
		BlockEndpoints: false,
	})
	if err != nil {
		log.Error(ctx, "dial agent", slog.Error(err))
		return func() {}
	}

	// Build the real closer as soon as the dial succeeds so the connection is
	// released on every subsequent return path, including the case where the
	// agent never becomes reachable.
	closer = func() {
		if err := conn.Close(); err != nil {
			log.Error(ctx, "failed to close agent connection", slog.Error(err))
		}
		<-conn.Closed()
	}

	if !conn.AwaitReachable(ctx) {
		log.Error(ctx, "timed out waiting for agent")
		return closer
	}

	// Client-side magicsock debug page: served in-process into a recorder
	// instead of being fetched from the agent.
	eg.Go(func() error {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/", nil)
		if err != nil {
			return xerrors.Errorf("create request: %w", err)
		}
		rr := httptest.NewRecorder()
		conn.MagicsockServeHTTPDebug(rr, req)
		a.ClientMagicsockHTML = rr.Body.Bytes()
		return nil
	})

	eg.Go(func() error {
		promRes, err := conn.PrometheusMetrics(ctx)
		if err != nil {
			return xerrors.Errorf("fetch agent prometheus metrics: %w", err)
		}
		a.Prometheus = promRes
		return nil
	})

	eg.Go(func() error {
		_, _, pingRes, err := conn.Ping(ctx)
		if err != nil {
			return xerrors.Errorf("ping agent: %w", err)
		}
		a.PingResult = pingRes
		return nil
	})

	eg.Go(func() error {
		pds := conn.GetPeerDiagnostics()
		a.PeerDiagnostics = &pds
		return nil
	})

	// Agent-side magicsock debug page, fetched from the agent.
	eg.Go(func() error {
		msBytes, err := conn.DebugMagicsock(ctx)
		if err != nil {
			return xerrors.Errorf("get agent magicsock page: %w", err)
		}
		a.AgentMagicsockHTML = msBytes
		return nil
	})

	eg.Go(func() error {
		manifestRes, err := conn.DebugManifest(ctx)
		if err != nil {
			return xerrors.Errorf("fetch manifest: %w", err)
		}
		if err := json.NewDecoder(bytes.NewReader(manifestRes)).Decode(&a.Manifest); err != nil {
			return xerrors.Errorf("decode agent manifest: %w", err)
		}
		return nil
	})

	eg.Go(func() error {
		logBytes, err := conn.DebugLogs(ctx)
		if err != nil {
			return xerrors.Errorf("fetch coder agent logs: %w", err)
		}
		a.Logs = logBytes
		return nil
	})

	eg.Go(func() error {
		lps, err := conn.ListeningPorts(ctx)
		if err != nil {
			return xerrors.Errorf("get listening ports: %w", err)
		}
		a.ListeningPorts = &lps
		return nil
	})

	return closer
}
// Run generates a support bundle with the given dependencies.
func Run(ctx context.Context, d *Deps) (*Bundle, error) {
var b Bundle

View File

@ -75,9 +75,11 @@ func TestRun(t *testing.T) {
assertNotNilNotEmpty(t, bun.Agent.Agent, "agent should be present")
assertNotNilNotEmpty(t, bun.Agent.ListeningPorts, "agent listening ports should be present")
assertNotNilNotEmpty(t, bun.Agent.Logs, "agent logs should be present")
assertNotNilNotEmpty(t, bun.Agent.MagicsockHTML, "agent magicsock should be present")
assertNotNilNotEmpty(t, bun.Agent.AgentMagicsockHTML, "agent magicsock should be present")
assertNotNilNotEmpty(t, bun.Agent.ClientMagicsockHTML, "client magicsock should be present")
assertNotNilNotEmpty(t, bun.Agent.PeerDiagnostics, "agent peer diagnostics should be present")
assertNotNilNotEmpty(t, bun.Agent.PingResult, "agent ping result should be present")
assertNotNilNotEmpty(t, bun.Agent.Prometheus, "agent prometheus metrics should be present")
assertNotNilNotEmpty(t, bun.Agent.StartupLogs, "agent startup logs should be present")
assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present")
})