From b0c4e7504c5edaeecd42319fc1fe92f70cd7187f Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 15 Mar 2024 15:33:49 +0000 Subject: [PATCH] feat(support): add client magicsock and agent prometheus metrics to support bundle (#12604) * feat(codersdk): add ability to fetch prometheus metrics directly from agent * feat(support): add client magicsock and agent prometheus metrics to support bundle * refactor(support): simplify AgentInfo control flow Co-authored-by: Mathias Fredriksson --- agent/agent.go | 25 +++++ agent/api.go | 2 + cli/agent.go | 28 +---- cli/support.go | 4 +- cli/support_test.go | 8 +- codersdk/workspaceagentconn.go | 28 +++++ support/support.go | 196 ++++++++++++++++++++------------- support/support_test.go | 4 +- 8 files changed, 187 insertions(+), 108 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 120ef7cdf7..6f601cbcd7 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -25,6 +25,7 @@ import ( "github.com/go-chi/chi/v5" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/expfmt" "github.com/spf13/afero" "go.uber.org/atomic" "golang.org/x/exp/slices" @@ -34,6 +35,7 @@ import ( "tailscale.com/net/speedtest" "tailscale.com/tailcfg" "tailscale.com/types/netlogtype" + "tailscale.com/util/clientmetric" "cdr.dev/slog" "github.com/coder/retry" @@ -1980,3 +1982,26 @@ func (a *apiConnRoutineManager) start(name string, b gracefulShutdownBehavior, f func (a *apiConnRoutineManager) wait() error { return a.eg.Wait() } + +func PrometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger slog.Logger) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + + // Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489 + clientmetric.WritePrometheusExpositionFormat(w) + + metricFamilies, err := prometheusRegistry.Gather() + if err != nil { + logger.Error(context.Background(), "prometheus handler failed to gather metric families", slog.Error(err)) + return + } + + for _, metricFamily := range metricFamilies { + _, err = expfmt.MetricFamilyToText(w, metricFamily) + if err != nil { + logger.Error(context.Background(), "expfmt.MetricFamilyToText failed", slog.Error(err)) + return + } + } + }) +} diff --git a/agent/api.go b/agent/api.go index 7ca63443a2..e7ed1ca6a1 100644 --- a/agent/api.go +++ b/agent/api.go @@ -35,11 +35,13 @@ func (a *agent) apiHandler() http.Handler { ignorePorts: cpy, cacheDuration: cacheDuration, } + promHandler := PrometheusMetricsHandler(a.prometheusRegistry, a.logger) r.Get("/api/v0/listening-ports", lp.handler) r.Get("/debug/logs", a.HandleHTTPDebugLogs) r.Get("/debug/magicsock", a.HandleHTTPDebugMagicsock) r.Get("/debug/magicsock/debug-logging/{state}", a.HandleHTTPMagicsockDebugLoggingState) r.Get("/debug/manifest", a.HandleHTTPDebugManifest) + r.Get("/debug/prometheus", promHandler.ServeHTTP) return r } diff --git a/cli/agent.go b/cli/agent.go index b900db48d0..8891cf78b6 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -18,10 +18,8 @@ import ( "cloud.google.com/go/compute/metadata" "golang.org/x/xerrors" "gopkg.in/natefinch/lumberjack.v2" - "tailscale.com/util/clientmetric" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/expfmt" "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" @@ -315,7 +313,8 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd { ModifiedProcesses: nil, }) - prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(prometheusRegistry, logger), prometheusAddress, "prometheus") + promHandler := agent.PrometheusMetricsHandler(prometheusRegistry, logger) + prometheusSrvClose := ServeHandler(ctx, logger, promHandler, prometheusAddress, "prometheus") defer prometheusSrvClose() debugSrvClose := ServeHandler(ctx, logger, agnt.HTTPDebug(), debugAddress, "debug") @@ -501,26 +500,3 @@ func urlPort(u string) (int, error) { } return -1, xerrors.Errorf("invalid port: %s", u) } - -func prometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger slog.Logger) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/plain") - - // Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489 - clientmetric.WritePrometheusExpositionFormat(w) - - metricFamilies, err := prometheusRegistry.Gather() - if err != nil { - logger.Error(context.Background(), "Prometheus handler can't gather metric families", slog.Error(err)) - return - } - - for _, metricFamily := range metricFamilies { - _, err = expfmt.MetricFamilyToText(w, metricFamily) - if err != nil { - logger.Error(context.Background(), "expfmt.MetricFamilyToText failed", slog.Error(err)) - return - } - } - }) -} diff --git a/cli/support.go b/cli/support.go index ab6687e442..ba74980c2e 100644 --- a/cli/support.go +++ b/cli/support.go @@ -176,8 +176,10 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error { "network/tailnet_debug.html": src.Network.TailnetDebug, "workspace/build_logs.txt": humanizeBuildLogs(src.Workspace.BuildLogs), "agent/logs.txt": string(src.Agent.Logs), - "agent/magicsock.html": string(src.Agent.MagicsockHTML), + "agent/agent_magicsock.html": string(src.Agent.AgentMagicsockHTML), + "agent/client_magicsock.html": string(src.Agent.ClientMagicsockHTML), "agent/startup_logs.txt": humanizeAgentLogs(src.Agent.StartupLogs), + "agent/prometheus.txt": string(src.Agent.Prometheus), "workspace/template_file.zip": string(templateVersionBytes), "logs.txt": strings.Join(src.Logs, "\n"), } { diff --git a/cli/support_test.go b/cli/support_test.go index b19cbaebc5..5954e975bd 100644 --- a/cli/support_test.go +++ b/cli/support_test.go @@ -177,9 +177,12 @@ func assertBundleContents(t *testing.T, path string) { case "agent/logs.txt": bs := readBytesFromZip(t, f) require.NotEmpty(t, bs, "logs should not be empty") - case "agent/magicsock.html": + case "agent/agent_magicsock.html": bs := readBytesFromZip(t, f) require.NotEmpty(t, bs, "agent magicsock should not be empty") + case "agent/client_magicsock.html": + bs := readBytesFromZip(t, f) + require.NotEmpty(t, bs, "client magicsock should not be empty") case "agent/manifest.json": var v agentsdk.Manifest decodeJSONFromZip(t, f, &v) @@ -192,6 +195,9 @@ func assertBundleContents(t *testing.T, path string) { var v *ipnstate.PingResult decodeJSONFromZip(t, f, &v) require.NotEmpty(t, v, "ping result should not be empty") + case "agent/prometheus.txt": + bs := readBytesFromZip(t, f) + require.NotEmpty(t, bs, "agent prometheus metrics should not be empty") case "agent/startup_logs.txt": bs := readBytesFromZip(t, f) require.Contains(t, string(bs), "started up") diff --git a/codersdk/workspaceagentconn.go b/codersdk/workspaceagentconn.go index 09bed58fe6..cf85ee07de 100644 --- a/codersdk/workspaceagentconn.go +++ b/codersdk/workspaceagentconn.go @@ -364,6 +364,9 @@ func (c *WorkspaceAgentConn) DebugMagicsock(ctx context.Context) ([]byte, error) if err != nil { return nil, xerrors.Errorf("do request: %w", err) } + if res.StatusCode != http.StatusOK { + return nil, ReadBodyAsError(res) + } defer res.Body.Close() bs, err := io.ReadAll(res.Body) if err != nil { @@ -382,6 +385,9 @@ func (c *WorkspaceAgentConn) DebugManifest(ctx context.Context) ([]byte, error) return nil, xerrors.Errorf("do request: %w", err) } defer res.Body.Close() + if res.StatusCode != http.StatusOK { + return nil, ReadBodyAsError(res) + } bs, err := io.ReadAll(res.Body) if err != nil { return nil, xerrors.Errorf("read response body: %w", err) @@ -398,6 +404,28 @@ func (c *WorkspaceAgentConn) DebugLogs(ctx context.Context) ([]byte, error) { return nil, xerrors.Errorf("do request: %w", err) } defer res.Body.Close() + if res.StatusCode != http.StatusOK { + return nil, ReadBodyAsError(res) + } + bs, err := io.ReadAll(res.Body) + if err != nil { + return nil, xerrors.Errorf("read response body: %w", err) + } + return bs, nil +} + +// PrometheusMetrics returns a response from the agent's prometheus metrics endpoint +func (c *WorkspaceAgentConn) PrometheusMetrics(ctx context.Context) ([]byte, error) { + ctx, span := tracing.StartSpan(ctx) + defer span.End() + res, err := c.apiRequest(ctx, http.MethodGet, "/debug/prometheus", nil) + if err != nil { + return nil, xerrors.Errorf("do request: %w", err) + } + defer res.Body.Close() + if res.StatusCode != http.StatusOK { + return nil, ReadBodyAsError(res) + } bs, err := io.ReadAll(res.Body) if err != nil { return nil, xerrors.Errorf("read response body: %w", err) diff --git a/support/support.go b/support/support.go index 2a2c3c55f8..48e570768f 100644 --- a/support/support.go +++ b/support/support.go @@ -7,6 +7,7 @@ import ( "encoding/json" "io" "net/http" + "net/http/httptest" "strings" "golang.org/x/sync/errgroup" @@ -57,14 +58,16 @@ type Workspace struct { } type Agent struct { - Agent *codersdk.WorkspaceAgent `json:"agent"` - ListeningPorts *codersdk.WorkspaceAgentListeningPortsResponse `json:"listening_ports"` - Logs []byte `json:"logs"` - MagicsockHTML []byte `json:"magicsock_html"` - Manifest *agentsdk.Manifest `json:"manifest"` - PeerDiagnostics *tailnet.PeerDiagnostics `json:"peer_diagnostics"` - PingResult *ipnstate.PingResult `json:"ping_result"` - StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"` + Agent *codersdk.WorkspaceAgent `json:"agent"` + ListeningPorts *codersdk.WorkspaceAgentListeningPortsResponse `json:"listening_ports"` + Logs []byte `json:"logs"` + ClientMagicsockHTML []byte `json:"client_magicsock_html"` + AgentMagicsockHTML []byte `json:"agent_magicsock_html"` + Manifest *agentsdk.Manifest `json:"manifest"` + PeerDiagnostics *tailnet.PeerDiagnostics `json:"peer_diagnostics"` + PingResult *ipnstate.PingResult `json:"ping_result"` + Prometheus []byte `json:"prometheus"` + StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"` } // Deps is a set of dependencies for discovering information @@ -313,77 +316,10 @@ func AgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, ag return nil }) - conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{ - Logger: log.Named("dial-agent"), - BlockEndpoints: false, - }) - if err != nil { - log.Error(ctx, "dial agent", slog.Error(err)) - } else { - defer func() { - if err := conn.Close(); err != nil { - log.Error(ctx, "failed to close agent connection", slog.Error(err)) - } - <-conn.Closed() - }() - if !conn.AwaitReachable(ctx) { - log.Error(ctx, "timed out waiting for agent") - } else { - eg.Go(func() error { - _, _, pingRes, err := conn.Ping(ctx) - if err != nil { - return xerrors.Errorf("ping agent: %w", err) - } - a.PingResult = pingRes - return nil - }) - - eg.Go(func() error { - pds := conn.GetPeerDiagnostics() - a.PeerDiagnostics = &pds - return nil - }) - - eg.Go(func() error { - msBytes, err := conn.DebugMagicsock(ctx) - if err != nil { - return xerrors.Errorf("get agent magicsock page: %w", err) - } - a.MagicsockHTML = msBytes - return nil - }) - - eg.Go(func() error { - manifestRes, err := conn.DebugManifest(ctx) - if err != nil { - return xerrors.Errorf("fetch manifest: %w", err) - } - if err := json.NewDecoder(bytes.NewReader(manifestRes)).Decode(&a.Manifest); err != nil { - return xerrors.Errorf("decode agent manifest: %w", err) - } - - return nil - }) - - eg.Go(func() error { - logBytes, err := conn.DebugLogs(ctx) - if err != nil { - return xerrors.Errorf("fetch coder agent logs: %w", err) - } - a.Logs = logBytes - return nil - }) - - eg.Go(func() error { - lps, err := conn.ListeningPorts(ctx) - if err != nil { - return xerrors.Errorf("get listening ports: %w", err) - } - a.ListeningPorts = &lps - return nil - }) - } - } + // to simplify control flow, fetching information directly from + // the agent is handled in a separate function + closer := connectedAgentInfo(ctx, client, log, agentID, &eg, &a) + defer closer() if err := eg.Wait(); err != nil { log.Error(ctx, "fetch agent information", slog.Error(err)) @@ -392,6 +328,108 @@ func AgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, ag return a } +func connectedAgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, agentID uuid.UUID, eg *errgroup.Group, a *Agent) (closer func()) { + conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{ + Logger: log.Named("dial-agent"), + BlockEndpoints: false, + }) + + closer = func() {} + + if err != nil { + log.Error(ctx, "dial agent", slog.Error(err)) + return closer + } + + if !conn.AwaitReachable(ctx) { + log.Error(ctx, "timed out waiting for agent") + return closer + } + + closer = func() { + if err := conn.Close(); err != nil { + log.Error(ctx, "failed to close agent connection", slog.Error(err)) + } + <-conn.Closed() + } + + eg.Go(func() error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/", nil) + if err != nil { + return xerrors.Errorf("create request: %w", err) + } + rr := httptest.NewRecorder() + conn.MagicsockServeHTTPDebug(rr, req) + a.ClientMagicsockHTML = rr.Body.Bytes() + return nil + }) + + eg.Go(func() error { + promRes, err := conn.PrometheusMetrics(ctx) + if err != nil { + return xerrors.Errorf("fetch agent prometheus metrics: %w", err) + } + a.Prometheus = promRes + return nil + }) + + eg.Go(func() error { + _, _, pingRes, err := conn.Ping(ctx) + if err != nil { + return xerrors.Errorf("ping agent: %w", err) + } + a.PingResult = pingRes + return nil + }) + + eg.Go(func() error { + pds := conn.GetPeerDiagnostics() + a.PeerDiagnostics = &pds + return nil + }) + + eg.Go(func() error { + msBytes, err := conn.DebugMagicsock(ctx) + if err != nil { + return xerrors.Errorf("get agent magicsock page: %w", err) + } + a.AgentMagicsockHTML = msBytes + return nil + }) + + eg.Go(func() error { + manifestRes, err := conn.DebugManifest(ctx) + if err != nil { + return xerrors.Errorf("fetch manifest: %w", err) + } + if err := json.NewDecoder(bytes.NewReader(manifestRes)).Decode(&a.Manifest); err != nil { + return xerrors.Errorf("decode agent manifest: %w", err) + } + + return nil + }) + + eg.Go(func() error { + logBytes, err := conn.DebugLogs(ctx) + if err != nil { + return xerrors.Errorf("fetch coder agent logs: %w", err) + } + a.Logs = logBytes + return nil + }) + + eg.Go(func() error { + lps, err := conn.ListeningPorts(ctx) + if err != nil { + return xerrors.Errorf("get listening ports: %w", err) + } + a.ListeningPorts = &lps + return nil + }) + + return closer +} + // Run generates a support bundle with the given dependencies. func Run(ctx context.Context, d *Deps) (*Bundle, error) { var b Bundle diff --git a/support/support_test.go b/support/support_test.go index 6876bafbc5..6b790627ca 100644 --- a/support/support_test.go +++ b/support/support_test.go @@ -75,9 +75,11 @@ func TestRun(t *testing.T) { assertNotNilNotEmpty(t, bun.Agent.Agent, "agent should be present") assertNotNilNotEmpty(t, bun.Agent.ListeningPorts, "agent listening ports should be present") assertNotNilNotEmpty(t, bun.Agent.Logs, "agent logs should be present") - assertNotNilNotEmpty(t, bun.Agent.MagicsockHTML, "agent magicsock should be present") + assertNotNilNotEmpty(t, bun.Agent.AgentMagicsockHTML, "agent magicsock should be present") + assertNotNilNotEmpty(t, bun.Agent.ClientMagicsockHTML, "client magicsock should be present") assertNotNilNotEmpty(t, bun.Agent.PeerDiagnostics, "agent peer diagnostics should be present") assertNotNilNotEmpty(t, bun.Agent.PingResult, "agent ping result should be present") + assertNotNilNotEmpty(t, bun.Agent.Prometheus, "agent prometheus metrics should be present") assertNotNilNotEmpty(t, bun.Agent.StartupLogs, "agent startup logs should be present") assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present") })