mirror of https://github.com/coder/coder.git
507 lines
11 KiB
Go
507 lines
11 KiB
Go
package derphealth
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net"
|
|
"net/netip"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"golang.org/x/exp/slices"
|
|
"golang.org/x/xerrors"
|
|
"tailscale.com/derp"
|
|
"tailscale.com/derp/derphttp"
|
|
"tailscale.com/net/netcheck"
|
|
"tailscale.com/net/portmapper"
|
|
"tailscale.com/prober"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/types/key"
|
|
tslogger "tailscale.com/types/logger"
|
|
|
|
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
|
"github.com/coder/coder/v2/coderd/util/ptr"
|
|
"github.com/coder/coder/v2/coderd/util/slice"
|
|
"github.com/coder/coder/v2/codersdk/healthsdk"
|
|
)
|
|
|
|
const (
|
|
warningNodeUsesWebsocket = `Node uses WebSockets because the "Upgrade: DERP" header may be blocked on the load balancer.`
|
|
oneNodeUnhealthy = "Region is operational, but performance might be degraded as one node is unhealthy."
|
|
missingNodeReport = "Missing node health report, probably a developer error."
|
|
)
|
|
|
|
type ReportOptions struct {
|
|
Dismissed bool
|
|
|
|
DERPMap *tailcfg.DERPMap
|
|
}
|
|
|
|
type Report healthsdk.DERPHealthReport
|
|
|
|
type RegionReport struct {
|
|
healthsdk.DERPRegionReport
|
|
mu sync.Mutex
|
|
}
|
|
|
|
type NodeReport struct {
|
|
healthsdk.DERPNodeReport
|
|
mu sync.Mutex
|
|
clientCounter int
|
|
}
|
|
|
|
func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
|
|
r.Healthy = true
|
|
r.Severity = health.SeverityOK
|
|
r.Warnings = []health.Message{}
|
|
r.Dismissed = opts.Dismissed
|
|
|
|
r.Regions = map[int]*healthsdk.DERPRegionReport{}
|
|
|
|
wg := &sync.WaitGroup{}
|
|
mu := sync.Mutex{}
|
|
|
|
wg.Add(len(opts.DERPMap.Regions))
|
|
for _, region := range opts.DERPMap.Regions {
|
|
var (
|
|
region = region
|
|
regionReport = RegionReport{
|
|
DERPRegionReport: healthsdk.DERPRegionReport{
|
|
Region: region,
|
|
},
|
|
}
|
|
)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
regionReport.Error = ptr.Ref(fmt.Sprint(err))
|
|
}
|
|
}()
|
|
|
|
regionReport.Run(ctx)
|
|
|
|
mu.Lock()
|
|
r.Regions[region.RegionID] = ®ionReport.DERPRegionReport
|
|
if !regionReport.Healthy {
|
|
r.Healthy = false
|
|
}
|
|
|
|
r.Warnings = append(r.Warnings, regionReport.Warnings...)
|
|
mu.Unlock()
|
|
}()
|
|
}
|
|
|
|
ncLogf := func(format string, args ...interface{}) {
|
|
mu.Lock()
|
|
r.NetcheckLogs = append(r.NetcheckLogs, fmt.Sprintf(format, args...))
|
|
mu.Unlock()
|
|
}
|
|
nc := &netcheck.Client{
|
|
PortMapper: portmapper.NewClient(tslogger.WithPrefix(ncLogf, "portmap: "), nil, nil, nil),
|
|
Logf: tslogger.WithPrefix(ncLogf, "netcheck: "),
|
|
}
|
|
ncReport, netcheckErr := nc.GetReport(ctx, opts.DERPMap)
|
|
r.Netcheck = ncReport
|
|
r.NetcheckErr = convertError(netcheckErr)
|
|
|
|
wg.Wait()
|
|
|
|
// Review region reports and select the highest severity.
|
|
for _, regionReport := range r.Regions {
|
|
if regionReport.Severity.Value() > r.Severity.Value() {
|
|
r.Severity = regionReport.Severity
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r *RegionReport) Run(ctx context.Context) {
|
|
r.Healthy = true
|
|
r.Severity = health.SeverityOK
|
|
r.NodeReports = []*healthsdk.DERPNodeReport{}
|
|
r.Warnings = []health.Message{}
|
|
|
|
wg := &sync.WaitGroup{}
|
|
var unhealthyNodes int // atomic.Int64 is not mandatory as we depend on RegionReport mutex.
|
|
|
|
wg.Add(len(r.Region.Nodes))
|
|
for _, node := range r.Region.Nodes {
|
|
var (
|
|
node = node
|
|
nodeReport = NodeReport{
|
|
DERPNodeReport: healthsdk.DERPNodeReport{
|
|
Node: node,
|
|
Healthy: true,
|
|
},
|
|
}
|
|
)
|
|
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
nodeReport.Error = ptr.Ref(fmt.Sprint(err))
|
|
nodeReport.Severity = health.SeverityError
|
|
}
|
|
}()
|
|
|
|
nodeReport.Run(ctx)
|
|
|
|
r.mu.Lock()
|
|
r.NodeReports = append(r.NodeReports, &nodeReport.DERPNodeReport)
|
|
if nodeReport.Severity != health.SeverityOK {
|
|
unhealthyNodes++
|
|
}
|
|
|
|
r.Warnings = append(r.Warnings, nodeReport.Warnings...)
|
|
r.mu.Unlock()
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
sortNodeReports(r.NodeReports)
|
|
|
|
if len(r.Region.Nodes) != len(r.NodeReports) {
|
|
r.Healthy = false
|
|
r.Severity = health.SeverityError
|
|
r.Error = ptr.Ref(missingNodeReport)
|
|
return
|
|
}
|
|
|
|
if len(r.Region.Nodes) == 1 {
|
|
r.Healthy = r.NodeReports[0].Severity != health.SeverityError
|
|
r.Severity = r.NodeReports[0].Severity
|
|
} else if unhealthyNodes == 1 {
|
|
// r.Healthy = true (by default)
|
|
r.Severity = health.SeverityWarning
|
|
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPOneNodeUnhealthy, oneNodeUnhealthy))
|
|
} else if unhealthyNodes > 1 {
|
|
r.Healthy = false
|
|
|
|
// Review node reports and select the highest severity.
|
|
for _, nodeReport := range r.NodeReports {
|
|
if nodeReport.Severity.Value() > r.Severity.Value() {
|
|
r.Severity = nodeReport.Severity
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r *NodeReport) derpURL() *url.URL {
|
|
derpURL := &url.URL{
|
|
Scheme: "https",
|
|
Host: r.Node.HostName,
|
|
Path: "/derp",
|
|
}
|
|
if r.Node.ForceHTTP {
|
|
derpURL.Scheme = "http"
|
|
}
|
|
if r.Node.HostName == "" {
|
|
derpURL.Host = r.Node.IPv4
|
|
}
|
|
if r.Node.DERPPort != 0 && !(r.Node.DERPPort == 443 && derpURL.Scheme == "https") && !(r.Node.DERPPort == 80 && derpURL.Scheme == "http") {
|
|
derpURL.Host = fmt.Sprintf("%s:%d", derpURL.Host, r.Node.DERPPort)
|
|
}
|
|
|
|
return derpURL
|
|
}
|
|
|
|
func (r *NodeReport) Run(ctx context.Context) {
|
|
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
r.Severity = health.SeverityOK
|
|
r.ClientLogs = [][]string{}
|
|
r.ClientErrs = [][]string{}
|
|
r.Warnings = []health.Message{}
|
|
|
|
wg := &sync.WaitGroup{}
|
|
|
|
wg.Add(2)
|
|
go func() {
|
|
defer wg.Done()
|
|
r.doExchangeMessage(ctx)
|
|
}()
|
|
go func() {
|
|
defer wg.Done()
|
|
r.doSTUNTest(ctx)
|
|
}()
|
|
|
|
wg.Wait()
|
|
|
|
// We can't exchange messages with the node,
|
|
if (!r.CanExchangeMessages && !r.Node.STUNOnly) ||
|
|
// The node was marked as STUN compatible but the STUN test failed.
|
|
r.STUN.Error != nil {
|
|
r.Healthy = false
|
|
r.Severity = health.SeverityError
|
|
}
|
|
|
|
if r.UsesWebsocket {
|
|
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPNodeUsesWebsocket, warningNodeUsesWebsocket))
|
|
r.Severity = health.SeverityWarning
|
|
}
|
|
}
|
|
|
|
func (r *NodeReport) doExchangeMessage(ctx context.Context) {
|
|
if r.Node.STUNOnly {
|
|
return
|
|
}
|
|
|
|
var (
|
|
peerKey atomic.Pointer[key.NodePublic]
|
|
lastSent atomic.Pointer[time.Time]
|
|
)
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
wg := &sync.WaitGroup{}
|
|
|
|
receive, receiveID, err := r.derpClient(ctx, r.derpURL())
|
|
if err != nil {
|
|
return
|
|
}
|
|
defer receive.Close()
|
|
|
|
wg.Add(2)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer receive.Close()
|
|
|
|
pkt, err := r.recvData(receive)
|
|
if err != nil {
|
|
r.writeClientErr(receiveID, xerrors.Errorf("recv derp message: %w", err))
|
|
return
|
|
}
|
|
|
|
if *peerKey.Load() != pkt.Source {
|
|
r.writeClientErr(receiveID, xerrors.Errorf("received pkt from unknown peer: %s", pkt.Source.ShortString()))
|
|
return
|
|
}
|
|
|
|
t := lastSent.Load()
|
|
|
|
r.mu.Lock()
|
|
r.CanExchangeMessages = true
|
|
rtt := time.Since(*t)
|
|
r.RoundTripPing = rtt.String()
|
|
r.RoundTripPingMs = int(rtt.Milliseconds())
|
|
r.mu.Unlock()
|
|
|
|
cancel()
|
|
}()
|
|
go func() {
|
|
defer wg.Done()
|
|
send, sendID, err := r.derpClient(ctx, r.derpURL())
|
|
if err != nil {
|
|
return
|
|
}
|
|
defer send.Close()
|
|
|
|
pk := send.SelfPublicKey()
|
|
peerKey.Store(&pk)
|
|
|
|
ticker := time.NewTicker(time.Second)
|
|
defer ticker.Stop()
|
|
|
|
var iter uint8
|
|
for {
|
|
lastSent.Store(ptr.Ref(time.Now()))
|
|
err = send.Send(receive.SelfPublicKey(), []byte{iter})
|
|
if err != nil {
|
|
r.writeClientErr(sendID, xerrors.Errorf("send derp message: %w", err))
|
|
return
|
|
}
|
|
iter++
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
}
|
|
}()
|
|
|
|
wg.Wait()
|
|
}
|
|
|
|
func (r *NodeReport) doSTUNTest(ctx context.Context) {
|
|
if r.Node.STUNPort == -1 {
|
|
return
|
|
}
|
|
r.mu.Lock()
|
|
r.STUN.Enabled = true
|
|
r.mu.Unlock()
|
|
|
|
addr, port, err := r.stunAddr(ctx)
|
|
if err != nil {
|
|
r.STUN.Error = convertError(xerrors.Errorf("get stun addr: %w", err))
|
|
return
|
|
}
|
|
|
|
// We only create a prober to call ProbeUDP manually.
|
|
p, err := prober.DERP(prober.New(), "", time.Second, time.Second, time.Second)
|
|
if err != nil {
|
|
r.STUN.Error = convertError(xerrors.Errorf("create prober: %w", err))
|
|
return
|
|
}
|
|
|
|
err = p.ProbeUDP(addr, port)(ctx)
|
|
if err != nil {
|
|
r.STUN.Error = convertError(xerrors.Errorf("probe stun: %w", err))
|
|
return
|
|
}
|
|
|
|
r.mu.Lock()
|
|
r.STUN.CanSTUN = true
|
|
r.mu.Unlock()
|
|
}
|
|
|
|
func (r *NodeReport) stunAddr(ctx context.Context) (string, int, error) {
|
|
port := r.Node.STUNPort
|
|
if port == 0 {
|
|
port = 3478
|
|
}
|
|
if port < 0 || port > 1<<16-1 {
|
|
return "", 0, xerrors.Errorf("invalid stun port %d", port)
|
|
}
|
|
|
|
if r.Node.STUNTestIP != "" {
|
|
ip, err := netip.ParseAddr(r.Node.STUNTestIP)
|
|
if err != nil {
|
|
return "", 0, xerrors.Errorf("invalid stun test ip %q: %w", r.Node.STUNTestIP, err)
|
|
}
|
|
|
|
return ip.String(), port, nil
|
|
}
|
|
|
|
if r.Node.HostName != "" {
|
|
addrs, err := net.DefaultResolver.LookupIPAddr(ctx, r.Node.HostName)
|
|
if err != nil {
|
|
return "", 0, xerrors.Errorf("lookup ip addr: %w", err)
|
|
}
|
|
for _, a := range addrs {
|
|
return a.String(), port, nil
|
|
}
|
|
}
|
|
|
|
if r.Node.IPv4 != "" {
|
|
ip, err := netip.ParseAddr(r.Node.IPv4)
|
|
if err != nil {
|
|
return "", 0, xerrors.Errorf("invalid ipv4 %q: %w", r.Node.IPv4, err)
|
|
}
|
|
|
|
if !ip.Is4() {
|
|
return "", 0, xerrors.Errorf("provided node ipv4 is not v4 %q: %w", r.Node.IPv4, err)
|
|
}
|
|
|
|
return ip.String(), port, nil
|
|
}
|
|
if r.Node.IPv6 != "" {
|
|
ip, err := netip.ParseAddr(r.Node.IPv6)
|
|
if err != nil {
|
|
return "", 0, xerrors.Errorf("invalid ipv6 %q: %w", r.Node.IPv6, err)
|
|
}
|
|
|
|
if !ip.Is6() {
|
|
return "", 0, xerrors.Errorf("provided node ipv6 is not v6 %q: %w", r.Node.IPv6, err)
|
|
}
|
|
|
|
return ip.String(), port, nil
|
|
}
|
|
|
|
return "", 0, xerrors.New("no stun ips provided")
|
|
}
|
|
|
|
func (r *NodeReport) writeClientErr(clientID int, err error) {
|
|
r.mu.Lock()
|
|
r.ClientErrs[clientID] = append(r.ClientErrs[clientID], err.Error())
|
|
r.mu.Unlock()
|
|
}
|
|
|
|
func (r *NodeReport) derpClient(ctx context.Context, derpURL *url.URL) (*derphttp.Client, int, error) {
|
|
r.mu.Lock()
|
|
id := r.clientCounter
|
|
r.clientCounter++
|
|
r.ClientLogs = append(r.ClientLogs, []string{})
|
|
r.ClientErrs = append(r.ClientErrs, []string{})
|
|
r.mu.Unlock()
|
|
|
|
client, err := derphttp.NewClient(key.NewNode(), derpURL.String(), func(format string, args ...any) {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
msg := fmt.Sprintf(format, args...)
|
|
if strings.Contains(msg, "We'll use WebSockets on the next connection attempt") {
|
|
r.UsesWebsocket = true
|
|
}
|
|
r.ClientLogs[id] = append(r.ClientLogs[id], msg)
|
|
})
|
|
if err != nil {
|
|
err := xerrors.Errorf("create derp client: %w", err)
|
|
r.writeClientErr(id, err)
|
|
return nil, id, err
|
|
}
|
|
|
|
go func() {
|
|
<-ctx.Done()
|
|
_ = client.Close()
|
|
}()
|
|
|
|
i := 0
|
|
for ; i < 5; i++ {
|
|
err = client.Connect(ctx)
|
|
if err != nil {
|
|
r.writeClientErr(id, xerrors.Errorf("connect to derp: %w", err))
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
if i == 5 {
|
|
err := xerrors.Errorf("couldn't connect after 5 tries, last error: %w", err)
|
|
r.writeClientErr(id, xerrors.Errorf("couldn't connect after 5 tries, last error: %w", err))
|
|
return nil, id, err
|
|
}
|
|
|
|
return client, id, nil
|
|
}
|
|
|
|
func (r *NodeReport) recvData(client *derphttp.Client) (derp.ReceivedPacket, error) {
|
|
for {
|
|
msg, err := client.Recv()
|
|
if err != nil {
|
|
return derp.ReceivedPacket{}, err
|
|
}
|
|
|
|
switch msg := msg.(type) {
|
|
case derp.ReceivedPacket:
|
|
return msg, nil
|
|
case derp.ServerInfoMessage:
|
|
r.mu.Lock()
|
|
r.ServerInfo = msg
|
|
r.mu.Unlock()
|
|
default:
|
|
// Drop all others!
|
|
}
|
|
}
|
|
}
|
|
|
|
func convertError(err error) *string {
|
|
if err != nil {
|
|
return ptr.Ref(err.Error())
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func sortNodeReports(reports []*healthsdk.DERPNodeReport) {
|
|
slices.SortFunc(reports, func(a, b *healthsdk.DERPNodeReport) int {
|
|
return slice.Ascending(a.Node.Name, b.Node.Name)
|
|
})
|
|
}
|