mirror of https://github.com/coder/coder.git
fix: prevent single replica proxies from staying unhealthy (#12641)
In the peer healthcheck code, when an error pinging peers is detected we write a "replicaErr" string with the error reason. However, if there are no peer replicas to ping we returned early without setting the string to empty. This would cause replicas that had peers (which were failing) and then the peers left to permanently show an error until a new peer appeared. Also demotes DERP replica checking to a "warning" rather than an "error" which should prevent the primary from removing the proxy from the region map if DERP meshing is non-functional. This can happen without causing problems if the peer is shutting down so we don't want to disrupt everything if there isn't an issue.
This commit is contained in:
parent
8a2f38a746
commit
cf50461ab4
|
@ -449,8 +449,21 @@ func (s *Server) handleRegister(res wsproxysdk.RegisterWorkspaceProxyResponse) e
|
|||
}
|
||||
|
||||
func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
|
||||
ctx, cancel := context.WithTimeout(s.ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
errStr := pingSiblingReplicas(ctx, s.Logger, &s.replicaPingSingleflight, s.derpMeshTLSConfig, replicas)
|
||||
s.replicaErrMut.Lock()
|
||||
s.replicaErr = errStr
|
||||
s.replicaErrMut.Unlock()
|
||||
if s.Options.ReplicaErrCallback != nil {
|
||||
s.Options.ReplicaErrCallback(replicas, s.replicaErr)
|
||||
}
|
||||
}
|
||||
|
||||
func pingSiblingReplicas(ctx context.Context, logger slog.Logger, sf *singleflight.Group, derpMeshTLSConfig *tls.Config, replicas []codersdk.Replica) string {
|
||||
if len(replicas) == 0 {
|
||||
return
|
||||
return ""
|
||||
}
|
||||
|
||||
// Avoid pinging multiple times at once if the list hasn't changed.
|
||||
|
@ -462,18 +475,11 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
|
|||
singleflightStr := strings.Join(relayURLs, " ") // URLs can't contain spaces.
|
||||
|
||||
//nolint:dogsled
|
||||
_, _, _ = s.replicaPingSingleflight.Do(singleflightStr, func() (any, error) {
|
||||
const (
|
||||
perReplicaTimeout = 3 * time.Second
|
||||
fullTimeout = 10 * time.Second
|
||||
)
|
||||
ctx, cancel := context.WithTimeout(s.ctx, fullTimeout)
|
||||
defer cancel()
|
||||
|
||||
errStrInterface, _, _ := sf.Do(singleflightStr, func() (any, error) {
|
||||
client := http.Client{
|
||||
Timeout: perReplicaTimeout,
|
||||
Timeout: 3 * time.Second,
|
||||
Transport: &http.Transport{
|
||||
TLSClientConfig: s.derpMeshTLSConfig,
|
||||
TLSClientConfig: derpMeshTLSConfig,
|
||||
DisableKeepAlives: true,
|
||||
},
|
||||
}
|
||||
|
@ -485,7 +491,7 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
|
|||
err := replicasync.PingPeerReplica(ctx, client, peer.RelayAddress)
|
||||
if err != nil {
|
||||
errs <- xerrors.Errorf("ping sibling replica %s (%s): %w", peer.Hostname, peer.RelayAddress, err)
|
||||
s.Logger.Warn(ctx, "failed to ping sibling replica, this could happen if the replica has shutdown",
|
||||
logger.Warn(ctx, "failed to ping sibling replica, this could happen if the replica has shutdown",
|
||||
slog.F("replica_hostname", peer.Hostname),
|
||||
slog.F("replica_relay_address", peer.RelayAddress),
|
||||
slog.Error(err),
|
||||
|
@ -504,20 +510,14 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
|
|||
}
|
||||
}
|
||||
|
||||
s.replicaErrMut.Lock()
|
||||
defer s.replicaErrMut.Unlock()
|
||||
s.replicaErr = ""
|
||||
if len(replicaErrs) > 0 {
|
||||
s.replicaErr = fmt.Sprintf("Failed to dial peers: %s", strings.Join(replicaErrs, ", "))
|
||||
if len(replicaErrs) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
if s.Options.ReplicaErrCallback != nil {
|
||||
s.Options.ReplicaErrCallback(replicas, s.replicaErr)
|
||||
}
|
||||
|
||||
//nolint:nilnil // we don't actually use the return value of the
|
||||
// singleflight here
|
||||
return nil, nil
|
||||
return fmt.Sprintf("Failed to dial peers: %s", strings.Join(replicaErrs, ", ")), nil
|
||||
})
|
||||
|
||||
//nolint:forcetypeassert
|
||||
return errStrInterface.(string)
|
||||
}
|
||||
|
||||
func (s *Server) handleRegisterFailure(err error) {
|
||||
|
@ -590,7 +590,8 @@ func (s *Server) healthReport(rw http.ResponseWriter, r *http.Request) {
|
|||
|
||||
s.replicaErrMut.Lock()
|
||||
if s.replicaErr != "" {
|
||||
report.Errors = append(report.Errors, "High availability networking: it appears you are running more than one replica of the proxy, but the replicas are unable to establish a mesh for networking: "+s.replicaErr)
|
||||
report.Warnings = append(report.Warnings,
|
||||
"High availability networking: it appears you are running more than one replica of the proxy, but the replicas are unable to establish a mesh for networking: "+s.replicaErr)
|
||||
}
|
||||
s.replicaErrMut.Unlock()
|
||||
|
||||
|
|
|
@ -563,7 +563,7 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
|
|||
return proxyRes
|
||||
}
|
||||
|
||||
registerBrokenProxy := func(ctx context.Context, t *testing.T, primaryAccessURL *url.URL, accessURL, token string) {
|
||||
registerBrokenProxy := func(ctx context.Context, t *testing.T, primaryAccessURL *url.URL, accessURL, token string) uuid.UUID {
|
||||
t.Helper()
|
||||
// Create a HTTP server that always replies with 500.
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
|
@ -574,21 +574,23 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
|
|||
// Register a proxy.
|
||||
wsproxyClient := wsproxysdk.New(primaryAccessURL)
|
||||
wsproxyClient.SetSessionToken(token)
|
||||
|
||||
hostname, err := cryptorand.String(6)
|
||||
require.NoError(t, err)
|
||||
replicaID := uuid.New()
|
||||
_, err = wsproxyClient.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{
|
||||
AccessURL: accessURL,
|
||||
WildcardHostname: "",
|
||||
DerpEnabled: true,
|
||||
DerpOnly: false,
|
||||
ReplicaID: uuid.New(),
|
||||
ReplicaID: replicaID,
|
||||
ReplicaHostname: hostname,
|
||||
ReplicaError: "",
|
||||
ReplicaRelayAddress: srv.URL,
|
||||
Version: buildinfo.Version(),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
return replicaID
|
||||
}
|
||||
|
||||
t.Run("ProbeOK", func(t *testing.T) {
|
||||
|
@ -781,8 +783,120 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
|
|||
resp.Body.Close()
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Len(t, respJSON.Errors, 1, "proxy is healthy")
|
||||
require.Contains(t, respJSON.Errors[0], "High availability networking")
|
||||
require.Len(t, respJSON.Warnings, 1, "proxy is healthy")
|
||||
require.Contains(t, respJSON.Warnings[0], "High availability networking")
|
||||
})
|
||||
|
||||
// This test catches a regression we detected on dogfood which caused
|
||||
// proxies to remain unhealthy after a mesh failure if they dropped to zero
|
||||
// siblings after the failure.
|
||||
t.Run("HealthyZero", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
deploymentValues := coderdtest.DeploymentValues(t)
|
||||
deploymentValues.Experiments = []string{
|
||||
"*",
|
||||
}
|
||||
|
||||
client, closer, api, _ := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
|
||||
Options: &coderdtest.Options{
|
||||
DeploymentValues: deploymentValues,
|
||||
AppHostname: "*.primary.test.coder.com",
|
||||
IncludeProvisionerDaemon: true,
|
||||
RealIPConfig: &httpmw.RealIPConfig{
|
||||
TrustedOrigins: []*net.IPNet{{
|
||||
IP: net.ParseIP("127.0.0.1"),
|
||||
Mask: net.CIDRMask(8, 32),
|
||||
}},
|
||||
TrustedHeaders: []string{
|
||||
"CF-Connecting-IP",
|
||||
},
|
||||
},
|
||||
},
|
||||
LicenseOptions: &coderdenttest.LicenseOptions{
|
||||
Features: license.Features{
|
||||
codersdk.FeatureWorkspaceProxy: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
t.Cleanup(func() {
|
||||
_ = closer.Close()
|
||||
})
|
||||
|
||||
proxyURL, err := url.Parse("https://proxy2.test.coder.com")
|
||||
require.NoError(t, err)
|
||||
|
||||
// Create 1 real proxy replica.
|
||||
replicaPingErr := make(chan string, 4)
|
||||
proxy := coderdenttest.NewWorkspaceProxyReplica(t, api, client, &coderdenttest.ProxyOptions{
|
||||
Name: "proxy-2",
|
||||
ProxyURL: proxyURL,
|
||||
ReplicaPingCallback: func(replicas []codersdk.Replica, err string) {
|
||||
replicaPingErr <- err
|
||||
},
|
||||
})
|
||||
|
||||
ctx := testutil.Context(t, testutil.WaitLong)
|
||||
otherReplicaID := registerBrokenProxy(ctx, t, api.AccessURL, proxyURL.String(), proxy.Options.ProxySessionToken)
|
||||
|
||||
// Force the proxy to re-register immediately.
|
||||
err = proxy.RegisterNow()
|
||||
require.NoError(t, err, "failed to force proxy to re-register")
|
||||
|
||||
// Wait for the ping to fail.
|
||||
for {
|
||||
replicaErr := testutil.RequireRecvCtx(ctx, t, replicaPingErr)
|
||||
t.Log("replica ping error:", replicaErr)
|
||||
if replicaErr != "" {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// GET /healthz-report
|
||||
u := proxy.ServerURL.ResolveReference(&url.URL{Path: "/healthz-report"})
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
||||
require.NoError(t, err)
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
require.NoError(t, err)
|
||||
var respJSON codersdk.ProxyHealthReport
|
||||
err = json.NewDecoder(resp.Body).Decode(&respJSON)
|
||||
resp.Body.Close()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, respJSON.Warnings, 1, "proxy is healthy")
|
||||
require.Contains(t, respJSON.Warnings[0], "High availability networking")
|
||||
|
||||
// Deregister the other replica.
|
||||
wsproxyClient := wsproxysdk.New(api.AccessURL)
|
||||
wsproxyClient.SetSessionToken(proxy.Options.ProxySessionToken)
|
||||
err = wsproxyClient.DeregisterWorkspaceProxy(ctx, wsproxysdk.DeregisterWorkspaceProxyRequest{
|
||||
ReplicaID: otherReplicaID,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Force the proxy to re-register immediately.
|
||||
err = proxy.RegisterNow()
|
||||
require.NoError(t, err, "failed to force proxy to re-register")
|
||||
|
||||
// Wait for the ping to be skipped.
|
||||
for {
|
||||
replicaErr := testutil.RequireRecvCtx(ctx, t, replicaPingErr)
|
||||
t.Log("replica ping error:", replicaErr)
|
||||
// Should be empty because there are no more peers. This was where
|
||||
// the regression was.
|
||||
if replicaErr == "" {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// GET /healthz-report
|
||||
req, err = http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
||||
require.NoError(t, err)
|
||||
resp, err = http.DefaultClient.Do(req)
|
||||
require.NoError(t, err)
|
||||
err = json.NewDecoder(resp.Body).Decode(&respJSON)
|
||||
resp.Body.Close()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, respJSON.Warnings, 0, "proxy is unhealthy")
|
||||
})
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue