coder/enterprise/tailnet/pgcoord.go

package tailnet

import (
"context"
"database/sql"
"encoding/json"
"fmt"
"net"
"net/http"
"net/netip"
"strings"
"sync"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/google/uuid"
"golang.org/x/exp/slices"
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/util/slice"
agpl "github.com/coder/coder/v2/tailnet"
)
const (
EventHeartbeats = "tailnet_coordinator_heartbeat"
eventClientUpdate = "tailnet_client_update"
eventAgentUpdate = "tailnet_agent_update"
HeartbeatPeriod = time.Second * 2
MissedHeartbeats = 3
numQuerierWorkers = 10
numBinderWorkers = 10
numSubscriberWorkers = 10
dbMaxBackoff = 10 * time.Second
cleanupPeriod = time.Hour
)
// TODO: add subscriber to this graphic
// pgCoord is a postgres-backed coordinator
//
// ┌────────┐       ┌────────┐        ┌───────┐
// │ connIO ├───────► binder ├────────► store │
// └───▲────┘       │        │        │       │
//     │            └────────┘ ┌──────┤       │
//     │                       │      └───────┘
//     │                       │
//     │            ┌──────────▼┐     ┌────────┐
//     │            │           │     │        │
//     └────────────┤ querier   ◄─────┤ pubsub │
//                  │           │     │        │
//                  └───────────┘     └────────┘
//
// Each incoming connection (websocket) from a client or agent is wrapped in a connIO, which handles reading from and
// writing to it. Node updates from a connIO are sent to the binder, which writes them to the database.Store. The querier
// is responsible for querying the store for the nodes the connection needs (e.g. for a client, the corresponding
// agent). The querier receives pubsub notifications about changes, which trigger queries for the latest state.
//
// The querier also sends the coordinator's heartbeat, and monitors the heartbeats of other coordinators. When
// heartbeats cease for a coordinator, it stops using any nodes discovered from that coordinator and pushes an update
// to affected connIOs.
//
// This package uses the term "binding" to mean the act of registering an association between some connection (client
// or agent) and an agpl.Node. It uses the term "mapping" to mean the act of determining the nodes that the connection
// needs to receive (e.g. for a client, the node bound to the corresponding agent, or for an agent, the nodes bound to
// all clients of the agent).
type pgCoord struct {
ctx context.Context
logger slog.Logger
pubsub pubsub.Pubsub
store database.Store
bindings chan binding
newConnections chan agpl.Queue
closeConnections chan agpl.Queue
subscriberCh chan subscribe
querierSubCh chan subscribe
id uuid.UUID
cancel context.CancelFunc
closeOnce sync.Once
closed chan struct{}
binder *binder
subscriber *subscriber
querier *querier
}
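// pgCoordSubject is the RBAC subject the coordinator acts as when talking to the database: it is
// granted every action on the tailnet coordinator resource and nothing else.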
var pgCoordSubject = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
{
Name: "tailnetcoordinator",
DisplayName: "Tailnet Coordinator",
Site: rbac.Permissions(map[string][]rbac.Action{
rbac.ResourceTailnetCoordinator.Type: {rbac.WildcardSymbol},
}),
Org: map[string][]rbac.Permission{},
User: []rbac.Permission{},
},
}),
Scope: rbac.ScopeAll,
}.WithCachedASTValue()
// NewPGCoord creates a high-availability coordinator that stores state in the PostgreSQL database and
// receives notifications of updates via the pubsub.
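//
// A minimal usage sketch (assuming a slog.Logger, pubsub.Pubsub, and database.Store have already
// been constructed elsewhere):
//
//	coord, err := NewPGCoord(ctx, logger, ps, store)
//	if err != nil {
//		return err
//	}
//	defer coord.Close()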
func NewPGCoord(ctx context.Context, logger slog.Logger, ps pubsub.Pubsub, store database.Store) (agpl.Coordinator, error) {
ctx, cancel := context.WithCancel(dbauthz.As(ctx, pgCoordSubject))
id := uuid.New()
logger = logger.Named("pgcoord").With(slog.F("coordinator_id", id))
bCh := make(chan binding)
// used for opening connections
cCh := make(chan agpl.Queue)
// used for closing connections
ccCh := make(chan agpl.Queue)
// for communicating subscriptions with the subscriber
sCh := make(chan subscribe)
// for communicating subscriptions with the querier
qsCh := make(chan subscribe)
// signals when first heartbeat has been sent, so it's safe to start binding.
fHB := make(chan struct{})
c := &pgCoord{
ctx: ctx,
cancel: cancel,
logger: logger,
pubsub: ps,
store: store,
binder: newBinder(ctx, logger, id, store, bCh, fHB),
bindings: bCh,
newConnections: cCh,
closeConnections: ccCh,
subscriber: newSubscriber(ctx, logger, id, store, sCh, fHB),
subscriberCh: sCh,
querierSubCh: qsCh,
id: id,
querier: newQuerier(ctx, logger, id, ps, store, id, cCh, ccCh, qsCh, numQuerierWorkers, fHB),
closed: make(chan struct{}),
}
logger.Info(ctx, "starting coordinator")
return c, nil
}
// This is copied from codersdk because importing it here would cause an import
// cycle. This is just temporary until wsconncache is phased out.
var legacyAgentIP = netip.MustParseAddr("fd7a:115c:a1e0:49d6:b259:b7ac:b1b2:48f4")
func (c *pgCoord) ServeMultiAgent(id uuid.UUID) agpl.MultiAgentConn {
ma := (&agpl.MultiAgent{
ID: id,
AgentIsLegacyFunc: func(agentID uuid.UUID) bool {
if n := c.Node(agentID); n == nil {
// If we don't have the node at all assume it's legacy for
// safety.
return true
} else if len(n.Addresses) > 0 && n.Addresses[0].Addr() == legacyAgentIP {
// An agent is determined to be "legacy" if its first IP is the
// legacy IP. Agents with only the legacy IP aren't compatible
// with single_tailnet and must be routed through wsconncache.
return true
} else {
return false
}
},
OnSubscribe: func(enq agpl.Queue, agent uuid.UUID) (*agpl.Node, error) {
err := c.addSubscription(enq, agent)
return c.Node(agent), err
},
OnUnsubscribe: c.removeSubscription,
OnNodeUpdate: func(id uuid.UUID, node *agpl.Node) error {
return sendCtx(c.ctx, c.bindings, binding{
bKey: bKey{id, agpl.QueueKindClient},
node: node,
})
},
OnRemove: func(enq agpl.Queue) {
_ = sendCtx(c.ctx, c.bindings, binding{
bKey: bKey{
id: enq.UniqueID(),
kind: enq.Kind(),
},
})
_ = sendCtx(c.ctx, c.subscriberCh, subscribe{
sKey: sKey{clientID: id},
q: enq,
active: false,
})
_ = sendCtx(c.ctx, c.closeConnections, enq)
},
}).Init()
if err := sendCtx(c.ctx, c.newConnections, agpl.Queue(ma)); err != nil {
// If we can't successfully send the multiagent, that means the
// coordinator is shutting down. In this case, just return a closed
// multiagent.
ma.CoordinatorClose()
}
return ma
}
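// addSubscription registers a subscription of the queue's client to the given agent with both the
// subscriber (which persists it to the database) and the querier (which starts delivering mappings).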
func (c *pgCoord) addSubscription(q agpl.Queue, agentID uuid.UUID) error {
sub := subscribe{
sKey: sKey{
clientID: q.UniqueID(),
agentID: agentID,
},
q: q,
active: true,
}
if err := sendCtx(c.ctx, c.subscriberCh, sub); err != nil {
return err
}
if err := sendCtx(c.ctx, c.querierSubCh, sub); err != nil {
// There's no need to clean up the sub sent to the subscriber if this
// fails, since it means the entire coordinator is being torn down.
return err
}
return nil
}
func (c *pgCoord) removeSubscription(q agpl.Queue, agentID uuid.UUID) error {
sub := subscribe{
sKey: sKey{
clientID: q.UniqueID(),
agentID: agentID,
},
q: q,
active: false,
}
if err := sendCtx(c.ctx, c.subscriberCh, sub); err != nil {
return err
}
if err := sendCtx(c.ctx, c.querierSubCh, sub); err != nil {
// There's no need to clean up the sub sent to the subscriber if this
// fails, since it means the entire coordinator is being torn down.
return err
}
return nil
}
func (c *pgCoord) Node(id uuid.UUID) *agpl.Node {
// In production, we only ever get this request for an agent.
// We're going to directly query the database, since we would only have the agent mapping stored locally if we had
// a client of that agent connected, which isn't always the case.
mappings, err := c.querier.queryAgent(id)
if err != nil {
c.logger.Error(c.ctx, "failed to query agents", slog.Error(err))
}
mappings = c.querier.heartbeats.filter(mappings)
var bestT time.Time
var bestN *agpl.Node
for _, m := range mappings {
if m.updatedAt.After(bestT) {
bestN = m.node
bestT = m.updatedAt
}
}
return bestN
}
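// ServeClient handles a client connection that wants to reach a single agent. It registers the
// connection and subscription, then blocks until the connection's context is done.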
func (c *pgCoord) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error {
defer func() {
err := conn.Close()
if err != nil {
c.logger.Debug(c.ctx, "closing client connection",
slog.F("client_id", id),
slog.F("agent_id", agent),
slog.Error(err))
}
}()
cIO := newConnIO(c.ctx, c.logger, c.bindings, conn, id, id.String(), agpl.QueueKindClient)
if err := sendCtx(c.ctx, c.newConnections, agpl.Queue(cIO)); err != nil {
// can only be a context error, no need to log here.
return err
}
defer func() { _ = sendCtx(c.ctx, c.closeConnections, agpl.Queue(cIO)) }()
if err := c.addSubscription(cIO, agent); err != nil {
return err
}
defer func() { _ = c.removeSubscription(cIO, agent) }()
<-cIO.ctx.Done()
return nil
}
func (c *pgCoord) ServeAgent(conn net.Conn, id uuid.UUID, name string) error {
defer func() {
err := conn.Close()
if err != nil {
c.logger.Debug(c.ctx, "closing agent connection",
slog.F("agent_id", id),
slog.Error(err))
}
}()
logger := c.logger.With(slog.F("name", name))
cIO := newConnIO(c.ctx, logger, c.bindings, conn, id, name, agpl.QueueKindAgent)
if err := sendCtx(c.ctx, c.newConnections, agpl.Queue(cIO)); err != nil {
// can only be a context error, no need to log here.
return err
}
defer func() { _ = sendCtx(c.ctx, c.closeConnections, agpl.Queue(cIO)) }()
<-cIO.ctx.Done()
return nil
}
func (c *pgCoord) Close() error {
c.logger.Info(c.ctx, "closing coordinator")
c.cancel()
c.closeOnce.Do(func() { close(c.closed) })
return nil
}
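// sendCtx attempts to send a value on the channel, giving up and returning the context's error if
// the context is canceled first.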
func sendCtx[A any](ctx context.Context, c chan<- A, a A) (err error) {
select {
case <-ctx.Done():
return ctx.Err()
case c <- a:
return nil
}
}
type sKey struct {
clientID uuid.UUID
agentID uuid.UUID
}
type subscribe struct {
sKey
q agpl.Queue
// whether the subscription should be active. if true, the subscription is
// added. if false, the subscription is removed.
active bool
}
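// subscriber keeps the latest subscription state for each (client, agent) pair and writes it to the
// database via a pool of workers, retrying with backoff on failure.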
type subscriber struct {
ctx context.Context
logger slog.Logger
coordinatorID uuid.UUID
store database.Store
subscriptions <-chan subscribe
mu sync.Mutex
// map[clientID]map[agentID]subscribe
latest map[uuid.UUID]map[uuid.UUID]subscribe
workQ *workQ[sKey]
}
func newSubscriber(ctx context.Context,
logger slog.Logger,
id uuid.UUID,
store database.Store,
subscriptions <-chan subscribe,
startWorkers <-chan struct{},
) *subscriber {
s := &subscriber{
ctx: ctx,
logger: logger,
coordinatorID: id,
store: store,
subscriptions: subscriptions,
latest: make(map[uuid.UUID]map[uuid.UUID]subscribe),
workQ: newWorkQ[sKey](ctx),
}
go s.handleSubscriptions()
go func() {
<-startWorkers
for i := 0; i < numSubscriberWorkers; i++ {
go s.worker()
}
}()
return s
}
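// handleSubscriptions reads subscription requests off the channel, records the latest state, and
// enqueues the key so a worker will write it to the database.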
func (s *subscriber) handleSubscriptions() {
for {
select {
case <-s.ctx.Done():
s.logger.Debug(s.ctx, "subscriber exiting", slog.Error(s.ctx.Err()))
return
case sub := <-s.subscriptions:
s.storeSubscription(sub)
s.workQ.enqueue(sub.sKey)
}
}
}
func (s *subscriber) worker() {
eb := backoff.NewExponentialBackOff()
eb.MaxElapsedTime = 0 // retry indefinitely
eb.MaxInterval = dbMaxBackoff
bkoff := backoff.WithContext(eb, s.ctx)
for {
bk, err := s.workQ.acquire()
if err != nil {
// context expired
return
}
err = backoff.Retry(func() error {
bnd := s.retrieveSubscription(bk)
return s.writeOne(bnd)
}, bkoff)
if err != nil {
bkoff.Reset()
}
s.workQ.done(bk)
}
}
func (s *subscriber) storeSubscription(sub subscribe) {
s.mu.Lock()
defer s.mu.Unlock()
if sub.active {
if _, ok := s.latest[sub.clientID]; !ok {
s.latest[sub.clientID] = map[uuid.UUID]subscribe{}
}
s.latest[sub.clientID][sub.agentID] = sub
} else {
// If the agentID is nil, clean up all of the client's subscriptions.
if sub.agentID == uuid.Nil {
delete(s.latest, sub.clientID)
} else {
delete(s.latest[sub.clientID], sub.agentID)
// clean up the subscription map if all the subscriptions are gone.
if len(s.latest[sub.clientID]) == 0 {
delete(s.latest, sub.clientID)
}
}
}
}
// retrieveSubscription gets the latest subscription state for a key.
func (s *subscriber) retrieveSubscription(sk sKey) subscribe {
s.mu.Lock()
defer s.mu.Unlock()
agents, ok := s.latest[sk.clientID]
if !ok {
return subscribe{
sKey: sk,
active: false,
}
}
sub, ok := agents[sk.agentID]
if !ok {
return subscribe{
sKey: sk,
active: false,
}
}
return sub
}
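// writeOne writes a single subscription state to the database: a nil agent ID deletes all of the
// client's subscriptions, an active subscription is upserted, and an inactive one is deleted.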
func (s *subscriber) writeOne(sub subscribe) error {
var err error
switch {
case sub.agentID == uuid.Nil:
err = s.store.DeleteAllTailnetClientSubscriptions(s.ctx, database.DeleteAllTailnetClientSubscriptionsParams{
ClientID: sub.clientID,
CoordinatorID: s.coordinatorID,
})
s.logger.Debug(s.ctx, "deleted all client subscriptions",
slog.F("client_id", sub.clientID),
slog.Error(err),
)
case sub.active:
err = s.store.UpsertTailnetClientSubscription(s.ctx, database.UpsertTailnetClientSubscriptionParams{
ClientID: sub.clientID,
CoordinatorID: s.coordinatorID,
AgentID: sub.agentID,
})
s.logger.Debug(s.ctx, "upserted client subscription",
slog.F("client_id", sub.clientID),
slog.F("agent_id", sub.agentID),
slog.Error(err),
)
case !sub.active:
err = s.store.DeleteTailnetClientSubscription(s.ctx, database.DeleteTailnetClientSubscriptionParams{
ClientID: sub.clientID,
CoordinatorID: s.coordinatorID,
AgentID: sub.agentID,
})
s.logger.Debug(s.ctx, "deleted client subscription",
slog.F("client_id", sub.clientID),
slog.F("agent_id", sub.agentID),
slog.Error(err),
)
default:
panic("unreachable")
}
if err != nil && !database.IsQueryCanceledError(err) {
s.logger.Error(s.ctx, "write subscription to database",
slog.F("client_id", sub.clientID),
slog.F("agent_id", sub.agentID),
slog.F("active", sub.active),
slog.Error(err))
}
return err
}
// bKey, or "binding key" identifies a client or agent in a binding. Agents and
// clients are differentiated by the kind field.
type bKey struct {
id uuid.UUID
kind agpl.QueueKind
}
// binding represents an association between a client or agent and a Node.
type binding struct {
bKey
node *agpl.Node
}
func (b *binding) isAgent() bool { return b.kind == agpl.QueueKindAgent }
func (b *binding) isClient() bool { return b.kind == agpl.QueueKindClient }
// binder reads node bindings from the channel and writes them to the database. It handles retries with a backoff.
type binder struct {
ctx context.Context
logger slog.Logger
coordinatorID uuid.UUID
store database.Store
bindings <-chan binding
mu sync.Mutex
latest map[bKey]binding
workQ *workQ[bKey]
}
func newBinder(ctx context.Context,
logger slog.Logger,
id uuid.UUID,
store database.Store,
bindings <-chan binding,
startWorkers <-chan struct{},
) *binder {
b := &binder{
ctx: ctx,
logger: logger,
coordinatorID: id,
store: store,
bindings: bindings,
latest: make(map[bKey]binding),
workQ: newWorkQ[bKey](ctx),
}
go b.handleBindings()
go func() {
<-startWorkers
for i := 0; i < numBinderWorkers; i++ {
go b.worker()
}
}()
return b
}
func (b *binder) handleBindings() {
for {
select {
case <-b.ctx.Done():
b.logger.Debug(b.ctx, "binder exiting", slog.Error(b.ctx.Err()))
return
case bnd := <-b.bindings:
b.storeBinding(bnd)
b.workQ.enqueue(bnd.bKey)
}
}
}
func (b *binder) worker() {
eb := backoff.NewExponentialBackOff()
eb.MaxElapsedTime = 0 // retry indefinitely
eb.MaxInterval = dbMaxBackoff
bkoff := backoff.WithContext(eb, b.ctx)
for {
bk, err := b.workQ.acquire()
if err != nil {
// context expired
return
}
err = backoff.Retry(func() error {
bnd := b.retrieveBinding(bk)
return b.writeOne(bnd)
}, bkoff)
if err != nil {
bkoff.Reset()
}
b.workQ.done(bk)
}
}
func (b *binder) writeOne(bnd binding) error {
var nodeRaw json.RawMessage
var err error
if bnd.node != nil {
nodeRaw, err = json.Marshal(*bnd.node)
if err != nil {
// this is very bad news, but it should never happen because the node was unmarshalled by this process
// earlier.
b.logger.Error(b.ctx, "failed to marshal node", slog.Error(err))
return err
}
}
switch {
case bnd.isAgent() && len(nodeRaw) > 0:
_, err = b.store.UpsertTailnetAgent(b.ctx, database.UpsertTailnetAgentParams{
ID: bnd.id,
CoordinatorID: b.coordinatorID,
Node: nodeRaw,
})
b.logger.Debug(b.ctx, "upserted agent binding",
slog.F("agent_id", bnd.id), slog.F("node", nodeRaw), slog.Error(err))
case bnd.isAgent() && len(nodeRaw) == 0:
_, err = b.store.DeleteTailnetAgent(b.ctx, database.DeleteTailnetAgentParams{
ID: bnd.id,
CoordinatorID: b.coordinatorID,
})
b.logger.Debug(b.ctx, "deleted agent binding",
slog.F("agent_id", bnd.id), slog.Error(err))
if xerrors.Is(err, sql.ErrNoRows) {
// treat deletes as idempotent
err = nil
}
case bnd.isClient() && len(nodeRaw) > 0:
_, err = b.store.UpsertTailnetClient(b.ctx, database.UpsertTailnetClientParams{
ID: bnd.id,
CoordinatorID: b.coordinatorID,
Node: nodeRaw,
})
b.logger.Debug(b.ctx, "upserted client binding",
slog.F("client_id", bnd.id),
slog.F("node", nodeRaw), slog.Error(err))
case bnd.isClient() && len(nodeRaw) == 0:
_, err = b.store.DeleteTailnetClient(b.ctx, database.DeleteTailnetClientParams{
ID: bnd.id,
CoordinatorID: b.coordinatorID,
})
b.logger.Debug(b.ctx, "deleted client binding",
slog.F("client_id", bnd.id))
if xerrors.Is(err, sql.ErrNoRows) {
// treat deletes as idempotent
err = nil
}
default:
panic("unhittable")
}
if err != nil && !database.IsQueryCanceledError(err) {
b.logger.Error(b.ctx, "failed to write binding to database",
slog.F("binding_id", bnd.id),
slog.F("kind", bnd.kind),
slog.F("node", string(nodeRaw)),
slog.Error(err))
}
return err
}
// storeBinding stores the latest binding, where we interpret node == nil as removing the binding. This keeps the map
// from growing without bound.
func (b *binder) storeBinding(bnd binding) {
b.mu.Lock()
defer b.mu.Unlock()
if bnd.node != nil {
b.latest[bnd.bKey] = bnd
} else {
// nil node is interpreted as removing binding
delete(b.latest, bnd.bKey)
}
}
// retrieveBinding gets the latest binding for a key.
func (b *binder) retrieveBinding(bk bKey) binding {
b.mu.Lock()
defer b.mu.Unlock()
bnd, ok := b.latest[bk]
if !ok {
bnd = binding{
bKey: bk,
node: nil,
}
}
return bnd
}
// mapper tracks a single client or agent ID, and fans out updates to that ID->node mapping to every local connection
// that needs it.
type mapper struct {
ctx context.Context
logger slog.Logger
add chan agpl.Queue
del chan agpl.Queue
// reads from this channel trigger sending latest nodes to
// all connections. It is used when coordinators are added
// or removed
update chan struct{}
mappings chan []mapping
conns map[bKey]agpl.Queue
latest []mapping
heartbeats *heartbeats
}
func newMapper(ctx context.Context, logger slog.Logger, mk mKey, h *heartbeats) *mapper {
logger = logger.With(
slog.F("agent_id", mk.agent),
slog.F("kind", mk.kind),
)
m := &mapper{
ctx: ctx,
logger: logger,
add: make(chan agpl.Queue),
del: make(chan agpl.Queue),
update: make(chan struct{}),
conns: make(map[bKey]agpl.Queue),
mappings: make(chan []mapping),
heartbeats: h,
}
go m.run()
return m
}
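// run is the mapper's event loop. It tracks the local connections interested in this key and pushes
// the best set of nodes to them whenever a connection is added, new mappings arrive, or a refresh is
// triggered via the update channel.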
func (m *mapper) run() {
for {
select {
case <-m.ctx.Done():
return
case c := <-m.add:
m.conns[bKey{id: c.UniqueID(), kind: c.Kind()}] = c
nodes := m.mappingsToNodes(m.latest)
if len(nodes) == 0 {
m.logger.Debug(m.ctx, "skipping 0 length node update")
continue
}
if err := c.Enqueue(nodes); err != nil {
m.logger.Error(m.ctx, "failed to enqueue node update", slog.Error(err))
}
case c := <-m.del:
delete(m.conns, bKey{id: c.UniqueID(), kind: c.Kind()})
case mappings := <-m.mappings:
m.latest = mappings
nodes := m.mappingsToNodes(mappings)
if len(nodes) == 0 {
m.logger.Debug(m.ctx, "skipping 0 length node update")
continue
}
for _, conn := range m.conns {
if err := conn.Enqueue(nodes); err != nil {
m.logger.Error(m.ctx, "failed to enqueue node update", slog.Error(err))
}
}
case <-m.update:
nodes := m.mappingsToNodes(m.latest)
if len(nodes) == 0 {
m.logger.Debug(m.ctx, "skipping 0 length node update")
continue
}
for _, conn := range m.conns {
if err := conn.Enqueue(nodes); err != nil {
m.logger.Error(m.ctx, "failed to enqueue triggered node update", slog.Error(err))
}
}
}
}
}
// mappingsToNodes takes a set of mappings and resolves the best set of nodes. We may get several mappings for a
// particular connection, from different coordinators in the distributed system. Furthermore, some coordinators
// might be considered invalid on account of missing heartbeats. We take the most recent mapping from a valid
// coordinator as the "best" mapping.
func (m *mapper) mappingsToNodes(mappings []mapping) []*agpl.Node {
mappings = m.heartbeats.filter(mappings)
best := make(map[bKey]mapping, len(mappings))
for _, m := range mappings {
var bk bKey
if m.client == uuid.Nil {
bk = bKey{id: m.agent, kind: agpl.QueueKindAgent}
} else {
bk = bKey{id: m.client, kind: agpl.QueueKindClient}
}
bestM, ok := best[bk]
if !ok || m.updatedAt.After(bestM.updatedAt) {
best[bk] = m
}
}
nodes := make([]*agpl.Node, 0, len(best))
for _, m := range best {
nodes = append(nodes, m.node)
}
return nodes
}
// querier is responsible for monitoring pubsub notifications and querying the database for the mappings that all
// connected clients and agents need. It also checks heartbeats and withdraws mappings from coordinators that have
// failed heartbeats.
type querier struct {
ctx context.Context
logger slog.Logger
coordinatorID uuid.UUID
pubsub pubsub.Pubsub
store database.Store
newConnections chan agpl.Queue
closeConnections chan agpl.Queue
subscriptions chan subscribe
workQ *workQ[mKey]
heartbeats *heartbeats
updates <-chan hbUpdate
mu sync.Mutex
mappers map[mKey]*countedMapper
conns map[uuid.UUID]agpl.Queue
// clientSubscriptions maps client ids to the agent ids they're subscribed to.
// map[client_id]map[agent_id]
clientSubscriptions map[uuid.UUID]map[uuid.UUID]struct{}
healthy bool
}
type countedMapper struct {
*mapper
count int
cancel context.CancelFunc
}
func newQuerier(ctx context.Context,
logger slog.Logger,
coordinatorID uuid.UUID,
ps pubsub.Pubsub,
store database.Store,
self uuid.UUID,
newConnections chan agpl.Queue,
closeConnections chan agpl.Queue,
subscriptions chan subscribe,
numWorkers int,
firstHeartbeat chan struct{},
) *querier {
updates := make(chan hbUpdate)
q := &querier{
ctx: ctx,
logger: logger.Named("querier"),
coordinatorID: coordinatorID,
pubsub: ps,
store: store,
newConnections: newConnections,
closeConnections: closeConnections,
subscriptions: subscriptions,
workQ: newWorkQ[mKey](ctx),
heartbeats: newHeartbeats(ctx, logger, ps, store, self, updates, firstHeartbeat),
mappers: make(map[mKey]*countedMapper),
conns: make(map[uuid.UUID]agpl.Queue),
updates: updates,
clientSubscriptions: make(map[uuid.UUID]map[uuid.UUID]struct{}),
healthy: true, // assume we start healthy
}
q.subscribe()
go func() {
<-firstHeartbeat
go q.handleIncoming()
for i := 0; i < numWorkers; i++ {
go q.worker()
}
go q.handleUpdates()
}()
return q
}
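// handleIncoming dispatches new connections, closed connections, and subscription changes received
// from the pgCoord to the appropriate querier handlers.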
func (q *querier) handleIncoming() {
for {
select {
case <-q.ctx.Done():
return
case c := <-q.newConnections:
switch c.Kind() {
case agpl.QueueKindAgent:
q.newAgentConn(c)
case agpl.QueueKindClient:
q.newClientConn(c)
default:
panic(fmt.Sprint("unreachable: invalid queue kind ", c.Kind()))
}
case c := <-q.closeConnections:
q.cleanupConn(c)
case sub := <-q.subscriptions:
if sub.active {
q.newClientSubscription(sub.q, sub.agentID)
} else {
q.removeClientSubscription(sub.q, sub.agentID)
}
}
}
}
func (q *querier) newAgentConn(c agpl.Queue) {
q.mu.Lock()
defer q.mu.Unlock()
if !q.healthy {
err := c.Close()
q.logger.Info(q.ctx, "closed incoming connection while unhealthy",
slog.Error(err),
slog.F("agent_id", c.UniqueID()),
)
return
}
mk := mKey{
agent: c.UniqueID(),
kind: c.Kind(),
}
cm, ok := q.mappers[mk]
if !ok {
ctx, cancel := context.WithCancel(q.ctx)
mpr := newMapper(ctx, q.logger, mk, q.heartbeats)
cm = &countedMapper{
mapper: mpr,
count: 0,
cancel: cancel,
}
q.mappers[mk] = cm
// we don't have any mapping state for this key yet
q.workQ.enqueue(mk)
}
if err := sendCtx(cm.ctx, cm.add, c); err != nil {
return
}
cm.count++
q.conns[c.UniqueID()] = c
}
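// newClientSubscription attaches the client queue to the mapper for the agent it subscribed to,
// creating the mapper and enqueueing an initial query if no mapper exists yet.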
func (q *querier) newClientSubscription(c agpl.Queue, agentID uuid.UUID) {
q.mu.Lock()
defer q.mu.Unlock()
if _, ok := q.clientSubscriptions[c.UniqueID()]; !ok {
q.clientSubscriptions[c.UniqueID()] = map[uuid.UUID]struct{}{}
}
mk := mKey{
agent: agentID,
kind: agpl.QueueKindClient,
}
cm, ok := q.mappers[mk]
if !ok {
ctx, cancel := context.WithCancel(q.ctx)
mpr := newMapper(ctx, q.logger, mk, q.heartbeats)
cm = &countedMapper{
mapper: mpr,
count: 0,
cancel: cancel,
}
q.mappers[mk] = cm
// we don't have any mapping state for this key yet
q.workQ.enqueue(mk)
}
if err := sendCtx(cm.ctx, cm.add, c); err != nil {
return
}
q.clientSubscriptions[c.UniqueID()][agentID] = struct{}{}
cm.count++
}
func (q *querier) removeClientSubscription(c agpl.Queue, agentID uuid.UUID) {
q.mu.Lock()
defer q.mu.Unlock()
// Allow duplicate unsubscribes. It's possible for cleanupConn to race with
// an external call to removeClientSubscription, so we just ensure the
// client subscription exists before attempting to remove it.
if _, ok := q.clientSubscriptions[c.UniqueID()][agentID]; !ok {
return
}
mk := mKey{
agent: agentID,
kind: agpl.QueueKindClient,
}
cm := q.mappers[mk]
if err := sendCtx(cm.ctx, cm.del, c); err != nil {
return
}
delete(q.clientSubscriptions[c.UniqueID()], agentID)
cm.count--
if cm.count == 0 {
cm.cancel()
delete(q.mappers, mk)
}
if len(q.clientSubscriptions[c.UniqueID()]) == 0 {
delete(q.clientSubscriptions, c.UniqueID())
}
}
func (q *querier) newClientConn(c agpl.Queue) {
q.mu.Lock()
defer q.mu.Unlock()
if !q.healthy {
err := c.Close()
q.logger.Info(q.ctx, "closed incoming connection while unhealthy",
slog.Error(err),
slog.F("client_id", c.UniqueID()),
)
return
}
q.conns[c.UniqueID()] = c
}
func (q *querier) cleanupConn(c agpl.Queue) {
q.mu.Lock()
defer q.mu.Unlock()
delete(q.conns, c.UniqueID())
// Iterate over all subscriptions and remove them from the mappers.
for agentID := range q.clientSubscriptions[c.UniqueID()] {
mk := mKey{
agent: agentID,
kind: c.Kind(),
}
cm := q.mappers[mk]
if err := sendCtx(cm.ctx, cm.del, c); err != nil {
continue
}
cm.count--
if cm.count == 0 {
cm.cancel()
delete(q.mappers, mk)
}
}
delete(q.clientSubscriptions, c.UniqueID())
mk := mKey{
agent: c.UniqueID(),
kind: c.Kind(),
}
cm, ok := q.mappers[mk]
if !ok {
return
}
if err := sendCtx(cm.ctx, cm.del, c); err != nil {
return
}
cm.count--
if cm.count == 0 {
cm.cancel()
delete(q.mappers, mk)
}
}
func (q *querier) worker() {
eb := backoff.NewExponentialBackOff()
eb.MaxElapsedTime = 0 // retry indefinitely
eb.MaxInterval = dbMaxBackoff
bkoff := backoff.WithContext(eb, q.ctx)
for {
mk, err := q.workQ.acquire()
if err != nil {
// context expired
return
}
err = backoff.Retry(func() error {
return q.query(mk)
}, bkoff)
if err != nil {
bkoff.Reset()
}
q.workQ.done(mk)
}
}
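// query resolves the mappings for a key and delivers them to its mapper: agent keys query the
// agent's clients, while client keys query the agent itself.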
func (q *querier) query(mk mKey) error {
var mappings []mapping
var err error
// If the mapping is an agent, query all of its clients.
if mk.kind == agpl.QueueKindAgent {
mappings, err = q.queryClientsOfAgent(mk.agent)
if err != nil {
return err
}
} else {
// The mapping is for clients subscribed to the agent. Query the agent
// itself.
mappings, err = q.queryAgent(mk.agent)
if err != nil {
return err
}
}
q.mu.Lock()
mpr, ok := q.mappers[mk]
q.mu.Unlock()
if !ok {
q.logger.Debug(q.ctx, "query for missing mapper",
slog.F("agent_id", mk.agent), slog.F("kind", mk.kind))
return nil
}
q.logger.Debug(q.ctx, "sending mappings", slog.F("mapping_len", len(mappings)))
mpr.mappings <- mappings
return nil
}
func (q *querier) queryClientsOfAgent(agent uuid.UUID) ([]mapping, error) {
clients, err := q.store.GetTailnetClientsForAgent(q.ctx, agent)
q.logger.Debug(q.ctx, "queried clients of agent",
slog.F("agent_id", agent), slog.F("num_clients", len(clients)), slog.Error(err))
if err != nil {
return nil, err
}
mappings := make([]mapping, 0, len(clients))
for _, client := range clients {
node := new(agpl.Node)
err := json.Unmarshal(client.Node, node)
if err != nil {
q.logger.Error(q.ctx, "failed to unmarshal node", slog.Error(err))
return nil, backoff.Permanent(err)
}
mappings = append(mappings, mapping{
client: client.ID,
agent: agent,
coordinator: client.CoordinatorID,
updatedAt: client.UpdatedAt,
node: node,
})
}
return mappings, nil
}
func (q *querier) queryAgent(agentID uuid.UUID) ([]mapping, error) {
agents, err := q.store.GetTailnetAgents(q.ctx, agentID)
q.logger.Debug(q.ctx, "queried agents",
slog.F("agent_id", agentID), slog.F("num_agents", len(agents)), slog.Error(err))
if err != nil {
return nil, err
}
return q.agentsToMappings(agents)
}
func (q *querier) agentsToMappings(agents []database.TailnetAgent) ([]mapping, error) {
slog.Helper()
mappings := make([]mapping, 0, len(agents))
for _, agent := range agents {
node := new(agpl.Node)
err := json.Unmarshal(agent.Node, node)
if err != nil {
q.logger.Error(q.ctx, "failed to unmarshal node", slog.Error(err))
return nil, backoff.Permanent(err)
}
mappings = append(mappings, mapping{
agent: agent.ID,
coordinator: agent.CoordinatorID,
updatedAt: agent.UpdatedAt,
node: node,
})
}
return mappings, nil
}
// subscribe starts our subscriptions to client and agent updates in a new goroutine, and returns once we are subscribed
// or the querier context is canceled.
func (q *querier) subscribe() {
subscribed := make(chan struct{})
go func() {
defer close(subscribed)
eb := backoff.NewExponentialBackOff()
eb.MaxElapsedTime = 0 // retry indefinitely
eb.MaxInterval = dbMaxBackoff
bkoff := backoff.WithContext(eb, q.ctx)
var cancelClient context.CancelFunc
err := backoff.Retry(func() error {
cancelFn, err := q.pubsub.SubscribeWithErr(eventClientUpdate, q.listenClient)
if err != nil {
q.logger.Warn(q.ctx, "failed to subscribe to client updates", slog.Error(err))
return err
}
cancelClient = cancelFn
return nil
}, bkoff)
if err != nil {
if q.ctx.Err() == nil {
q.logger.Error(q.ctx, "code bug: retry failed before context canceled", slog.Error(err))
}
return
}
defer cancelClient()
bkoff.Reset()
q.logger.Debug(q.ctx, "subscribed to client updates")
var cancelAgent context.CancelFunc
err = backoff.Retry(func() error {
cancelFn, err := q.pubsub.SubscribeWithErr(eventAgentUpdate, q.listenAgent)
if err != nil {
q.logger.Warn(q.ctx, "failed to subscribe to agent updates", slog.Error(err))
return err
}
cancelAgent = cancelFn
return nil
}, bkoff)
if err != nil {
if q.ctx.Err() == nil {
q.logger.Error(q.ctx, "code bug: retry failed before context canceled", slog.Error(err))
}
return
}
defer cancelAgent()
q.logger.Debug(q.ctx, "subscribed to agent updates")
// unblock the outer function from returning
subscribed <- struct{}{}
// hold subscriptions open until context is canceled
<-q.ctx.Done()
}()
<-subscribed
}
func (q *querier) listenClient(_ context.Context, msg []byte, err error) {
if xerrors.Is(err, pubsub.ErrDroppedMessages) {
q.logger.Warn(q.ctx, "pubsub may have dropped client updates")
// we need to schedule a full resync of client mappings
q.resyncClientMappings()
return
}
if err != nil {
q.logger.Warn(q.ctx, "unhandled pubsub error", slog.Error(err))
return
}
client, agent, err := parseClientUpdate(string(msg))
if err != nil {
q.logger.Error(q.ctx, "failed to parse client update", slog.F("msg", string(msg)), slog.Error(err))
return
}
logger := q.logger.With(slog.F("client_id", client), slog.F("agent_id", agent))
logger.Debug(q.ctx, "got client update")
mk := mKey{
agent: agent,
kind: agpl.QueueKindAgent,
}
q.mu.Lock()
_, ok := q.mappers[mk]
q.mu.Unlock()
if !ok {
logger.Debug(q.ctx, "ignoring update because we have no mapper")
return
}
q.workQ.enqueue(mk)
}
func (q *querier) listenAgent(_ context.Context, msg []byte, err error) {
if xerrors.Is(err, pubsub.ErrDroppedMessages) {
q.logger.Warn(q.ctx, "pubsub may have dropped agent updates")
// we need to schedule a full resync of agent mappings
q.resyncAgentMappings()
return
}
if err != nil {
q.logger.Warn(q.ctx, "unhandled pubsub error", slog.Error(err))
}
agent, err := parseUpdateMessage(string(msg))
if err != nil {
q.logger.Error(q.ctx, "failed to parse agent update", slog.F("msg", string(msg)), slog.Error(err))
return
}
logger := q.logger.With(slog.F("agent_id", agent))
logger.Debug(q.ctx, "got agent update")
mk := mKey{
agent: agent,
kind: agpl.QueueKindClient,
}
q.mu.Lock()
_, ok := q.mappers[mk]
q.mu.Unlock()
if !ok {
logger.Debug(q.ctx, "ignoring update because we have no mapper")
return
}
q.workQ.enqueue(mk)
}
func (q *querier) resyncClientMappings() {
q.mu.Lock()
defer q.mu.Unlock()
for mk := range q.mappers {
if mk.kind == agpl.QueueKindClient {
q.workQ.enqueue(mk)
}
}
}
func (q *querier) resyncAgentMappings() {
q.mu.Lock()
defer q.mu.Unlock()
for mk := range q.mappers {
if mk.kind == agpl.QueueKindAgent {
q.workQ.enqueue(mk)
}
}
}
func (q *querier) handleUpdates() {
for {
select {
case <-q.ctx.Done():
return
case u := <-q.updates:
if u.filter == filterUpdateUpdated {
q.updateAll()
}
if u.health == healthUpdateUnhealthy {
q.unhealthyCloseAll()
continue
}
if u.health == healthUpdateHealthy {
q.setHealthy()
continue
}
}
}
}
func (q *querier) updateAll() {
q.mu.Lock()
defer q.mu.Unlock()
for _, cm := range q.mappers {
// send on goroutine to avoid holding the q.mu. Heartbeat failures come asynchronously with respect to
// other kinds of work, so it's fine to deliver the command to refresh async.
go func(m *mapper) {
// make sure we send on the _mapper_ context, not our own in case the mapper is
// shutting down or shut down.
_ = sendCtx(m.ctx, m.update, struct{}{})
}(cm.mapper)
}
}
// unhealthyCloseAll marks the coordinator unhealthy and closes all connections. We do this so that clients and agents
// are forced to reconnect to the coordinator, and will hopefully land on a healthy coordinator.
func (q *querier) unhealthyCloseAll() {
q.mu.Lock()
defer q.mu.Unlock()
q.healthy = false
for _, c := range q.conns {
// close connections async so that we don't block the querier routine that responds to updates
go func(c agpl.Queue) {
err := c.Close()
if err != nil {
q.logger.Debug(q.ctx, "error closing conn while unhealthy", slog.Error(err))
}
}(c)
// NOTE: we don't need to remove the connection from the map, as that will happen async in q.cleanupConn()
}
}
func (q *querier) setHealthy() {
q.mu.Lock()
defer q.mu.Unlock()
q.healthy = true
}
func (q *querier) getAll(ctx context.Context) (map[uuid.UUID]database.TailnetAgent, map[uuid.UUID][]database.TailnetClient, error) {
agents, err := q.store.GetAllTailnetAgents(ctx)
if err != nil {
return nil, nil, xerrors.Errorf("get all tailnet agents: %w", err)
}
agentsMap := map[uuid.UUID]database.TailnetAgent{}
for _, agent := range agents {
agentsMap[agent.ID] = agent
}
clients, err := q.store.GetAllTailnetClients(ctx)
if err != nil {
return nil, nil, xerrors.Errorf("get all tailnet clients: %w", err)
}
clientsMap := map[uuid.UUID][]database.TailnetClient{}
for _, client := range clients {
for _, agentID := range client.AgentIds {
clientsMap[agentID] = append(clientsMap[agentID], client.TailnetClient)
}
}
return agentsMap, clientsMap, nil
}
func parseClientUpdate(msg string) (client, agent uuid.UUID, err error) {
parts := strings.Split(msg, ",")
if len(parts) != 2 {
return uuid.Nil, uuid.Nil, xerrors.Errorf("expected 2 parts separated by comma")
}
client, err = uuid.Parse(parts[0])
if err != nil {
return uuid.Nil, uuid.Nil, xerrors.Errorf("failed to parse client UUID: %w", err)
}
agent, err = uuid.Parse(parts[1])
if err != nil {
return uuid.Nil, uuid.Nil, xerrors.Errorf("failed to parse agent UUID: %w", err)
}
return client, agent, nil
}
func parseUpdateMessage(msg string) (agent uuid.UUID, err error) {
agent, err = uuid.Parse(msg)
if err != nil {
return uuid.Nil, xerrors.Errorf("failed to parse update message UUID: %w", err)
}
return agent, nil
}
// mKey identifies a set of node mappings we want to query.
type mKey struct {
agent uuid.UUID
// we always query based on the agent ID, but if we have client connection(s), we query the agent itself. If we
// have an agent connection, we need the node mappings for all clients of the agent.
kind agpl.QueueKind
}
// mapping associates a particular client or agent, and its respective coordinator with a node. It is generalized to
// include clients or agents: agent mappings will have client set to uuid.Nil.
type mapping struct {
client uuid.UUID
agent uuid.UUID
coordinator uuid.UUID
updatedAt time.Time
node *agpl.Node
}
type queueKey interface {
mKey | bKey | sKey
}
// workQ allows scheduling work based on a key. Multiple enqueue requests for the same key are coalesced, and
// only one in-progress job per key is scheduled.
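//
// A typical worker loop, sketched from the usage elsewhere in this file:
//
//	for {
//		key, err := q.acquire()
//		if err != nil {
//			return // context canceled
//		}
//		// ... do the work for key ...
//		q.done(key)
//	}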
type workQ[K queueKey] struct {
ctx context.Context
cond *sync.Cond
pending []K
inProgress map[K]bool
}
func newWorkQ[K queueKey](ctx context.Context) *workQ[K] {
q := &workQ[K]{
ctx: ctx,
cond: sync.NewCond(&sync.Mutex{}),
inProgress: make(map[K]bool),
}
// wake up all waiting workers when context is done
go func() {
<-ctx.Done()
q.cond.L.Lock()
defer q.cond.L.Unlock()
q.cond.Broadcast()
}()
return q
}
// enqueue adds the key to the workQ if it is not already pending.
func (q *workQ[K]) enqueue(key K) {
q.cond.L.Lock()
defer q.cond.L.Unlock()
for _, mk := range q.pending {
if mk == key {
// already pending, no-op
return
}
}
q.pending = append(q.pending, key)
q.cond.Signal()
}
// acquire gets a new key to begin working on. This call blocks until work is available. After acquiring a key, the
// worker MUST call done() with the same key to mark it complete and allow new pending work to be acquired for the key.
// An error is returned if the workQ context is canceled to unblock waiting workers.
func (q *workQ[K]) acquire() (key K, err error) {
q.cond.L.Lock()
defer q.cond.L.Unlock()
for !q.workAvailable() && q.ctx.Err() == nil {
q.cond.Wait()
}
if q.ctx.Err() != nil {
return key, q.ctx.Err()
}
for i, mk := range q.pending {
_, ok := q.inProgress[mk]
if !ok {
q.pending = append(q.pending[:i], q.pending[i+1:]...)
q.inProgress[mk] = true
return mk, nil
}
}
// this should not be possible because we are holding the lock when we exit the loop that waits
panic("woke with no work available")
}
// workAvailable returns true if there is work we can do. Must be called while holding q.cond.L
func (q workQ[K]) workAvailable() bool {
for _, mk := range q.pending {
_, ok := q.inProgress[mk]
if !ok {
return true
}
}
return false
}
// done marks the key completed; MUST be called after acquire() for each key.
func (q *workQ[K]) done(key K) {
q.cond.L.Lock()
defer q.cond.L.Unlock()
delete(q.inProgress, key)
q.cond.Signal()
}
type filterUpdate int
const (
filterUpdateNone filterUpdate = iota
filterUpdateUpdated
)
type healthUpdate int
const (
healthUpdateNone healthUpdate = iota
healthUpdateHealthy
healthUpdateUnhealthy
)
// hbUpdate is an update sent from the heartbeats to the querier. Zero values of the fields mean no update of that
// kind.
type hbUpdate struct {
filter filterUpdate
health healthUpdate
}
// heartbeats sends heartbeats for this coordinator on a timer, and monitors heartbeats from other coordinators. If a
// coordinator misses their heartbeat, we remove it from our map of "valid" coordinators, such that we will filter out
// any mappings for it when filter() is called, and we send a signal on the update channel, which triggers all mappers
// to recompute their mappings and push them out to their connections.
type heartbeats struct {
ctx context.Context
logger slog.Logger
pubsub pubsub.Pubsub
store database.Store
self uuid.UUID
update chan<- hbUpdate
firstHeartbeat chan<- struct{}
failedHeartbeats int
lock sync.RWMutex
coordinators map[uuid.UUID]time.Time
timer *time.Timer
// overwritten in tests, but otherwise constant
cleanupPeriod time.Duration
}
func newHeartbeats(
ctx context.Context, logger slog.Logger,
ps pubsub.Pubsub, store database.Store,
self uuid.UUID, update chan<- hbUpdate,
firstHeartbeat chan<- struct{},
) *heartbeats {
h := &heartbeats{
ctx: ctx,
logger: logger,
pubsub: ps,
store: store,
self: self,
update: update,
firstHeartbeat: firstHeartbeat,
coordinators: make(map[uuid.UUID]time.Time),
cleanupPeriod: cleanupPeriod,
}
go h.subscribe()
go h.sendBeats()
go h.cleanupLoop()
return h
}
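// filter returns only the mappings that originate from this coordinator or from another coordinator
// with a current heartbeat.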
func (h *heartbeats) filter(mappings []mapping) []mapping {
out := make([]mapping, 0, len(mappings))
h.lock.RLock()
defer h.lock.RUnlock()
for _, m := range mappings {
ok := m.coordinator == h.self
if !ok {
_, ok = h.coordinators[m.coordinator]
}
if ok {
out = append(out, m)
}
}
return out
}
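// subscribe listens for heartbeat notifications from other coordinators on the pubsub, retrying with
// backoff until subscribed, then holds the subscription open until the context is canceled.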
func (h *heartbeats) subscribe() {
eb := backoff.NewExponentialBackOff()
eb.MaxElapsedTime = 0 // retry indefinitely
eb.MaxInterval = dbMaxBackoff
bkoff := backoff.WithContext(eb, h.ctx)
var cancel context.CancelFunc
bErr := backoff.Retry(func() error {
cancelFn, err := h.pubsub.SubscribeWithErr(EventHeartbeats, h.listen)
if err != nil {
h.logger.Warn(h.ctx, "failed to subscribe to heartbeats", slog.Error(err))
return err
}
cancel = cancelFn
return nil
}, bkoff)
if bErr != nil {
if h.ctx.Err() == nil {
h.logger.Error(h.ctx, "code bug: retry failed before context canceled", slog.Error(bErr))
}
return
}
// cancel subscription when context finishes
defer cancel()
<-h.ctx.Done()
}
func (h *heartbeats) listen(_ context.Context, msg []byte, err error) {
if err != nil {
// in the context of heartbeats, if we miss some messages it will be OK as long
// as we aren't disconnected for multiple beats. Still, even if we are disconnected
// for longer, there isn't much to do except log. Once we reconnect we will reinstate
// any expired coordinators that are still alive and continue on.
h.logger.Warn(h.ctx, "heartbeat notification error", slog.Error(err))
return
}
id, err := uuid.Parse(string(msg))
if err != nil {
h.logger.Error(h.ctx, "unable to parse heartbeat", slog.F("msg", string(msg)), slog.Error(err))
return
}
if id == h.self {
h.logger.Debug(h.ctx, "ignoring our own heartbeat")
return
}
h.recvBeat(id)
}
func (h *heartbeats) recvBeat(id uuid.UUID) {
h.logger.Debug(h.ctx, "got heartbeat", slog.F("other_coordinator_id", id))
h.lock.Lock()
defer h.lock.Unlock()
if _, ok := h.coordinators[id]; !ok {
h.logger.Info(h.ctx, "heartbeats (re)started", slog.F("other_coordinator_id", id))
// send on a separate goroutine to avoid holding lock. Triggering update can be async
go func() {
_ = sendCtx(h.ctx, h.update, hbUpdate{filter: filterUpdateUpdated})
}()
}
h.coordinators[id] = time.Now()
if h.timer == nil {
// this can only happen for the very first beat
h.timer = time.AfterFunc(MissedHeartbeats*HeartbeatPeriod, h.checkExpiry)
h.logger.Debug(h.ctx, "set initial heartbeat timeout")
return
}
h.resetExpiryTimerWithLock()
}
func (h *heartbeats) resetExpiryTimerWithLock() {
var oldestTime time.Time
for _, t := range h.coordinators {
if oldestTime.IsZero() || t.Before(oldestTime) {
oldestTime = t
}
}
d := time.Until(oldestTime.Add(MissedHeartbeats * HeartbeatPeriod))
h.logger.Debug(h.ctx, "computed oldest heartbeat", slog.F("oldest", oldestTime), slog.F("time_to_expiry", d))
// only reschedule if it's in the future.
if d > 0 {
h.timer.Reset(d)
}
}
func (h *heartbeats) checkExpiry() {
h.logger.Debug(h.ctx, "checking heartbeat expiry")
h.lock.Lock()
defer h.lock.Unlock()
now := time.Now()
expired := false
for id, t := range h.coordinators {
lastHB := now.Sub(t)
h.logger.Debug(h.ctx, "last heartbeat from coordinator", slog.F("other_coordinator_id", id), slog.F("last_heartbeat", lastHB))
if lastHB > MissedHeartbeats*HeartbeatPeriod {
expired = true
delete(h.coordinators, id)
h.logger.Info(h.ctx, "coordinator failed heartbeat check", slog.F("other_coordinator_id", id), slog.F("last_heartbeat", lastHB))
}
}
if expired {
// send on a separate goroutine to avoid holding lock. Triggering update can be async
go func() {
_ = sendCtx(h.ctx, h.update, hbUpdate{filter: filterUpdateUpdated})
}()
}
// we need to reset the timer for when the next oldest coordinator will expire, if any.
h.resetExpiryTimerWithLock()
}
func (h *heartbeats) sendBeats() {
// send an initial heartbeat so that other coordinators can start using our bindings right away.
h.sendBeat()
close(h.firstHeartbeat) // signal binder it can start writing
defer h.sendDelete()
tkr := time.NewTicker(HeartbeatPeriod)
defer tkr.Stop()
for {
select {
case <-h.ctx.Done():
h.logger.Debug(h.ctx, "ending heartbeats", slog.Error(h.ctx.Err()))
return
case <-tkr.C:
h.sendBeat()
}
}
}
func (h *heartbeats) sendBeat() {
_, err := h.store.UpsertTailnetCoordinator(h.ctx, h.self)
if xerrors.Is(err, context.Canceled) {
return
}
if err != nil {
h.logger.Error(h.ctx, "failed to send heartbeat", slog.Error(err))
h.failedHeartbeats++
if h.failedHeartbeats == 3 {
h.logger.Error(h.ctx, "coordinator failed 3 heartbeats and is unhealthy")
_ = sendCtx(h.ctx, h.update, hbUpdate{health: healthUpdateUnhealthy})
}
return
}
h.logger.Debug(h.ctx, "sent heartbeat")
if h.failedHeartbeats >= 3 {
h.logger.Info(h.ctx, "coordinator sent heartbeat and is healthy")
_ = sendCtx(h.ctx, h.update, hbUpdate{health: healthUpdateHealthy})
}
h.failedHeartbeats = 0
}
func (h *heartbeats) sendDelete() {
// here we don't want to use the main context, since it will have been canceled
ctx := dbauthz.As(context.Background(), pgCoordSubject)
err := h.store.DeleteCoordinator(ctx, h.self)
if err != nil {
h.logger.Error(h.ctx, "failed to send coordinator delete", slog.Error(err))
return
}
h.logger.Debug(h.ctx, "deleted coordinator")
}
func (h *heartbeats) cleanupLoop() {
h.cleanup()
tkr := time.NewTicker(h.cleanupPeriod)
defer tkr.Stop()
for {
select {
case <-h.ctx.Done():
h.logger.Debug(h.ctx, "ending cleanupLoop", slog.Error(h.ctx.Err()))
return
case <-tkr.C:
h.cleanup()
}
}
}
// cleanup issues a DB command to clean out any old, expired coordinator state. The cleanup is idempotent, so there is
// no need to synchronize with other coordinators.
func (h *heartbeats) cleanup() {
err := h.store.CleanTailnetCoordinators(h.ctx)
if err != nil {
// the records we are attempting to clean up do no serious harm other than
// accumulating in the tables, so we don't bother retrying if it fails.
h.logger.Error(h.ctx, "failed to cleanup old coordinators", slog.Error(err))
return
}
h.logger.Debug(h.ctx, "cleaned up old coordinators")
}
func (c *pgCoord) ServeHTTPDebug(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
debug, err := c.htmlDebug(ctx)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
_, _ = w.Write([]byte(err.Error()))
return
}
agpl.CoordinatorHTTPDebug(debug)(w, r)
}
func (c *pgCoord) htmlDebug(ctx context.Context) (agpl.HTMLDebug, error) {
now := time.Now()
data := agpl.HTMLDebug{}
agents, clients, err := c.querier.getAll(ctx)
if err != nil {
return data, xerrors.Errorf("get all agents and clients: %w", err)
}
for _, agent := range agents {
htmlAgent := &agpl.HTMLAgent{
ID: agent.ID,
// Name: ??, TODO: get agent names
LastWriteAge: now.Sub(agent.UpdatedAt).Round(time.Second),
}
for _, conn := range clients[agent.ID] {
htmlAgent.Connections = append(htmlAgent.Connections, &agpl.HTMLClient{
ID: conn.ID,
Name: conn.ID.String(),
LastWriteAge: now.Sub(conn.UpdatedAt).Round(time.Second),
})
data.Nodes = append(data.Nodes, &agpl.HTMLNode{
ID: conn.ID,
Node: conn.Node,
})
}
slices.SortFunc(htmlAgent.Connections, func(a, b *agpl.HTMLClient) int {
return slice.Ascending(a.Name, b.Name)
})
data.Agents = append(data.Agents, htmlAgent)
data.Nodes = append(data.Nodes, &agpl.HTMLNode{
ID: agent.ID,
// Name: ??, TODO: get agent names
Node: agent.Node,
})
}
slices.SortFunc(data.Agents, func(a, b *agpl.HTMLAgent) int {
return slice.Ascending(a.Name, b.Name)
})
for agentID, conns := range clients {
if len(conns) == 0 {
continue
}
if _, ok := agents[agentID]; ok {
continue
}
agent := &agpl.HTMLAgent{
Name: "unknown",
ID: agentID,
}
for _, conn := range conns {
agent.Connections = append(agent.Connections, &agpl.HTMLClient{
Name: conn.ID.String(),
ID: conn.ID,
LastWriteAge: now.Sub(conn.UpdatedAt).Round(time.Second),
})
data.Nodes = append(data.Nodes, &agpl.HTMLNode{
ID: conn.ID,
Node: conn.Node,
})
}
slices.SortFunc(agent.Connections, func(a, b *agpl.HTMLClient) int {
return slice.Ascending(a.Name, b.Name)
})
data.MissingAgents = append(data.MissingAgents, agent)
}
slices.SortFunc(data.MissingAgents, func(a, b *agpl.HTMLAgent) int {
return slice.Ascending(a.Name, b.Name)
})
return data, nil
}