
573 lines
16 KiB

package tailnet
import (
// lostTimeout is the handshake-age threshold for lost peers: peerLostTimeout
// removes a lost peer once at least this long has passed since its last
// handshake, and setLostTimer computes its timer delay relative to it.
const lostTimeout = 15 * time.Minute
// engineConfigurable is the subset of wgengine.Engine that we use for configuration.
// This allows us to test configuration code without faking the whole interface.
type engineConfigurable interface {
// Reconfig is invoked by configMaps.reconfig with the computed wireguard and
// router configuration.
Reconfig(*wgcfg.Config, *router.Config, *dns.Config, *tailcfg.Debug) error
// phase identifies the state that configLoop is currently in.
type phase int
const (
// idle is assigned by configLoop while no dirty flag is set and the
// configMaps is not closing.
idle phase = iota
// phased carries the current phase. configLoop writes the phase and close
// reads it to learn when the loop has finished.
type phased struct {
phase phase
// configMaps holds the state used to configure the engine: dirty flags that
// tell configLoop what to push, plus the addresses, peers and DERP map the
// configuration is derived from.
type configMaps struct {
// netmapDirty, derpMapDirty and filterDirty mark which parts of the engine
// configuration configLoop still has to apply; it clears all three once it
// has queued the corresponding actions.
netmapDirty bool
derpMapDirty bool
filterDirty bool
// closing is set by close to ask configLoop to stop.
closing bool
// engine receives the computed configuration (see reconfig).
engine engineConfigurable
// static is the template that netMapLocked copies for every NetworkMap.
static netmap.NetworkMap
// peers holds the lifecycle state of every known peer, keyed by peer ID.
peers map[uuid.UUID]*peerLifecycle
// addresses are this node's own prefixes, replaced by setAddresses.
addresses []netip.Prefix
// derpMap is the most recent DERP map stored by setDERPMap.
derpMap *proto.DERPMap
logger slog.Logger
// blockEndpoints, when true, makes peerConfigLocked strip peer endpoints.
blockEndpoints bool
// for testing
clock clock.Clock
// newConfigMaps builds a configMaps for the given engine and node identity and
// starts its configLoop goroutine. The static network map it sets up carries a
// packet filter whose match covers TCP, UDP, ICMPv4, ICMPv6 and SCTP from any
// IPv4 or IPv6 source to any destination on ports 0-65535.
func newConfigMaps(logger slog.Logger, engine engineConfigurable, nodeID tailcfg.NodeID, nodeKey key.NodePrivate, discoKey key.DiscoPublic) *configMaps {
pubKey := nodeKey.Public()
c := &configMaps{
phased: phased{Cond: *(sync.NewCond(&sync.Mutex{}))},
logger: logger,
engine: engine,
static: netmap.NetworkMap{
SelfNode: &tailcfg.Node{
ID: nodeID,
Key: pubKey,
DiscoKey: discoKey,
NodeKey: pubKey,
PrivateKey: nodeKey,
PacketFilter: []filter.Match{{
// Allow any protocol!
IPProto: []ipproto.Proto{ipproto.TCP, ipproto.UDP, ipproto.ICMPv4, ipproto.ICMPv6, ipproto.SCTP},
// Allow traffic sourced from anywhere.
Srcs: []netip.Prefix{
// the zero address with prefix length 0, once for IPv4 and once for IPv6
netip.PrefixFrom(netip.AddrFrom4([4]byte{}), 0),
netip.PrefixFrom(netip.AddrFrom16([16]byte{}), 0),
// Allow traffic to route anywhere.
Dsts: []filter.NetPortRange{
Net: netip.PrefixFrom(netip.AddrFrom4([4]byte{}), 0),
Ports: filter.PortRange{
First: 0,
Last: 65535,
Net: netip.PrefixFrom(netip.AddrFrom16([16]byte{}), 0),
Ports: filter.PortRange{
First: 0,
Last: 65535,
Caps: []filter.CapMatch{},
peers: make(map[uuid.UUID]*peerLifecycle),
clock: clock.New(),
// start the background loop that applies dirty configuration to the engine
go c.configLoop()
return c
// configLoop waits for the config to be dirty, then reconfigures the engine.
// It is internal to configMaps
func (c *configMaps) configLoop() {
defer c.L.Unlock()
// record that the loop has exited; close waits for this phase
defer func() {
c.phase = closed
for {
// stay idle until there is something to apply or we are asked to close
for !(c.closing || c.netmapDirty || c.filterDirty || c.derpMapDirty) {
c.phase = idle
if c.closing {
c.logger.Debug(context.Background(), "closing configMaps configLoop")
// queue up the reconfiguration actions we will take while we have
// the configMaps locked. We will execute them while unlocked to avoid
// blocking during reconfig.
actions := make([]func(), 0, 3)
if c.derpMapDirty {
// snapshot the DERP map now so the action does not read c.derpMap later
derpMap := c.derpMapLocked()
actions = append(actions, func() {
c.logger.Debug(context.Background(), "updating engine DERP map", slog.F("derp_map", derpMap))
if c.netmapDirty {
// snapshot the network map for the same reason
nm := c.netMapLocked()
actions = append(actions, func() {
c.logger.Debug(context.Background(), "updating engine network map", slog.F("network_map", nm))
if c.filterDirty {
// snapshot the filter for the same reason
f := c.filterLocked()
actions = append(actions, func() {
c.logger.Debug(context.Background(), "updating engine filter", slog.F("filter", f))
// everything that was dirty has been captured in actions, so clear the
// flags before the actions run
c.netmapDirty = false
c.filterDirty = false
c.derpMapDirty = false
c.phase = configuring
for _, a := range actions {
// close closes the configMaps and stops it configuring the engine
func (c *configMaps) close() {
defer c.L.Unlock()
for _, lc := range c.peers {
// signal configLoop to stop; the loop below keeps going until configLoop
// has recorded the closed phase
c.closing = true
for c.phase != closed {
// netMapLocked returns the current NetworkMap as determined by the config we
// have. c.L must be held.
func (c *configMaps) netMapLocked() *netmap.NetworkMap {
nm := new(netmap.NetworkMap)
*nm = c.static
nm.Addresses = make([]netip.Prefix, len(c.addresses))
copy(nm.Addresses, c.addresses)
nm.DERPMap = DERPMapFromProto(c.derpMap)
nm.Peers = c.peerConfigLocked()
nm.SelfNode.Addresses = nm.Addresses
nm.SelfNode.AllowedIPs = nm.Addresses
return nm
// peerConfigLocked returns the set of peer nodes we have. c.L must be held.
// Every returned node is a clone of the stored one.
func (c *configMaps) peerConfigLocked() []*tailcfg.Node {
out := make([]*tailcfg.Node, 0, len(c.peers))
for _, p := range c.peers {
// work on a clone so that clearing Endpoints below never touches the
// stored peer node
n := p.node.Clone()
if c.blockEndpoints {
n.Endpoints = nil
out = append(out, n)
return out
// setAddresses sets the addresses belonging to this node to the given slice. It
// triggers configuration of the engine if the addresses have changed.
// c.L MUST NOT be held.
func (c *configMaps) setAddresses(ips []netip.Prefix) {
defer c.L.Unlock()
// prefixesDifferent ignores ordering, so a mere reordering is not a change
if d := prefixesDifferent(c.addresses, ips); !d {
// store a private copy so later changes to the caller's slice don't affect us
c.addresses = make([]netip.Prefix, len(ips))
copy(c.addresses, ips)
// our addresses feed both the network map and the packet filter
c.netmapDirty = true
c.filterDirty = true
// setBlockEndpoints sets whether we should block configuring endpoints we learn
// from peers. It triggers a configuration of the engine if the value changes.
// nolint: revive
func (c *configMaps) setBlockEndpoints(blockEndpoints bool) {
defer c.L.Unlock()
// peer endpoints are part of the network map (see peerConfigLocked), so only
// the netmap needs to be marked dirty
if c.blockEndpoints != blockEndpoints {
c.netmapDirty = true
c.blockEndpoints = blockEndpoints
// setDERPMap sets the DERP map, triggering a configuration of the engine if it has changed.
// c.L MUST NOT be held.
func (c *configMaps) setDERPMap(derpMap *proto.DERPMap) {
defer c.L.Unlock()
// compare the incoming map against the one we already hold
eq, err := c.derpMap.Equal(derpMap)
if err != nil {
c.logger.Critical(context.Background(), "failed to compare DERP maps", slog.Error(err))
if eq {
c.derpMap = derpMap
c.derpMapDirty = true
// derpMapLocked returns the current DERPMap, converted from its proto form by
// DERPMapFromProto. c.L must be held.
func (c *configMaps) derpMapLocked() *tailcfg.DERPMap {
m := DERPMapFromProto(c.derpMap)
return m
// reconfig computes the correct wireguard config and calls the engine.Reconfig
// with the config we have. It is not intended for this to be called outside of
// configLoop().
// NOTE(review): this comment used to name updateLoop(); configLoop is the only
// loop visible in this file — confirm.
func (c *configMaps) reconfig(nm *netmap.NetworkMap) {
cfg, err := nmcfg.WGCfg(nm, Logger(c.logger.Named("net.wgconfig")), netmap.AllowSingleHosts, "")
if err != nil {
// WGCfg never returns an error at the time this code was written. If it starts, returning
// errors if/when we upgrade tailscale, we'll need to deal.
c.logger.Critical(context.Background(), "update wireguard config failed", slog.Error(err))
// the router is given the same local addresses as the network map
rc := &router.Config{LocalAddrs: nm.Addresses}
// DNS and debug configuration are passed as empty values
err = c.engine.Reconfig(cfg, rc, &dns.Config{}, &tailcfg.Debug{})
if err != nil {
// wgengine.ErrNoChanges is singled out from the other errors
if errors.Is(err, wgengine.ErrNoChanges) {
c.logger.Error(context.Background(), "failed to reconfigure wireguard engine", slog.Error(err))
// filterLocked returns the current filter, based on our local addresses. c.L
// must be held.
func (c *configMaps) filterLocked() *filter.Filter {
localIPSet := netipx.IPSetBuilder{}
for _, addr := range c.addresses {
// the error result of IPSet() is discarded
localIPs, _ := localIPSet.IPSet()
logIPSet := netipx.IPSetBuilder{}
// the error result of IPSet() is discarded here as well
logIPs, _ := logIPSet.IPSet()
return filter.New(
// updatePeers handles protocol updates about peers from the coordinator. c.L MUST NOT be held.
func (c *configMaps) updatePeers(updates []*proto.CoordinateResponse_PeerUpdate) {
// query the engine status first; it is used to refresh handshake times below
// and is passed on to updatePeerLocked
status := c.status()
defer c.L.Unlock()
// Update all the lastHandshake values here. That way we don't have to
// worry about them being up-to-date when handling updates below, and it covers
// all peers, not just the ones we got updates about.
for _, lc := range c.peers {
if peerStatus, ok := status.Peer[lc.node.Key]; ok {
lc.lastHandshake = peerStatus.LastHandshake
for _, update := range updates {
// a single dirtying update is enough to mark the whole netmap dirty
if dirty := c.updatePeerLocked(update, status); dirty {
c.netmapDirty = true
if c.netmapDirty {
// status requests a status update from the engine.
func (c *configMaps) status() *ipnstate.Status {
// WantPeers is set because callers look up entries in the result's Peer map
sb := &ipnstate.StatusBuilder{WantPeers: true}
return sb.Status()
// updatePeerLocked processes a single update for a single peer. It is intended
// as internal function since it returns whether or not the config is dirtied by
// the update (instead of handling it directly like updatePeers). c.L must be held.
//
// status is the engine status used to look up the peer's handshake time and
// activity. The result is true when the update changed the set of peers or
// an existing peer's node.
func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdate, status *ipnstate.Status) (dirty bool) {
id, err := uuid.FromBytes(update.Id)
if err != nil {
c.logger.Critical(context.Background(), "received update with bad id", slog.F("id", update.Id))
return false
logger := c.logger.With(slog.F("peer_id", id))
// ok records whether we already track this peer
lc, ok := c.peers[id]
var node *tailcfg.Node
if update.Kind == proto.CoordinateResponse_PeerUpdate_NODE {
// If no preferred DERP is provided, we can't reach the node.
// NOTE(review): update.Node is dereferenced without a nil check — confirm
// that NODE updates always carry a node.
if update.Node.PreferredDerp == 0 {
logger.Warn(context.Background(), "no preferred DERP, peer update", slog.F("node_proto", update.Node))
return false
node, err = c.protoNodeToTailcfg(update.Node)
if err != nil {
// NOTE(review): err itself is not attached to this log entry — consider
// adding slog.Error(err).
logger.Critical(context.Background(), "failed to convert proto node to tailcfg", slog.F("node_proto", update.Node))
return false
logger = logger.With(slog.F("key_id", node.Key.ShortString()), slog.F("node", node))
// NOTE(review): this `ok` comes from a `:=`; whether it shadows the `ok`
// from the c.peers lookup above depends on block scoping — confirm the
// switch below tests the intended value.
peerStatus, ok := status.Peer[node.Key]
// Starting KeepAlive messages at the initialization of a connection
// causes a race condition. If we send the handshake before the peer has
// our node, we'll have to wait for 5 seconds before trying again.
// Ideally, the first handshake starts when the user first initiates a
// connection to the peer. After a successful connection we enable
// keep alives to persist the connection and keep it from becoming idle.
// SSH connections don't send packets while idle, so we use keep alives
// to avoid random hangs while we set up the connection again after
// inactivity.
node.KeepAlive = ok && peerStatus.Active
switch {
case !ok && update.Kind == proto.CoordinateResponse_PeerUpdate_NODE:
// new!
var lastHandshake time.Time
// seed the handshake time from the engine status when it knows this key
if ps, ok := status.Peer[node.Key]; ok {
lastHandshake = ps.LastHandshake
c.peers[id] = &peerLifecycle{
peerID: id,
node: node,
lastHandshake: lastHandshake,
lost: false,
logger.Debug(context.Background(), "adding new peer")
return true
case ok && update.Kind == proto.CoordinateResponse_PeerUpdate_NODE:
// update
// keep the original creation time so that it does not count as a change
// in the Equal comparison below
node.Created = lc.node.Created
dirty = !lc.node.Equal(node)
lc.node = node
lc.lost = false
logger.Debug(context.Background(), "node update to existing peer", slog.F("dirty", dirty))
return dirty
case !ok:
// disconnected or lost, but we don't have the node. No op
logger.Debug(context.Background(), "skipping update for peer we don't recognize")
return false
case update.Kind == proto.CoordinateResponse_PeerUpdate_DISCONNECTED:
delete(c.peers, id)
logger.Debug(context.Background(), "disconnected peer")
return true
case update.Kind == proto.CoordinateResponse_PeerUpdate_LOST:
lc.lost = true
logger.Debug(context.Background(), "marked peer lost")
// marking a node lost doesn't change anything right now, so dirty=false
return false
logger.Warn(context.Background(), "unknown peer update", slog.F("kind", update.Kind))
return false
// setAllPeersLost marks all peers as lost. Typically, this is called when we lose connection to
// the Coordinator. (When we reconnect, we will get NODE updates for all peers that are still connected
// and mark them as not lost.)
// c.L is released on return via the deferred Unlock.
func (c *configMaps) setAllPeersLost() {
defer c.L.Unlock()
for _, lc := range c.peers {
if lc.lost {
// skip processing already lost nodes, as this just results in timer churn
lc.lost = true
// it's important to drop a log here so that we see it get marked lost if grepping thru
// the logs for a specific peer
"setAllPeersLost marked peer lost",
slog.F("peer_id", lc.peerID),
slog.F("key_id", lc.node.Key.ShortString()),
// peerLostTimeout is the callback that peerLifecycle uses when a peer is lost
// and the timeout to receive a handshake fires. It removes the peer and marks
// the netmap dirty if the peer is still lost and its last handshake is at
// least lostTimeout old.
func (c *configMaps) peerLostTimeout(id uuid.UUID) {
logger := c.logger.With(slog.F("peer_id", id))
"peer lost timeout")
// First do a status update to see if the peer did a handshake while we were
// waiting
status := c.status()
defer c.L.Unlock()
lc, ok := c.peers[id]
if !ok {
"timeout triggered for peer that is removed from the map")
// refresh the stored handshake time from the status we just fetched
if peerStatus, ok := status.Peer[lc.node.Key]; ok {
lc.lastHandshake = peerStatus.LastHandshake
logger = logger.With(slog.F("key_id", lc.node.Key.ShortString()))
if !lc.lost {
"timeout triggered for peer that is no longer lost")
// measured with c.clock so that tests can control the passage of time
since := c.clock.Since(lc.lastHandshake)
if since >= lostTimeout {
context.Background(), "removing lost peer")
delete(c.peers, id)
c.netmapDirty = true
"timeout triggered for peer but it had handshake in meantime")
// protoNodeToTailcfg converts a proto.Node into a tailcfg.Node. It returns the
// error from ProtoToNode if that conversion fails.
func (c *configMaps) protoNodeToTailcfg(p *proto.Node) (*tailcfg.Node, error) {
node, err := ProtoToNode(p)
if err != nil {
return nil, err
return &tailcfg.Node{
ID: tailcfg.NodeID(p.GetId()),
// stamped with the current time from c.clock; updatePeerLocked overwrites
// it with the stored value when the peer already exists
Created: c.clock.Now(),
Key: node.Key,
DiscoKey: node.DiscoKey,
Addresses: node.Addresses,
AllowedIPs: node.AllowedIPs,
Endpoints: node.Endpoints,
// DERP is written as "<tailcfg.DerpMagicIP>:<preferred DERP id>"
DERP: fmt.Sprintf("%s:%d", tailcfg.DerpMagicIP, node.PreferredDERP),
Hostinfo: (&tailcfg.Hostinfo{}).View(),
}, nil
// nodeAddresses returns the addresses for the peer with the given publicKey, if known.
// The second result reports whether a peer with that key was found. The
// returned slice is the stored node's own Addresses slice, not a copy.
func (c *configMaps) nodeAddresses(publicKey key.NodePublic) ([]netip.Prefix, bool) {
defer c.L.Unlock()
// peers are keyed by ID rather than by key, so this is a linear scan
for _, lc := range c.peers {
if lc.node.Key == publicKey {
return lc.node.Addresses, true
return nil, false
// peerLifecycle is the per-peer state that configMaps keeps in its peers map.
type peerLifecycle struct {
// peerID is the key under which this entry is stored in configMaps.peers.
peerID uuid.UUID
// node is the peer's current node; NODE updates replace it.
node *tailcfg.Node
// lost is set by LOST updates and setAllPeersLost, and cleared by NODE updates.
lost bool
// lastHandshake is refreshed from the engine status.
lastHandshake time.Time
// timer is the timer created by setLostTimer; nil when none is set.
timer *clock.Timer
// resetTimer drops the stored lost timer, if there is one, leaving l.timer nil.
func (l *peerLifecycle) resetTimer() {
if l.timer != nil {
l.timer = nil
// setLostTimer arms l.timer through c.clock.AfterFunc so that it fires once
// lostTimeout has elapsed since the peer's last handshake.
func (l *peerLifecycle) setLostTimer(c *configMaps) {
if l.timer != nil {
ttl := lostTimeout - c.clock.Since(l.lastHandshake)
// the deadline has already passed: use the smallest positive delay
if ttl <= 0 {
ttl = time.Nanosecond
l.timer = c.clock.AfterFunc(ttl, func() {
// prefixesDifferent returns true if the two slices contain different prefixes
// where order doesn't matter. Duplicates count: each prefix must occur the
// same number of times in both slices for them to be considered the same.
func prefixesDifferent(a, b []netip.Prefix) bool {
	if len(a) != len(b) {
		return true
	}
	// netip.Prefix is comparable, so it can key the map directly; there is no
	// need to format every prefix as a string.
	counts := make(map[netip.Prefix]int, len(a))
	for _, p := range a {
		counts[p]++
	}
	for _, p := range b {
		// A prefix that b holds more often than a (including one that a lacks
		// entirely) drives its count negative. Because the lengths are equal,
		// that is the only way the two slices can differ.
		counts[p]--
		if counts[p] < 0 {
			return true
		}
	}
	return false
}