netbirdio / netbird

Connect your devices into a secure WireGuard®-based overlay network with SSO, MFA and granular access controls.
https://netbird.io
BSD 3-Clause "New" or "Revised" License
10.74k stars 484 forks source link

Network Monitor causes race conditions when multiple interfaces are connected #2130

Closed hurricanehrndz closed 3 months ago

hurricanehrndz commented 3 months ago

Describe the problem

When more than one network interface is connected to the same network, the network monitor will try and restart the engine multiple times in succession

This error will occur

2024-05-29T17:14:13-04:00 INFO client/internal/connect.go:339: using 43196 as wireguard port: 43195 is in use
2024-05-29T17:14:13-04:00 INFO client/internal/routemanager/manager.go:93: Routing setup complete
2024-05-29T17:14:13-04:00 ERRO client/internal/engine.go:313: failed creating tunnel interface utun4319: [resource busy]
2024-05-29T17:14:13-04:00 INFO client/internal/routemanager/manager.go:117: Routing cleanup complete
2024-05-29T17:14:13-04:00 ERRO client/internal/connect.go:261: error while starting Netbird Connection Engine: create wg interface: resource busy

To Reproduce

Steps to reproduce the behavior: Using macOS device, connected via ethernet and wifi, wait, disconnect, watch netbird fail

Expected behavior

For a resource conflict and race condition not to occur

Are you using NetBird Cloud?

No

NetBird version

0.28/master

Possible Solution

Add a de-bounce mechanism to avoid multiple engine restarts within 3s of a restart

From a1f8b0ed4c5d49bb0c00243f5b5a11becae3cf1e Mon Sep 17 00:00:00 2001
From: Carlos Hernandez <carlos@hrndz.ca>
Date: Fri, 31 May 2024 08:16:30 -0600
Subject: [PATCH] Ignore route changes if last was less than 2s ago

---
 client/internal/engine.go | 71 ++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/client/internal/engine.go b/client/internal/engine.go
index 2cbde13a..915e3ea0 100644
--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -152,6 +152,8 @@ type Engine struct {
    wgProbe     *Probe

    wgConnWorker sync.WaitGroup
+
+   lastNetworkChangeTimestamp *NetworkChangeTimestamp
 }

 // Peer is an instance of the Connection Peer
@@ -160,6 +162,17 @@ type Peer struct {
    WgAllowedIps string
 }

+// NetworkChangeTimestamp is a thread-safe structure that holds the timestamp of the last network change
+type NetworkChangeTimestamp struct {
+   mu    sync.Mutex
+   value int64
+}
+
+// NewNetworkChangeTimestamp creates a new NetworkChangeTimestamp instance with the current time as the initial value
+func NewNetworkChangeTimestamp() *NetworkChangeTimestamp {
+   return &NetworkChangeTimestamp{value: time.Now().Unix()}
+}
+
 // NewEngine creates a new Connection Engine
 func NewEngine(
    clientCtx context.Context,
@@ -199,25 +212,25 @@ func NewEngineWithProbes(
    relayProbe *Probe,
    wgProbe *Probe,
 ) *Engine {
-
    return &Engine{
-       clientCtx:      clientCtx,
-       clientCancel:   clientCancel,
-       signal:         signalClient,
-       mgmClient:      mgmClient,
-       peerConns:      make(map[string]*peer.Conn),
-       syncMsgMux:     &sync.Mutex{},
-       config:         config,
-       mobileDep:      mobileDep,
-       STUNs:          []*stun.URI{},
-       TURNs:          []*stun.URI{},
-       networkSerial:  0,
-       sshServerFunc:  nbssh.DefaultSSHServer,
-       statusRecorder: statusRecorder,
-       mgmProbe:       mgmProbe,
-       signalProbe:    signalProbe,
-       relayProbe:     relayProbe,
-       wgProbe:        wgProbe,
+       clientCtx:                  clientCtx,
+       clientCancel:               clientCancel,
+       signal:                     signalClient,
+       mgmClient:                  mgmClient,
+       peerConns:                  make(map[string]*peer.Conn),
+       syncMsgMux:                 &sync.Mutex{},
+       config:                     config,
+       mobileDep:                  mobileDep,
+       STUNs:                      []*stun.URI{},
+       TURNs:                      []*stun.URI{},
+       networkSerial:              0,
+       sshServerFunc:              nbssh.DefaultSSHServer,
+       statusRecorder:             statusRecorder,
+       mgmProbe:                   mgmProbe,
+       signalProbe:                signalProbe,
+       relayProbe:                 relayProbe,
+       wgProbe:                    wgProbe,
+       lastNetworkChangeTimestamp: &NetworkChangeTimestamp{},
    }
 }

@@ -228,6 +241,7 @@ func (e *Engine) Stop() error {
    if e.cancel != nil {
        e.cancel()
    }
+   e.cancel = nil

    // stopping network monitor first to avoid starting the engine again
    if e.networkMonitor != nil {
@@ -358,7 +372,6 @@ func (e *Engine) Start() error {
 // modifyPeers updates peers that have been modified (e.g. IP address has been changed).
 // It closes the existing connection, removes it from the peerConns map, and creates a new one.
 func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
-
    // first, check if peers have been modified
    var modified []*mgmProto.RemotePeerConfig
    for _, p := range peersUpdate {
@@ -481,7 +494,8 @@ func sendSignal(message *sProto.Message, s signal.Client) error {

 // SignalOfferAnswer signals either an offer or an answer to remote peer
 func SignalOfferAnswer(offerAnswer peer.OfferAnswer, myKey wgtypes.Key, remoteKey wgtypes.Key, s signal.Client,
-   isAnswer bool) error {
+   isAnswer bool,
+) error {
    var t sProto.Body_Type
    if isAnswer {
        t = sProto.Body_ANSWER
@@ -539,7 +553,6 @@ func isNil(server nbssh.Server) bool {
 }

 func (e *Engine) updateSSH(sshConf *mgmProto.SSHConfig) error {
-
    if !e.config.ServerSSHAllowed {
        log.Warnf("running SSH server is not permitted")
        return nil
@@ -672,7 +685,6 @@ func (e *Engine) updateTURNs(turns []*mgmProto.ProtectedHostConfig) error {
 }

 func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
-
    // intentionally leave it before checking serial because for now it can happen that peer IP changed but serial didn't
    if networkMap.GetPeerConfig() != nil {
        err := e.updateConfig(networkMap.GetPeerConfig())
@@ -1110,7 +1122,7 @@ func (e *Engine) receiveSignalEvents() {

 func (e *Engine) parseNATExternalIPMappings() []string {
    var mappedIPs []string
-   var ignoredIFaces = make(map[string]interface{})
+   ignoredIFaces := make(map[string]interface{})
    for _, iFace := range e.config.IFaceBlackList {
        ignoredIFaces[iFace] = nil
    }
@@ -1411,7 +1423,20 @@ func (e *Engine) startNetworkMonitor() {
    e.networkMonitor = networkmonitor.New()
    go func() {
        err := e.networkMonitor.Start(e.ctx, func() {
+           log.Infof("Network monitor detected network change")
+           e.lastNetworkChangeTimestamp.mu.Lock()
+           defer e.lastNetworkChangeTimestamp.mu.Unlock()
+           now := time.Now().Unix()
+           if e.lastNetworkChangeTimestamp.value + 3 > now  || e.cancel == nil {
+               log.Infof("Network monitor, skipping engine restart")
+               return
+           }
            log.Infof("Network monitor detected network change, restarting engine")
+           e.lastNetworkChangeTimestamp.value = now
+           if e.cancel == nil {
+               log.Info("restart already in progress, skipping")
+               return
+           }
            if err := e.Stop(); err != nil {
                log.Errorf("Failed to stop engine: %v", err)
            }
hurricanehrndz commented 3 months ago

@bcmmbaga you may want to look at this because of #2126

hurricanehrndz commented 3 months ago

closed by #2225