Open n-holmstedt opened 2 years ago
This issue happened to me as well using a regular salt-minion on Windows while performing an highstate.
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\minion.py", line 1905, in _thread_return
function_name, function_args, executors, opts, data
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\minion.py", line 1861, in _execute_job_function
return_data = self.executors[fname](opts, data, func, args, kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 1235, in __call__
return self.loader.run(run_func, *args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 2268, in run
return self._last_context.run(self._run_as, _func_or_method, *args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 2283, in _run_as
return _func_or_method(*args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\executors\direct_call.py", line 12, in execute
return func(*args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 1235, in __call__
return self.loader.run(run_func, *args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 2268, in run
return self._last_context.run(self._run_as, _func_or_method, *args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 2283, in _run_as
return _func_or_method(*args, **kwargs)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\modules\state.py", line 1121, in highstate
orchestration_jid=orchestration_jid,
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\state.py", line 4513, in call_highstate
self.state.functions["cmd.run"](
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 1403, in __getitem__
func = super().__getitem__(item)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\utils\lazy.py", line 101, in __getitem__
if self._load(key):
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 2100, in _load
ret = _inner_load(mod_name)
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 2089, in _inner_load
if self._load_module(name) and key in self._dict:
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 1884, in _load_module
self.__clean_sys_path()
File "c:\salt\bin\lib\site-packages\salt-3003-py3.7.egg\salt\loader.py", line 1705, in __clean_sys_path
importlib.invalidate_caches()
File "c:\salt\bin\lib\importlib\__init__.py", line 71, in invalidate_caches
finder.invalidate_caches()
File "<frozen importlib._bootstrap_external>", line 1184, in invalidate_caches
RuntimeError: dictionary changed size during iteration
Created a very basic MR which hopefully handles this by adding a Lock around the operations with sys.path and the local list holding the loaded modules. This issue hits us once a week and is especially nasty because the minion is barely usable after that and requires a restart, which may not be available via salt as the service module may no more be present.
Sadly seeing this in lightrush/framework-laptop-formula#33 on Ubuntu 22.04.
Sadly purging salt-common
, salt-minion
or installing salt-master
fixed the problem.
As this is a constant issue with each release, the patch for 3006.8
diff --git a/salt/loader/lazy.py b/salt/loader/lazy.py
index a3b795a375..576cec65cb 100644
--- a/salt/loader/lazy.py
+++ b/salt/loader/lazy.py
@@ -227,6 +227,8 @@ class LazyLoader(salt.utils.lazy.LazyDict):
"""
mod_dict_class = dict
+ _lock = threading.RLock()
+ _global_lock = threading.Lock()
def __init__(
self,
@@ -322,8 +324,7 @@ class LazyLoader(salt.utils.lazy.LazyDict):
self.suffix_map[suffix] = (suffix, mode, kind)
self.suffix_order.append(suffix)
- self._lock = threading.RLock()
- with self._lock:
+ with LazyLoader._lock:
self._refresh_file_mapping()
super().__init__() # late init the lazy loader
@@ -565,7 +566,7 @@ class LazyLoader(salt.utils.lazy.LazyDict):
"""
Clear the dict
"""
- with self._lock:
+ with LazyLoader._lock:
super().clear() # clear the lazy loader
self.loaded_files = set()
self.missing_modules = {}
@@ -642,34 +643,36 @@ class LazyLoader(salt.utils.lazy.LazyDict):
self._reload_submodules(submodule)
def __populate_sys_path(self):
- for directory in self.extra_module_dirs:
- if directory not in sys.path:
- sys.path.append(directory)
- self._clean_module_dirs.append(directory)
+ with LazyLoader._global_lock:
+ for directory in self.extra_module_dirs:
+ if directory not in sys.path:
+ sys.path.append(directory)
+ self._clean_module_dirs.append(directory)
def __clean_sys_path(self):
- invalidate_path_importer_cache = False
- for directory in self._clean_module_dirs:
- if directory in sys.path:
- sys.path.remove(directory)
- invalidate_path_importer_cache = True
- self._clean_module_dirs = []
-
- # Be sure that sys.path_importer_cache do not contains any
- # invalid FileFinder references
- importlib.invalidate_caches()
-
- # Because we are mangling with importlib, we can find from
- # time to time an invalidation issue with
- # sys.path_importer_cache, that requires the removal of
- # FileFinder that remain None for the extra_module_dirs
- if invalidate_path_importer_cache:
- for directory in self.extra_module_dirs:
- if (
- directory in sys.path_importer_cache
- and sys.path_importer_cache[directory] is None
- ):
- del sys.path_importer_cache[directory]
+ with LazyLoader._global_lock:
+ invalidate_path_importer_cache = False
+ for directory in self._clean_module_dirs:
+ if directory in sys.path:
+ sys.path.remove(directory)
+ invalidate_path_importer_cache = True
+ self._clean_module_dirs = []
+
+ # Be sure that sys.path_importer_cache do not contains any
+ # invalid FileFinder references
+ importlib.invalidate_caches()
+
+ # Because we are mangling with importlib, we can find from
+ # time to time an invalidation issue with
+ # sys.path_importer_cache, that requires the removal of
+ # FileFinder that remain None for the extra_module_dirs
+ if invalidate_path_importer_cache:
+ for directory in self.extra_module_dirs:
+ if (
+ directory in sys.path_importer_cache
+ and sys.path_importer_cache[directory] is None
+ ):
+ del sys.path_importer_cache[directory]
def _load_module(self, name):
mod = None
@@ -700,7 +703,8 @@ class LazyLoader(salt.utils.lazy.LazyDict):
fpath_dirname = os.path.dirname(fpath)
try:
self.__populate_sys_path()
- sys.path.append(fpath_dirname)
+ with LazyLoader._global_lock:
+ sys.path.append(fpath_dirname)
if suffix == ".pyx":
mod = pyximport.load_module(name, fpath, tempfile.gettempdir())
elif suffix == ".o":
@@ -837,7 +841,8 @@ class LazyLoader(salt.utils.lazy.LazyDict):
self.missing_modules[name] = error
return False
finally:
- sys.path.remove(fpath_dirname)
+ with LazyLoader._global_lock:
+ sys.path.remove(fpath_dirname)
self.__clean_sys_path()
loader_context = salt.loader.context.LoaderContext()
@@ -1042,7 +1047,7 @@ class LazyLoader(salt.utils.lazy.LazyDict):
if "." not in key:
raise KeyError(f"The key '{key}' should contain a '.'")
mod_name, _ = key.split(".", 1)
- with self._lock:
+ with LazyLoader._lock:
# It is possible that the key is in the dictionary after
# acquiring the lock due to another thread loading it.
if mod_name in self.missing_modules or key in self._dict:
@@ -1092,7 +1097,7 @@ class LazyLoader(salt.utils.lazy.LazyDict):
"""
Load all of them
"""
- with self._lock:
+ with LazyLoader._lock:
for name in self.file_mapping:
if name in self.loaded_files or name in self.missing_modules:
continue
@@ -1101,7 +1106,7 @@ class LazyLoader(salt.utils.lazy.LazyDict):
self.loaded = True
def reload_modules(self):
- with self._lock:
+ with LazyLoader._lock:
self.loaded_files = set()
self._load_all()
@garethgreenaway, maybe you can consider it or provide feedback of what is missing.
@dwoz ^^^
The loader was never intended to be thread safe. So, I'd like to understand the issue and where we are using the loader with threads more.
The issue regularly happens in our case on plain-regular Windows minions (nothing special with deltaproxy) when initially provisioning a host via multiple state and highstate calls. It is so bad that we where required to implement a self-patching mechanism including a minion start for addressing that while provisioning the minion/server.
The problem is: when this happens, there is no way to mitigate the situation without restarting the salt-minion. The minion is burned and useless, once this happened.
To my understanding the issue is not the loader per se, but that python does not provide a thread-safe mechanism to update the sys.path. It just seems that coincidentally multiple instances of the loader do not consider that. The proposed patch is also not bullet proof because in theory something else could manipulate the sys.path as well, breaking the loader again. It just seems that in this specific situation the loader alone modifies the sys.path.
Description Deltaproxy causing importlibs invalidate_caches to throw a KeyError for caches that has already been deleted. This is causing the module function that is being executed to fail.
Setup (Please provide relevant configs and/or SLS files (be sure to remove sensitive info. There is no general set-up of Salt.)
Please be as specific as possible and give set-up details.
Running 210 IOX-XR napalm-minions controlled by a total of 30 deltaproxies (max 8 per delta) connected one master, all of them deployed in Kubernetes using the official image.
Steps to Reproduce the behavior (Include debug logs if possible and relevant) This bug is a bit hard to reproduce, and only seem to occur when scheduling custom module-commands. The example here is a small abstraction of the napalm 'cli' function run every night as:
Looking at the importlib invalidate_caches
It seems like the deltaproxy-hosted minions share this cache, and under certain circumstances a minion that has evaluated 'finder' as None wont be able to delete it from its sys.path since another minion already deleted it. Might be that this issue scales somewhat linearly with the amount of hosted minions on a delta-proxy?
We've solved this internally catching the error in the lazy loader
Expected behavior The function being scheduled for all minions under the deltaproxy should be executed.
Screenshots If applicable, add screenshots to help explain your problem.
Versions Report
salt --versions-report
(Provided by running salt --versions-report. Please also mention any differences in master/minion versions.) ```yaml Salt Version: Salt: 3004 Dependency Versions: cffi: 1.14.6 cherrypy: unknown dateutil: 2.8.1 docker-py: Not Installed gitdb: Not Installed gitpython: Not Installed Jinja2: 2.11.3 libgit2: 1.1.0 M2Crypto: Not Installed Mako: Not Installed msgpack: 1.0.2 msgpack-pure: Not Installed mysql-python: Not Installed pycparser: 2.17 pycrypto: Not Installed pycryptodome: 3.9.8 pygit2: 1.6.1 Python: 3.7.12 (default, Sep 8 2021, 01:55:52) python-gnupg: 0.4.4 PyYAML: 5.4.1 PyZMQ: 18.0.1 smmap: Not Installed timelib: 0.2.4 Tornado: 4.5.3 ZMQ: 4.3.1 System Versions: dist: alpine 3.14.2 locale: UTF-8 machine: x86_64 release: 5.4.72-microsoft-standard-WSL2 system: Linux version: Alpine Linux 3.14.2 ```Additional context N/A