reframe-hpc / reframe

A powerful Python framework for writing and running portable regression tests and benchmarks for HPC systems.
https://reframe-hpc.readthedocs.org
BSD 3-Clause "New" or "Revised" License
220 stars 103 forks source link

auto-detection, use existing module if told to #2292

Open akesandgren opened 2 years ago

akesandgren commented 2 years ago

The auto-detection could perhaps be made to use an existing module on the remote system instead of bootstrapping itself. It would make the process a lot faster. A config option per site and/or partition would suffice.

vkarak commented 1 year ago

Related to #2690.

akesandgren commented 1 year ago

Something like this works (tested):

diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py
index d8c07c36..5d7be1c3 100644
--- a/reframe/frontend/autodetect.py
+++ b/reframe/frontend/autodetect.py
@@ -38,6 +38,22 @@ def _log_contents(filename):
                       f'--- {filename} ---')

+class _ake_reframe:
+    def __init__(self, prefix):
+        self._prefix = prefix
+        self._workdir = None
+
+    def __enter__(self):
+        self._workdir = os.path.abspath(
+            tempfile.mkdtemp(prefix='rfm.', dir=self._prefix)
+        )
+
+        return self._workdir
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        osext.rmtree(self._workdir)
+
+
 class _copy_reframe:
     def __init__(self, prefix):
         self._prefix = prefix
@@ -132,11 +148,46 @@ def _remote_detect(part):
         ]
         job.prepare(commands, env, trap_errors=True)

+    def _emit_module_script(job, env):
+        launcher_cmd = job.launcher.run_command(job)
+        commands = [
+            f'module load {module}',
+            f'{launcher_cmd} reframe --detect-host-topology=topo.json'
+        ]
+        job.prepare(commands, env, trap_errors=True)
+
     getlogger().info(
         f'Detecting topology of remote partition {part.fullname!r}: '
         f'this may take some time...'
     )
     topo_info = {}
+
+    module = runtime.runtime().get_option('general/0/reframe_module')
+    if module:
+        try:
+            prefix = runtime.runtime().get_option('general/0/remote_workdir')
+            with _ake_reframe(prefix) as dirname:
+                with osext.change_dir(dirname):
+                    job = Job.create(part.scheduler,
+                                     part.launcher_type(),
+                                     name='rfm-detect-job',
+                                     sched_access=part.access)
+                    _emit_module_script(job, [part.local_env])
+                    getlogger().debug('submitting detection script')
+                    _log_contents(job.script_filename)
+                    job.submit()
+                    job.wait()
+                    getlogger().debug('job finished')
+                    _log_contents(job.stdout)
+                    _log_contents(job.stderr)
+                    topo_info = json.loads(_contents('topo.json'))
+        except Exception as e:
+            getlogger().warning(f'failed to retrieve remote processor info using ReFrame module: {e}')
+            getlogger().debug(traceback.format_exc())
+    else:
+        getlogger().warning(f'reframe module not set')
+
+
     try:
         prefix = runtime.runtime().get_option('general/0/remote_workdir')
         with _copy_reframe(prefix) as dirname:
diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json
index d9108b36..b735b448 100644
--- a/reframe/schemas/config.json
+++ b/reframe/schemas/config.json
@@ -486,6 +486,7 @@
                     "perf_info_level": {"$ref": "#/defs/loglevel"},
                     "pipeline_timeout": {"type": ["number", "null"]},
                     "purge_environment": {"type": "boolean"},
+                    "reframe_module": {"type": "string"},
                     "remote_detect": {"type": "boolean"},
                     "remote_workdir": {"type": "string"},
                     "report_file": {"type": "string"},

But it should probably take a list of strings for the reframe_module. One could also add a "reframe_already_in_path" boolean, which would just run reframe, assuming that the version it picks up is the correct one. Responsibility for that would of course be on the user side :-)