open-mpi / ompi

Open MPI main development repository
https://www.open-mpi.org
Other
2.15k stars 859 forks source link

orcm OPAL_PREFIX and LD_LIBRARY_PATH conflicts with open-mpi #1146

Closed noahv closed 8 years ago

noahv commented 8 years ago

Description of problem

The OPAL_PREFIX and LD_LIBRARY_PATH settings required by ORCM conflict with Open MPI.

If we install orcm to the default location then none of the environment settings are needed.

If we install somewhere else then we will need to set up the environment so that orcm can find the needed libraries.

OPAL_PREFIX in particular breaks Open MPI runs, because Open MPI reads the same variable to locate its own installation.

Version-Release number of selected component (if applicable)

openrcm-0.13.0-1.src.rpm

Steps to Reproduce

wget https://www.open-mpi.org/software/orcm/v0/downloads/openrcm-0.13.0-1.src.rpm 
rpmbuild --rebuild openrcm-0.13.0-1.src.rpm
# To install rpm and to relocate the binaries to a different folder add the --relocate switch.
rpm -ivh openrcm-0.13.0-1.x86_64.rpm --relocate /opt/openrcm=<new_location>

The following environment variables will need to be set to the new location:

export OPAL_PREFIX=<new_location>
export OPAL_LIBDIR=<new_location>/lib64
export OPAL_DATADIR=<new_location>/share
export LD_LIBRARY_PATH=<new_location>/lib64:$LD_LIBRARY_PATH
export PATH=<new_location>/bin:$PATH
rhc54 commented 8 years ago

@noahv In production, how are you setting the OPAL_foo values? Are you using lmod or some other method? Or is the user expected to set those for themselves?

noahv commented 8 years ago

Hi Ralph,

We don't set the OPAL_foo values at all. The ELF RUNPATH entry is set at compile time to point to the library install path:

master3:/opt/openrcm/bin # readelf -d orcmd 

Dynamic section at offset 0x5d88 contains 34 entries:
  Tag        Type                         Name/Value
 0x0000000000000001 (NEEDED)             Shared library: [liborcm.so.0]
 0x0000000000000001 (NEEDED)             Shared library: [liborcmopen-rte.so.0]
 0x0000000000000001 (NEEDED)             Shared library: [liborcmopen-pal.so.0]
 0x0000000000000001 (NEEDED)             Shared library: [libdl.so.2]
 0x0000000000000001 (NEEDED)             Shared library: [librt.so.1]
 0x0000000000000001 (NEEDED)             Shared library: [libm.so.6]
 0x0000000000000001 (NEEDED)             Shared library: [libutil.so.1]
 0x0000000000000001 (NEEDED)             Shared library: [libpthread.so.0]
 0x0000000000000001 (NEEDED)             Shared library: [libc.so.6]
 0x000000000000001d (RUNPATH)            Library runpath: [/opt/openrcm/lib64]

If relocation of the orcm install is needed, you can re-configure with a different --prefix= option.

rhc54 commented 8 years ago

I believe I have this fixed - what do you think of the following change:

diff --git a/orcm/runtime/orcm_init.c b/orcm/runtime/orcm_init.c
index 7491633..c3fbfdf 100644
--- a/orcm/runtime/orcm_init.c
+++ b/orcm/runtime/orcm_init.c
@@ -1,10 +1,10 @@
 /*
- * Copyright (c) 2009-2011 Cisco Systems, Inc.  All rights reserved. 
- * Copyright (c) 2013-2014 Intel, Inc.  All rights reserved. 
+ * Copyright (c) 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013-2015 Intel, Inc.  All rights reserved.
  * $COPYRIGHT$
- * 
+ *
  * Additional copyrights may follow
- * 
+ *
  * $HEADER$
  */

@@ -17,6 +17,7 @@
 #endif

 #include "opal/util/error.h"
+#include "opal/util/opal_environ.h"
 #include "opal/util/output.h"
 #include "opal/util/show_help.h"
 #include "opal/runtime/opal.h"
@@ -55,7 +56,7 @@ const char orcm_version_string[] = ORCM_IDENT_STRING;
 int orcm_init(orcm_proc_type_t flags)
 {
     int ret;
-    char *error;
+    char *error, *envar;
     int spin;
     opal_output_stream_t lds;

@@ -76,13 +77,35 @@ int orcm_init(orcm_proc_type_t flags)
             };
         }
     }
-    
+
+    /* prior to initializing the OPAL layer, check to see
+     * if the OPAL (and friends) install location has been
+     * moved. In order to avoid conflicts with any other
+     * OPAL-using software, the relocation point will have
+     * been expressed as a set of "ORCM_foo" envars. We
+     * therefore check for the ORCM_foo values, and name-shift
+     * any we find to OPAL_foo so that OPAL will find them.
+     * Since all ORCM tools will have already copied their
+     * local environment, these name-shifted vars will not
+     * appear in the environment of any launched processes */
+    if (NULL != (envar = getenv("ORCM_PREFIX"))) {
+        opal_unsetenv("ORCM_PREFIX", &environ);
+        opal_setenv("OPAL_PREFIX", envar, true, &environ);
+    }
+    if (NULL != (envar = getenv("ORCM_LIBDIR"))) {
+        opal_unsetenv("ORCM_LIBDIR", &environ);
+        opal_setenv("OPAL_LIBDIR", envar, true, &environ);
+    }
+    if (NULL != (envar = getenv("ORCM_DATADIR"))) {
+        opal_unsetenv("ORCM_DATADIR", &environ);
+        opal_setenv("OPAL_DATADIR", envar, true, &environ);
+    }
     /* initialize the opal layer */
     if (ORTE_SUCCESS != (ret = opal_init(NULL, NULL))) {
         error = "opal_init";
         goto error;
     }
-    
+
     orcm_debug_verbosity = -1;
     (void) mca_base_var_register ("orcm", "orcm", NULL, "debug_verbose",
                                   "Verbosity level for ORCM debug messages (default: 1)",
@@ -107,7 +130,7 @@ int orcm_init(orcm_proc_type_t flags)
         error = "orte_locks_init";
         goto error;
     }
-    
+
     /* register handler for errnum -> string conversion */
     opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

@@ -126,7 +149,7 @@ int orcm_init(orcm_proc_type_t flags)
         error = "register attr print";
         goto error;
     }
-    
+
     /* we don't need a progress thread as all our tools loop inside themselves,
      * so define orte_event_base to be the base opal_event_base
      */
@@ -179,7 +202,7 @@ int orcm_init(orcm_proc_type_t flags)
         error = "orte_init";
         goto error;
     }
-    
+
     /* setup the orte_show_help system - don't do this until the
      * end as otherwise show_help messages won't appear
      */
@@ -193,7 +216,7 @@ int orcm_init(orcm_proc_type_t flags)
         error = "orcm_dt_init";
         goto error;
     }
-    
+
     /* flag that orte is initialized so things can work */
     orte_initialized = true;
     orte_help_want_aggregate = false;
@@ -206,6 +229,6 @@ int orcm_init(orcm_proc_type_t flags)
                        "orcm_init:startup:internal-failure",
                        true, error, ORTE_ERROR_NAME(ret), ret);
     }
-    
+
     return ret;
 }
noahv commented 8 years ago

Looks great. Thanks! -- Noah

rhc54 commented 8 years ago

Kewl - committed it to the new ORCM repo at https://github.com/orcmteam/orcm