radical-cybertools / radical.pilot

RADICAL-Pilot
http://radical-cybertools.github.io/radical-pilot/index.html
Other
54 stars 23 forks source link

Flux environment setup failure #3183

Closed mtitov closed 4 months ago

mtitov commented 6 months ago

Error

$ cat task.000000.err 
/lustre/orion/scratch/matitov/chm155/flux/radical.pilot.sandbox/rp.session.frontier07830.matitov.019859.0001//pilot.0000//gtod: error while loading shared libraries: libfabric.so.1: cannot open shared object file: No such file or directory
/lustre/orion/scratch/matitov/chm155/flux/radical.pilot.sandbox/rp.session.frontier07830.matitov.019859.0001//pilot.0000//gtod: error while loading shared libraries: libfabric.so.1: cannot open shared object file: No such file or directory
/lustre/orion/scratch/matitov/chm155/flux/radical.pilot.sandbox/rp.session.frontier07830.matitov.019859.0001/pilot.0000/task.000000//task.000000.exec.sh: line 46: module: command not found
pre_exec failed

Task exec script

#!/bin/sh

# ------------------------------------------------------------------------------

# -- identifiers of this task and of the pilot / session / resource it runs in
export RP_TASK_ID='task.000000'
export RP_TASK_NAME='task.000000'
export RP_PILOT_ID='pilot.0000'
export RP_SESSION_ID='rp.session.frontier07830.matitov.019859.0001'
export RP_RESOURCE='ornl.frontier'

# -- sandbox hierarchy: resource > session > pilot > task
# Each level is built from the previous one, so the order of these lines
# matters.  The session and pilot values end in "/" and the next level adds
# another "/", which is why derived paths contain "//".
export RP_RESOURCE_SANDBOX='/lustre/orion/scratch/matitov/chm155/flux/radical.pilot.sandbox'
export RP_SESSION_SANDBOX="${RP_RESOURCE_SANDBOX}/${RP_SESSION_ID}/"
export RP_PILOT_SANDBOX="${RP_SESSION_SANDBOX}/pilot.0000/"
export RP_TASK_SANDBOX="${RP_PILOT_SANDBOX}/task.000000"

# -- registry address and per-rank resource counts
export RP_REGISTRY_ADDRESS='tcp://10.128.122.108:10002'
export RP_CORES_PER_RANK=1
export RP_GPUS_PER_RANK=1

# -- paths below the pilot sandbox: the gtod and prof helpers, and the
#    .prof file inside this task's directory
export RP_GTOD="${RP_PILOT_SANDBOX}/gtod"
export RP_PROF="${RP_PILOT_SANDBOX}/prof"
export RP_PROF_TGT="${RP_PILOT_SANDBOX}/task.000000/task.000000.prof"

rp_error() {
    # Abort the task script: write "<label> failed" to stderr, then exit
    # with status 1.
    #   $1 - label of the step that failed (the pre-exec lines pass "pre_exec")
    >&2 echo "$1 failed"
    exit 1
}

# ------------------------------------------------------------------------------
# rank ID
# Number of ranks of this task, and this process' own rank as given by
# FLUX_TASK_RANK (empty if that variable is not set).
RP_RANKS=1
RP_RANK=${FLUX_TASK_RANK}
export RP_RANKS RP_RANK

rp_sync_ranks() {
    # File-based rendezvous between the ranks of this task.
    #
    # Appends this rank's ID ($RP_RANK) as one line to "<sig>.sig", then polls
    # once per second until that file holds at least $RP_RANKS lines.
    #
    #   $1       - path prefix of the signal file (".sig" is appended)
    #   Globals  - reads RP_RANK and RP_RANKS; sets `sig` (plain sh has no
    #              function-local variables)
    #   Returns  - 0 once enough lines are present; 1 if the signal file
    #              cannot be written (waiting would never end in that case)
    sig=$1

    # All expansions are quoted so a prefix containing whitespace still names
    # a single file instead of being word-split.
    echo "$RP_RANK" >> "$sig.sig" || return 1

    # The command substitution is left unquoted on purpose: some `wc`
    # implementations pad the count with blanks, and word splitting strips
    # that padding before the numeric comparison.
    while test $(wc -l < "$sig.sig") -lt "$RP_RANKS"; do
        sleep 1
    done
}

# ------------------------------------------------------------------------------
# Record the start of the script through the profiling helper.  The helper
# path is quoted so that a sandbox path containing whitespace is still run as
# one command word.
"$RP_PROF" exec_start ""

# ------------------------------------------------------------------------------
# pre-exec commands
#
# Each command aborts the task through rp_error ("pre_exec failed", exit 1)
# as soon as it fails, so later steps never run in a half-configured
# environment.
"$RP_PROF" exec_pre ""

# Site profile first, then the module environment.
# NOTE(review): the stderr log in this report shows "module: command not
# found" at what appears to be the `module reset` line, i.e. sourcing the
# profile did not define `module` in that run - confirm which init file
# provides it for a non-interactive /bin/sh.
. /sw/frontier/init/profile || rp_error pre_exec
module reset || rp_error pre_exec
module load PrgEnv-gnu || rp_error pre_exec
module load rocm/6.0.0 || rp_error pre_exec

export TF_FORCE_GPU_ALLOW_GROWTH=true || rp_error pre_exec

# MIOpen cache directory inside the pilot sandbox; created if missing.
export MIOPEN_USER_DB_PATH="$RP_PILOT_SANDBOX/miopen-cache" || rp_error pre_exec
export MIOPEN_CUSTOM_CACHE_DIR="${MIOPEN_USER_DB_PATH}" || rp_error pre_exec
mkdir -p "${MIOPEN_USER_DB_PATH}" || rp_error pre_exec

# `.` rather than `source`: the script runs under #!/bin/sh, where `source`
# is not guaranteed to exist, and the profile line above already uses `.`.
# NOTE(review): passing an argument (st_mpi_base) to a dot-script is itself a
# shell extension - works where /bin/sh is bash; confirm for other shells.
. /ccs/proj/chm155/IMPECCABLE/miniconda/bin/activate st_mpi_base || rp_error pre_exec

# Run the payload from the application directory.
cd /lustre/orion/scratch/matitov/chm155/flux/ST || rp_error pre_exec

# ------------------------------------------------------------------------------
# execute rank
"$RP_PROF" rank_start ""

# Start the payload in the background so its PID can be recorded.
python3 "smiles_regress_transformer_run_large.py" &

# $$ is the PID of this script, $! the PID of the background job just
# started.  $! must be read before any other background job is launched.
RP_EXEC_PID=$$
RP_RANK_PID=$!

# `wait <pid>` returns the exit status of that job; keep it so the profiling
# calls below cannot overwrite it.
wait "$RP_RANK_PID"
RP_RET=$?
"$RP_PROF" rank_stop "RP_EXEC_PID=$RP_EXEC_PID:RP_RANK_PID=$RP_RANK_PID"

# ------------------------------------------------------------------------------
# post-exec commands
"$RP_PROF" exec_post ""

# ------------------------------------------------------------------------------
"$RP_PROF" exec_stop ""

# The script's exit status is the payload's exit status.
exit "$RP_RET"

# ------------------------------------------------------------------------------