Open robinzyb opened 6 months ago
contents of slurm file
#!/bin/bash
#!/bin/bash -l
# NOTE(review): only the first line of a script acts as the interpreter line,
# so if the `#!/bin/bash` above is really the first line of the submitted file,
# the `-l` (login shell) on the second shebang is an ordinary comment and has
# no effect -- confirm which of the two was intended.
#
# Slurm batch job "dpmd": one node, 12 tasks per node with 1 CPU each, a 24 h
# time limit, on the "normal" partition with the "gpu" constraint. Mail for
# all job events is sent to the address given in --mail-user.
#SBATCH --job-name="dpmd"
#SBATCH --account="s1123"
#SBATCH --mail-type=ALL
#SBATCH --mail-user=yongbin.zhuang@epfl.ch
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=12
#SBATCH --cpus-per-task=1
#SBATCH --partition=normal
#SBATCH --constraint=gpu
# Exit-on-error is enabled only for the environment setup below, so a failed
# module load / activation stops the job before any task runs.
set -e
module load daint-gpu
# One OpenMP thread per CPU requested with --cpus-per-task above.
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# NOTE(review): presumably enables CUDA MPS so several tasks can share the
# GPU -- confirm against the site documentation.
export CRAY_CUDA_MPS=1
# NOTE(review): presumably read by DeePMD-kit/TensorFlow to size the intra-
# and inter-op thread pools (one thread each here) -- confirm.
export TF_INTRA_OP_PARALLELISM_THREADS=1
export TF_INTER_OP_PARALLELISM_THREADS=1
# Expose only CUDA device index 0 to the commands started below.
export CUDA_VISIBLE_DEVICES=0
# Remove the stack-size limit for this shell and its children.
ulimit -s unlimited
# ~/.bashrc is sourced before `conda activate`; presumably it provides the
# conda shell hook -- TODO confirm.
source ~/.bashrc
conda activate deepmd
# Setup finished: exit-on-error is switched off again; the task sections that
# follow inspect $? themselves.
set +e
# Training task: run `dp train` in the task directory unless the
# dp-train.checkpoint marker shows that it already completed.
pushd /scratch/snx3000/zyongbin/05.CLL_v5/bivo4-metad/iters-001/train-deepmd/tasks/001 || exit 1
if [ ! -f dp-train.checkpoint ]; then
  ################################################################################
  # Resume with --restart when model.ckpt.index exists; otherwise start fresh.
  if [ -f model.ckpt.index ]; then
    dp train input.json --restart model.ckpt
  else
    dp train input.json
  fi
  ################################################################################
  # Propagate a training failure as the job's exit status.
  __EXITCODE__=$?
  if [ $__EXITCODE__ -ne 0 ]; then
    exit $__EXITCODE__
  fi
  touch dp-train.checkpoint # create checkpoint on success
else
  echo 'hit dp-train.checkpoint, skip'
fi
popd
# Freeze/compress task: freeze the model to original_model.pb, then compress
# it into frozen_model.pb. Compression only runs when freezing succeeded.
pushd /scratch/snx3000/zyongbin/05.CLL_v5/bivo4-metad/iters-001/train-deepmd/tasks/001 || exit 1
################################################################################
dp freeze -o original_model.pb
__EXITCODE__=$?
if [ $__EXITCODE__ -eq 0 ]; then
  dp compress -i original_model.pb -o frozen_model.pb
  __EXITCODE__=$?
fi
################################################################################
# Whichever step failed first determines the job's exit status.
if [ $__EXITCODE__ -ne 0 ]; then
  exit $__EXITCODE__
fi
popd
# Success marker: store the Slurm job id in the .sbatch.success file in the
# current directory. Reached only when no earlier step exited with an error.
# Written once (the second identical write was redundant) and quoted so the
# value is emitted verbatim without word splitting or globbing.
printf '%s\n' "${SLURM_JOB_ID}" > job-8kYKJRN1URkE4OikNSu9Y20zWx.sbatch.success
I see some similar issues:
Summary
The unexpected error `CUDA_ERROR_INVALID_DEVICE: invalid device ordinal` is raised by the `dp compress` command.
DeePMD-kit Version
2.2.9
Backend and its version
TensorFlow v2.9.0
Python Version, CUDA Version, GCC Version, LAMMPS Version, etc
GPU P100
Details
The error appears after running the `dp compress` command. The content of stdout is pasted below. Note that during the training and freezing steps, the GPU device is found without errors.