microsoft / Megatron-DeepSpeed

Ongoing research training transformer language models at scale, including: BERT & GPT-2

(Multi node train) #126
172.16.220.18: bash: line 0: cd: /data/asc23/src_zero3: No such file or directory
172.16.220.18: bash: /home/lxw/anaconda3/envs/sys_py37/bin/python: No such file or directory
pdsh@worker-10: 172.16.220.18: ssh exited with exit code 127

Open Traveller2001 opened 1 year ago

Traveller2001 commented 1 year ago

When I launch DeepSpeed in multi-node, multi-GPU mode, why does the remote machine try to use the local machine's script path and Python path? How do I launch multi-node, multi-GPU training correctly? Here are my launch script and hostfile configuration.

script.sh

#! /bin/bash
set -e
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=eth1
# Change for multinode config
logname=$(date +'%Y-%m-%d_%H:%M:%S')
if [ -n "$1" ]; then
  logname=$1
fi

MP_SIZE=4
NUM_WORKERS=2
NUM_GPUS_PER_WORKER=4
HIDDEN_SIZE=3072
NUM_ATTN_HEADS=24
NUM_LAYERS=40
BATCHSIZE=8
DATA_PATH=$(cat data_augment.txt)
VOCAB_PATH=vocab.txt
MERGE_PATH=none.txt
CHECKPOINT_PATH=./checkpoints/${logname}

script_path=$(realpath $0)
script_dir=$(dirname $script_path)
config_json="$script_dir/zqy.json"
#config_json="wyp.json"
# offloads to NVMe
#config_json="$script_dir/ds_zero_stage_infinity_config.json"

#ZeRO Configs
stage=3
reduce_scatter=true
contigious_gradients=true
rbs=50000000
agbs=5000000000

#Activation Checkpointing and Contigious Memory
chkp_layers=1
PA=true
PA_CPU=true
CC=true
SYNCHRONIZE=true
PROFILE=false

# TiledLinear splits, 0 is disable
TILED_LINEAR="true"
TILE_DIM=1

# Megatron Model Parallelism
LOGDIR="tensorboard/failed/${logname}"

#--cpu-optimizer
#--save $CHECKPOINT_PATH \
#        --load $CHECKPOINT_PATH \
gpt_options=" \
        --save $CHECKPOINT_PATH \
        --load $CHECKPOINT_PATH \
        --model-parallel-size ${MP_SIZE} \
        --num-layers $NUM_LAYERS \
        --hidden-size $HIDDEN_SIZE \
        --num-attention-heads ${NUM_ATTN_HEADS} \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --batch-size $BATCHSIZE \
        --train-iters 12000 \
        --train-tokens 1000000000 \
        --data-path $DATA_PATH \
        --vocab-file $VOCAB_PATH \
        --merge-file $MERGE_PATH \
        --data-impl mmap \
        --split 98,0,2 \
        --tokenizer-type EncDecTokenizer \
        --distributed-backend nccl \
        --lr 1.5e-4 \
        --lr-decay-style cosine \
        --min-lr 1.0e-5 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
        --warmup 0.01 \
        --checkpoint-activations \
        --log-interval 1 \
        --save-interval 10000 \
        --eval-interval 2000 \
        --eval-iters 10 \
        --fp16 \
        --scattered-embeddings \
        --split-transformers \
        --tensorboard-dir ${LOGDIR}
"
deepspeed_options=" \
                --deepspeed \
                --deepspeed_config ${config_json} \
                --zero-stage ${stage} \
                --zero-reduce-bucket-size ${rbs} \
                --zero-allgather-bucket-size ${agbs} \
                --remote-device cpu"

if [ "${contigious_gradients}" = "true" ]; then
  deepspeed_options="${deepspeed_options} \
                --zero-contigious-gradients"
fi

if [ "${reduce_scatter}" = "true" ]; then
  deepspeed_options="${deepspeed_options} \
                --zero-reduce-scatter"
fi

chkp_opt=" \
--deepspeed-activation-checkpointing \
--checkpoint-num-layers ${chkp_layers}"

if [ "${PA}" = "true" ]; then
  chkp_opt="${chkp_opt} --partition-activations"
fi

if [ "${PA_CPU}" = "true" ]; then
  chkp_opt="${chkp_opt} \
        --checkpoint-in-cpu"
fi

if [ "${SYNCHRONIZE}" = "true" ]; then
  chkp_opt="${chkp_opt} \
        --synchronize-each-layer"
fi

if [ "${CC}" = "true" ]; then
  chkp_opt="${chkp_opt} \
        --contigious-checkpointing"
fi

if [ "${PROFILE}" = "true" ]; then
  chkp_opt="${chkp_opt} \
        --profile-backward"
fi

if [ "${TILED_LINEAR}" = "true" ]; then
  tile_opt="${tile_opt} \
        --memory-centric-tiled-linear \
        --tile-factor=${TILE_DIM}"
fi

full_options="${gpt_options} ${deepspeed_options} ${chkp_opt} ${tile_opt}"

run_cmd="deepspeed --hostfile=hostfile.txt --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --master_addr 172.16.220.21 pretrain_gpt2.py  ${full_options}  |tee log/failed${logname}.log"

#if [ $(cat wlog/$(log_date).log | tail -1) == "end" ]; then
#  rm wlog/$(log_date).log
#fi

echo ${run_cmd}
eval ${run_cmd}
mv log/failed${logname}.log log/${logname}.log
mv ${LOGDIR} tensorboard/${logname}
set +x
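A side note on the `NCCL_*` exports at the top of the script: with the pdsh-based multi-node launcher, variables exported in the launch shell are not automatically forwarded to the other nodes. DeepSpeed supports a `.deepspeed_env` file (one `VAR=VALUE` per line) whose contents are re-exported on every node; a minimal sketch, assuming the default pdsh launcher:

```shell
# Sketch: exports like NCCL_IB_DISABLE / NCCL_SOCKET_IFNAME from the launch
# shell do not reach remote nodes under pdsh. DeepSpeed reads an optional
# ~/.deepspeed_env file (VAR=VALUE per line) and re-exports those variables
# on each node before starting the workers.
cat > "$HOME/.deepspeed_env" <<'EOF'
NCCL_IB_DISABLE=1
NCCL_SOCKET_IFNAME=eth1
EOF
```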

hostfile.txt

lxw@172.16.220.21 slots=4
lxw@172.16.220.18 slots=4
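For context, the errors in the title ("No such file or directory" for both the working directory and the conda python) are consistent with the launcher replaying the local `cd <cwd> && <local python> ...` command verbatim on every host in the hostfile, so the code directory and interpreter must exist at identical absolute paths on all nodes. A minimal sketch that extracts the hosts and prints the per-node checks to run (paths are taken from the error log above):

```shell
# Sketch: list the check to run on each host in a DeepSpeed hostfile
# ("host slots=N" per line). Each printed ssh command verifies that the
# training directory and the conda interpreter exist at the same absolute
# path as on the launch node; run the printed lines manually.
hostfile="hostfile.txt"
printf 'lxw@172.16.220.21 slots=4\nlxw@172.16.220.18 slots=4\n' > "$hostfile"
for h in $(awk '{print $1}' "$hostfile"); do
  echo "ssh $h 'test -d /data/asc23/src_zero3 && test -x /home/lxw/anaconda3/envs/sys_py37/bin/python && echo OK'"
done
```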

KiaDavari commented 1 year ago

@Traveller2001, I have the same issue. Did you solve this? I’ve been working on it for days but have not found any workaround. I'd appreciate any help.

wangjiahe0915 commented 8 months ago

Hi, have you solved the problem? I'm running into the same problem. Thank you!