Closed magicwang1111 closed 1 year ago
AI分析结论
机器V100 (visualGLM) root@iZbp1ewp3ew1qt4u8bdh0iZ:~/ai/VisualGLM-6B# nvidia-smi Tue May 23 17:26:24 2023 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 515.65.01 Driver Version: 515.65.01 CUDA Version: 11.7 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:00:07.0 Off | 0 | | N/A 37C P0 41W / 300W | 0MiB / 32768MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
AI修改后的脚本,依然无法运行 bash
# Launcher for LoRA fine-tuning of VisualGLM-6B via deepspeed.
# Usage: bash finetune/finetune_visualglm.sh [SAT_HOME_DIR]
#   $1 (optional) - directory used as SAT_HOME for cached SAT model weights.
NUM_WORKERS=1
NUM_GPUS_PER_WORKER=1
MP_SIZE=1

# Resolve the repo layout relative to this script's real location.
script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")
main_dir=$(dirname "$script_dir")

MODEL_TYPE="visualglm-6b"
# NOTE: every trailing backslash must be preceded by a space; the previous
# version had "--lora_rank 10\" which glued "10" onto the next token.
MODEL_ARGS="--max_source_length 64 \
    --max_target_length 256 \
    --lora_rank 10 \
    --pre_seq_len 4"

OPTIONS_SAT="SAT_HOME=$1"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
# FIX: the first assignment was missing its closing quote, which made the
# shell swallow the following line. The second assignment intentionally
# overrides the first so a single-node hostfile is used.
HOST_FILE_PATH="hostfile"
HOST_FILE_PATH="hostfile_single"

train_data="./fewshot-data/dataset.json"
eval_data="./fewshot-data/dataset.json"

# FIX: the broken version embedded literal "\n" sequences inside the quoted
# string; deepspeed then received them as argument text and failed with
# "unrecognized arguments". Inside double quotes, backslash-newline is a
# plain line continuation and produces a single space-separated string.
gpt_options=" \
    --experiment-name finetune-$MODEL_TYPE \
    --model-parallel-size ${MP_SIZE} \
    --mode finetune \
    --train-iters 300 \
    --resume-dataloader \
    $MODEL_ARGS \
    --train-data ${train_data} \
    --valid-data ${eval_data} \
    --distributed-backend nccl \
    --lr-decay-style cosine \
    --warmup .02 \
    --checkpoint-activations \
    --save-interval 300 \
    --eval-interval 10000 \
    --save ./checkpoints \
    --split 1 \
    --eval-iters 10 \
    --eval-batch-size 8 \
    --zero-stage 1 \
    --lr 0.0001 \
    --batch-size 20 \
    --skip-init \
    --fp16 \
    --use_lora \
"

run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_visualglm.py ${gpt_options}"
# Print the fully-expanded command before executing it, for debugging.
echo "${run_cmd}"
eval "${run_cmd}"

set +x
你是在windows运行的吗?为什么会有\r呢
你是在windows运行的吗?为什么会有\r呢
V100 linux上运行
(visualGLM) root@iZbp1ewp3ew1qt4u8bdh0iZ:~/ai/VisualGLM-6B# ssh -V OpenSSH_8.9p1 Ubuntu-3ubuntu0.1, OpenSSL 3.0.2 15 Mar 2022
你是在windows运行的吗?为什么会有\r呢
原脚本VisualGLM-6B/finetune/finetune_visualglm.sh
NUM_WORKERS=1 NUM_GPUS_PER_WORKER=1 MP_SIZE=1
script_path=$(realpath $0) script_dir=$(dirname $script_path) main_dir=$(dirname $script_dir) MODEL_TYPE="visualglm-6b" MODEL_ARGS="--max_source_length 64 \ --max_target_length 256 \ --lora_rank 10\ --pre_seq_len 4"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" HOST_FILE_PATH="hostfile" HOST_FILE_PATH="hostfile_single"
train_data="./fewshot-data/dataset.json" eval_data="./fewshot-data/dataset.json"
gpt_options=" \ --experiment-name finetune-$MODEL_TYPE \ --model-parallel-size ${MP_SIZE} \ --mode finetune \ --train-iters 300 \ --resume-dataloader \ $MODEL_ARGS \ --train-data ${train_data} \ --valid-data ${eval_data} \ --distributed-backend nccl \ --lr-decay-style cosine \ --warmup .02 \ --checkpoint-activations \ --save-interval 300 \ --eval-interval 10000 \ --save "./checkpoints" \ --split 1 \ --eval-iters 10 \ --eval-batch-size 8 \ --zero-stage 1 \ --lr 0.0001 \ --batch-size 20 \ --skip-init \ --fp16 \ --use_lora "
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_visualglm.py ${gpt_options}" echo ${run_cmd} eval ${run_cmd}
set +x
你是在windows运行的吗?为什么会有\r呢
原脚本VisualGLM-6B/finetune/finetune_visualglm.sh
#!/bin/bash NUM_WORKERS=1 NUM_GPUS_PER_WORKER=1 MP_SIZE=1
script_path=$(realpath $0) script_dir=$(dirname $script_path) main_dir=$(dirname $script_dir) MODEL_TYPE="visualglm-6b" MODEL_ARGS="--max_source_length 64 --max_target_length 256 --lora_rank 10 --pre_seq_len 4"
OPTIONS_SAT="SAT_HOME=$1" #"SAT_HOME=/raid/dm/sat_models"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" HOST_FILE_PATH="hostfile" HOST_FILE_PATH="hostfile_single"
train_data="./fewshot-data/dataset.json" eval_data="./fewshot-data/dataset.json"
gpt_options=" --experiment-name finetune-$MODEL_TYPE --model-parallel-size ${MP_SIZE} --mode finetune --train-iters 300 --resume-dataloader $MODEL_ARGS --train-data ${train_data} --valid-data ${eval_data} --distributed-backend nccl --lr-decay-style cosine --warmup .02 --checkpoint-activations --save-interval 300 --eval-interval 10000 --save "./checkpoints" --split 1 --eval-iters 10 --eval-batch-size 8 --zero-stage 1 --lr 0.0001 --batch-size 20 --skip-init --fp16 --use_lora "
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_visualglm.py ${gpt_options}" echo ${run_cmd} eval ${run_cmd}
set +x
@magicwang1111 在linux中使用dos2unix命令应该可以修复脚本
你是在windows运行的吗?为什么会有\r呢
原脚本VisualGLM-6B/finetune/finetune_visualglm.sh
#!/bin/bash NUM_WORKERS=1 NUM_GPUS_PER_WORKER=1 MP_SIZE=1
script_path=$(realpath $0) script_dir=$(dirname $script_path) main_dir=$(dirname $script_dir) MODEL_TYPE="visualglm-6b" MODEL_ARGS="--max_source_length 64 --max_target_length 256 --lora_rank 10 --pre_seq_len 4"
OPTIONS_SAT="SAT_HOME=$1" #"SAT_HOME=/raid/dm/sat_models"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" HOST_FILE_PATH="hostfile" HOST_FILE_PATH="hostfile_single" train_data="./fewshot-data/dataset.json" eval_data="./fewshot-data/dataset.json" gpt_options=" --experiment-name finetune-$MODEL_TYPE --model-parallel-size ${MP_SIZE} --mode finetune --train-iters 300 --resume-dataloader $MODEL_ARGS --train-data ${train_data} --valid-data ${eval_data} --distributed-backend nccl --lr-decay-style cosine --warmup .02 --checkpoint-activations --save-interval 300 --eval-interval 10000 --save "./checkpoints" --split 1 --eval-iters 10 --eval-batch-size 8 --zero-stage 1 --lr 0.0001 --batch-size 20 --skip-init --fp16 --use_lora " run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_visualglm.py ${gpt_options}" echo ${run_cmd} eval ${run_cmd} set +x
@magicwang1111 在linux中使用dos2unix命令应该可以修复脚本
谢谢已经通过该命令修复了
但是出现了新的问题
Traceback (most recent call last):
File "/root/ai/VisualGLM-6B/finetune_visualglm.py", line 188, in
看起来新的问题是显存不够导致的,可以尝试调小batch size
(visualGLM) root@iZbp1ewp3ew1qt4u8bdh0iZ:~/ai/VisualGLM-6B# bash finetune/finetune_visualglm.sh finetune/finetune_visualglm.sh: line 5: $'\r': command not found finetune/finetune_visualglm.sh: line 14: $'\r': command not found finetune/finetune_visualglm.sh: line 19: $'\r': command not found finetune/finetune_visualglm.sh: line 22: $'\r': command not found finetune/finetune_visualglm.sh: line 23: $'\r': command not found finetune/finetune_visualglm.sh: line 50: $'\r': command not found finetune/finetune_visualglm.sh: line 51: $'\r': command not found finetune/finetune_visualglm.sh: line 52: $'\r': command not found --use_lorat \20 \ 8 \\s \ \dataset.json hostfile_single [2023-05-23 17:22:18,395] [WARNING] [runner.py:191:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. [2023-05-23 17:22:18,412] [INFO] [runner.py:541:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --m --use_lorat 20 e 8 ns l /dataset.json--enable_each_rank_log=None finetune_visualglm.py [2023-05-23 17:22:21,237] [INFO] [launch.py:222:main] 0 NCCL_IB_DISABLE=0 [2023-05-23 17:22:21,237] [INFO] [launch.py:222:main] 0 NCCL_DEBUG=info [2023-05-23 17:22:21,237] [INFO] [launch.py:222:main] 0 NCCL_NET_GDR_LEVEL=2 [2023-05-23 17:22:21,237] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0]} [2023-05-23 17:22:21,237] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=1, node_rank=0 [2023-05-23 17:22:21,237] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]}) [2023-05-23 17:22:21,237] [INFO] [launch.py:247:main] dist_world_size=1 [2023-05-23 17:22:21,237] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0 usage: finetune_visualglm.py [-h] [--num-layers NUM_LAYERS] [--hidden-size HIDDEN_SIZE] [--num-attention-heads NUM_ATTENTION_HEADS] [--vocab-size VOCAB_SIZE] [--max-sequence-length MAX_SEQUENCE_LENGTH] [--layernorm-order {post,pre,sandwich}] 
[--inner-hidden-size INNER_HIDDEN_SIZE] [--hidden-size-per-attention-head HIDDEN_SIZE_PER_ATTENTION_HEAD] [--model-parallel-size MODEL_PARALLEL_SIZE] [--skip-init] [--use-gpu-initialization] [--layernorm-epsilon LAYERNORM_EPSILON] [--hidden-dropout HIDDEN_DROPOUT] [--attention-dropout ATTENTION_DROPOUT] [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] [--experiment-name EXPERIMENT_NAME] [--train-iters TRAIN_ITERS] [--batch-size BATCH_SIZE] [--lr LR] [--mode {pretrain,finetune,inference}] [--seed SEED] [--zero-stage {0,1,2}] [--checkpoint-activations] [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] [--fp16] [--bf16] [--gradient-accumulation-steps GRADIENT_ACCUMULATION_STEPS] [--epochs EPOCHS] [--log-interval LOG_INTERVAL] [--summary-dir SUMMARY_DIR] [--save-args] [--lr-decay-iters LR_DECAY_ITERS] [--lr-decay-style {constant,linear,cosine,exponential}] [--lr-decay-ratio LR_DECAY_RATIO] [--warmup WARMUP] [--weight-decay WEIGHT_DECAY] [--save SAVE] [--load LOAD] [--save-interval SAVE_INTERVAL] [--no-save-rng] [--no-load-rng] [--resume-dataloader] [--distributed-backend DISTRIBUTED_BACKEND] [--local_rank LOCAL_RANK] [--exit-interval EXIT_INTERVAL] [--eval-batch-size EVAL_BATCH_SIZE] [--eval-iters EVAL_ITERS] [--eval-interval EVAL_INTERVAL] [--strict-eval] [--train-data TRAIN_DATA [TRAIN_DATA ...]] [--train-data-weights TRAIN_DATA_WEIGHTS [TRAIN_DATA_WEIGHTS ...]] [--iterable-dataset] [--valid-data [VALID_DATA ...]] [--test-data [TEST_DATA ...]] [--split SPLIT] [--num-workers NUM_WORKERS] [--block-size BLOCK_SIZE] [--tokenizer-type TOKENIZER_TYPE] [--temperature TEMPERATURE] [--top_p TOP_P] [--top_k TOP_K] [--num-beams NUM_BEAMS] [--length-penalty LENGTH_PENALTY] [--no-repeat-ngram-size NO_REPEAT_NGRAM_SIZE] [--min-tgt-length MIN_TGT_LENGTH] [--out-seq-length OUT_SEQ_LENGTH] [--input-source INPUT_SOURCE] [--output-path OUTPUT_PATH] [--with-id] [--max-inference-batch-size MAX_INFERENCE_BATCH_SIZE] [--device DEVICE] [--deepspeed] [--deepspeed_config 
DEEPSPEED_CONFIG] [--deepscale] [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] --use_lorasualglm.py: error: unrecognized arguments: [2023-05-23 17:22:26,242] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 128438 [2023-05-23 17:22:26,243] [ERROR] [launch.py:434:sigkill_handler] ['/usr/bin/python3', '-u', 'finetune_visualglm.py', '--local_rank=0', '\r', '--experiment-name', 'finetune-visualglm-6b\r', '\r', '--model-parallel-size', '1\r', '\r', '--mode', 'finetune', '\r', '--train-iters', '300', '\r', '--resume-dataloader', '\r', '--max_source_length', '64', '\r', '--max_target_length', '256', '\r', '--lora_rank', '10\r', '--pre_seq_len', '4\r', '\r', '--train-data', './fewshot-data/dataset.json\r', '\r', '--valid-data', './fewshot-data/dataset.json\r', '\r', '--distributed-backend', 'nccl', '\r', '--lr-decay-style', 'cosine', '\r', '--warmup', '.02', '\r', '--checkpoint-activations', '\r', '--save-interval', '300', '\r', '--eval-interval', '10000', '\r', '--save', './checkpoints', '\r', '--split', '1', '\r', '--eval-iters', '10', '\r', '--eval-batch-size', '8', '\r', '--zero-stage', '1', '\r', '--lr', '0.0001', '\r', '--batch-size', '20', '\r', '--skip-init', '\r', '--fp16', '\r', '--use_lora\r', '\r\r\r'] exits with return code = 2 finetune/finetune_visualglm.sh: line 56: $'\r': command not found : invalid optione_visualglm.sh: line 57: set: + set: usage: set [-abefhkmnptuvxBCHP] [-o option-name] [--] [arg ...]