iWangTing opened 3 months ago
The script is as follows:

```bash
#!/usr/bin/env bash
# Guide:
# This script supports distributed training across multiple GPU workers (as well as single-worker training).
# Please set the options below according to the comments.
# For multi-worker training, these options must be set manually on each worker.
# After setting the options, run the script on each worker.
# Command: bash run_scripts/muge_finetune_vit-b-16_rbt-base.sh ${DATAPATH}
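# For example, in a hypothetical 2-worker setup: worker 0 keeps the values
# below unchanged, while worker 1 sets RANK=1 and points MASTER_ADDR at
# worker 0's address; GPUS_PER_NODE, WORKER_CNT and MASTER_PORT must be
# identical on every worker.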
GPUS_PER_NODE=1
WORKER_CNT=1
export MASTER_ADDR="localhost"
export MASTER_PORT=8514
export RANK=0
export PYTHONPATH=${PYTHONPATH}:`pwd`/cn_clip/
DATAPATH="/home/amax/sdb1/lxl2/B-data"
# Paths of the LMDB-format training and validation sets (holding the LMDB-format images and image-text pair data)
train_data=${DATAPATH}/datasets/Bdata/lmdb/train
val_data=${DATAPATH}/datasets/Bdata/lmdb/valid # if val_data is not specified, validation will be automatically disabled
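# (Assumption: each split directory contains the imgs/ and pairs/ LMDB
# environments as produced by Chinese-CLIP's build_lmdb_dataset.py.)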
# restore options
resume=${DATAPATH}/pretrained_weights/clip_cn_vit-b-16.pt # or specify your own custom ckpt path to resume from
reset_data_offset="--reset-data-offset"
reset_optimizer="--reset-optimizer"
# reset_optimizer=""
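# (Roughly: --reset-data-offset restarts data reading from the beginning
# instead of the checkpoint's saved offset, and --reset-optimizer drops the
# optimizer state stored in the checkpoint; both are the usual choice when
# finetuning from a pretrained ckpt rather than resuming an interrupted run.)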
# Output settings
output_base_dir=${DATAPATH}/experiments/
name="B_finetune_vit-b-16_roberta-base" # finetune hyperparameters, logs and ckpts will be saved under ${DATAPATH}/experiments/B_finetune_vit-b-16_roberta-base/
save_step_frequency=999999 # disable it
save_epoch_frequency=1 # save one finetune ckpt per epoch
log_interval=1 # number of steps between log prints
report_training_batch_acc="--report-training-batch-acc" # during training, report the in-batch accuracy of training batches
# Training hyperparameters
context_length=52 # sequence length; set to Chinese-CLIP's default of 52
warmup=100 # number of warmup steps
batch_size=32 # per-GPU training batch size
valid_batch_size=32 # per-GPU validation batch size
lr=5e-5 # learning rate; the contrastive batch size used here is small, so the learning rate is lowered accordingly
accum_freq=1
wd=0.001
max_epochs=100
valid_step_interval=999999
valid_epoch_interval=1
vision_model="ViT-B-16"
text_model="RoBERTa-wwm-ext-base-chinese"
use_augment="--use-augment"
torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} sdb1/lxl2/Chinese-CLIP-master/cn_clip/training/main.py \
--train-data=${train_data} \
--val-data=${val_data} \
--resume=${resume} \
${reset_data_offset} \
${reset_optimizer} \
--logs=${output_base_dir} \
--name=${name} \
--save-step-frequency=${save_step_frequency} \
--save-epoch-frequency=${save_epoch_frequency} \
--log-interval=${log_interval} \
${report_training_batch_acc} \
--context-length=${context_length} \
--warmup=${warmup} \
--batch-size=${batch_size} \
--valid-batch-size=${valid_batch_size} \
--valid-step-interval=${valid_step_interval} \
--valid-epoch-interval=${valid_epoch_interval} \
--lr=${lr} \
--accum-freq=${accum_freq} \
--wd=${wd} \
--max-epochs=${max_epochs} \
--vision-model=${vision_model} \
${use_augment} \
--text-model=${text_model} \
--grad-checkpointing
```
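For reference, a minimal pre-flight sketch (using only the paths hard-coded above) to confirm the data and checkpoint exist before launching:

```bash
# Sanity check: verify the LMDB splits and the --resume checkpoint exist.
DATAPATH="/home/amax/sdb1/lxl2/B-data"
for p in "${DATAPATH}/datasets/Bdata/lmdb/train" \
         "${DATAPATH}/datasets/Bdata/lmdb/valid" \
         "${DATAPATH}/pretrained_weights/clip_cn_vit-b-16.pt"; do
    [ -e "$p" ] || echo "missing: $p"
done
```

Note that main.py is invoked via the relative path sdb1/lxl2/Chinese-CLIP-master/cn_clip/training/main.py, so the script only resolves it when run from the directory containing sdb1/ (presumably /home/amax, given DATAPATH).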