#!/usr/bin/env bash
# Guide:
# This script supports distributed training on multi-gpu workers (as well as single-worker training).
# Please set the options below according to the comments.
# For multi-gpu workers training, these options should be manually set for each worker.
# After setting the options, please run the script on each worker.
# Command: bash run_scripts/muge_finetune_vit-b-16_rbt-base.sh ${DATAPATH}

# Fail fast: abort on command errors, unset variables, and failures inside pipelines.
set -euo pipefail

# Number of GPUs per GPU worker
GPUS_PER_NODE=1
# Number of GPU workers, for single-worker training, please set to 1
WORKER_CNT=1
# The ip address of the rank-0 worker, for single-worker training, please set to localhost
export MASTER_ADDR=localhost
# The port for communication
export MASTER_PORT=8514
# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
export RANK=0

# Make the cn_clip package importable. ${PYTHONPATH:-} tolerates an unset
# PYTHONPATH under `set -u`; $(pwd) replaces the legacy backtick substitution.
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)/cn_clip/"

# Root directory containing datasets/, pretrained_weights/ and experiments/.
# Kept best-effort (warn instead of exit) to preserve the original tolerant
# behavior when the argument is omitted.
DATAPATH=${1:-}
if [[ -z "${DATAPATH}" ]]; then
  echo "warning: DATAPATH (first argument) is empty; data/ckpt paths will resolve relative to /" >&2
fi

# data options
# NOTE: the `**` is only expanded by the shell where these variables are used
# unquoted on the launch command line — the assignment itself does not glob.
train_data=${DATAPATH}/datasets/**/lmdb/train
val_data=${DATAPATH}/datasets/**/lmdb/valid # if val_data is not specified, the validation will be automatically disabled

# restore options
resume=${DATAPATH}/pretrained_weights/clip_cn_vit-b-16.pt # or specify your customed ckpt path to resume
reset_data_offset="--reset-data-offset"
reset_optimizer="--reset-optimizer"
# reset_optimizer=""

# output options
output_base_dir=${DATAPATH}/experiments/
name=muge_finetune_vit-b-16_roberta-base_bs128_1gpu_22
save_step_frequency=999999 # disable it
save_epoch_frequency=100
log_interval=1
report_training_batch_acc="--report-training-batch-acc"
# report_training_batch_acc=""

# training hyper-params
context_length=52
warmup=100
batch_size=150
valid_batch_size=150
accum_freq=1
lr=15e-5
wd=0.001
max_epochs=800 # or you can alternatively specify --max-steps
valid_step_interval=999999
valid_epoch_interval=999999
vision_model=ViT-B-16
text_model=RoBERTa-wwm-ext-base-chinese
use_augment="--use-augment"
# use_augment=""
# Launch one training process per GPU with torch.distributed.launch.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch
# releases in favor of `torchrun`; migrate when the pinned PyTorch allows it.
# Quoting policy:
#   * value-carrying options are quoted so paths survive spaces in ${DATAPATH};
#   * ${train_data}/${val_data} stay unquoted on purpose so the shell can
#     expand the `**` glob embedded in their values — TODO confirm main.py
#     does not expect to glob these paths itself;
#   * optional flags (${reset_data_offset}, ${reset_optimizer},
#     ${report_training_batch_acc}, ${use_augment}) stay unquoted so an
#     empty value expands to no argument instead of an empty string.
python3 -m torch.distributed.launch --use_env \
  --nproc_per_node="${GPUS_PER_NODE}" --nnodes="${WORKER_CNT}" --node_rank="${RANK}" \
  --master_addr="${MASTER_ADDR}" --master_port="${MASTER_PORT}" cn_clip/training/main.py \
  --train-data=${train_data} \
  --val-data=${val_data} \
  --resume="${resume}" \
  ${reset_data_offset} \
  ${reset_optimizer} \
  --logs="${output_base_dir}" \
  --name="${name}" \
  --save-step-frequency="${save_step_frequency}" \
  --save-epoch-frequency="${save_epoch_frequency}" \
  --log-interval="${log_interval}" \
  ${report_training_batch_acc} \
  --context-length="${context_length}" \
  --warmup="${warmup}" \
  --batch-size="${batch_size}" \
  --valid-batch-size="${valid_batch_size}" \
  --valid-step-interval="${valid_step_interval}" \
  --valid-epoch-interval="${valid_epoch_interval}" \
  --accum-freq="${accum_freq}" \
  --lr="${lr}" \
  --wd="${wd}" \
  --max-epochs="${max_epochs}" \
  --vision-model="${vision_model}" \
  ${use_augment} \
  --text-model="${text_model}" \
  --grad-checkpointing
# ---------------------------------------------------------------------------
# NOTE(review): the text below is a pasted forum question, not shell code.
# It has been commented out so the script remains executable; content kept.
# ---------------------------------------------------------------------------
# 此领域小白,目前在跟着教程进行 finetune 时对结果上有一些疑问。希望大佬可以指导一下。
# 背景:
# 40-50 个左右的玩具角色分类
# 打算的实现方式:
# 通过 50 个左右的文本标签和图片归一化求 max,成功识别出是哪一款玩具
# 训练的数据集:
# 每个标签 30-50 张 * 48 ≈ 1700(图片背景基本一样)
# 配置和参数:
# 单卡 显存 32G
# 训练日志: 看起来模型收敛在这个范围
# 由于我没有开 validation,所以没有 valid 的 loss 情况
# 现状:
# 最后我手动测试了几个训练集的数据,基本上文本和图片的归一化的最大值在 99 以上;但是测试其他的照片时,有些可以识别,有些无法识别,有些甚至识别错误(不正确的文本和图片的归一化的最大值达到了 99),泛化能力比较差。
# 如何优化?
# 这是我遇到的一些问题,希望大佬们可以帮忙给出一些指导性的建议,十分感谢。