Closed KimWu1994 closed 7 months ago
It's a little weird... I haven't encountered a NaN loss during the object alignment stage before. Can I see the full output of this run? (I need to check the config and see the loss in the first several steps.)
This is my config:
# ========================= data ==========================
# All paths here are relative (presumably to the directory the job is launched from — confirm).
anno_root = "data/annotations" # annotation dir
# Point-cloud encoder name; it selects the feature file below and model.input_dim further down.
pc_encoder = "uni3d"
# With the two values above this evaluates to "data/annotations/scannet_uni3d_feats.pt".
feat_file = f"{anno_root}/scannet_{pc_encoder}_feats.pt"
# attribute_file = f"{anno_root}/scannet_attributes.json"
# train_file_s1 = [
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanrefer_train_stage1.json",
# ],
# # [
# # feat_file,
# # attribute_file,
# # f"{anno_root}/nr3d_train_stage1.json",
# # ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scannet_train_stage1.json",
# ],
# # [
# # f"{anno_root}/objaverse_{pc_encoder}_feats.pt",
# # f"{anno_root}/objaverse_attributes.json",
# # f"{anno_root}/objaverse_stage1.json"
# # ]
# ]
# Each entry below is a list of three paths: a features .pt file, an attributes .pt file
# and an annotation .json file (roles inferred from the file names — confirm against the loader).
# Stage-1 training sets.
train_file_s1=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/scanrefer_train_stage1.json"
],
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/scannet_train_stage1.json"
]
]
# Stage-1 validation set.
val_file_s1=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_val_attributes.pt",
"data/annotations/scannet_val_stage1.json"
]
]
# Stage-2 training set restricted to obj_align_train.json.
# NOTE(review): train_file_s2 is assigned a second time further down in this same file;
# since this is plain Python, the later assignment replaces this one.
train_file_s2=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/obj_align_train.json"
]
]
# Stage-2 validation set restricted to obj_align_val.json.
# NOTE(review): val_file_s2 is likewise reassigned further down in this file.
val_file_s2=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_val_attributes.pt",
"data/annotations/obj_align_val.json"
]
]
# train_file_s2 = [
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanrefer_train_stage2_objxx.json",
# ],
# # [
# # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# # f"{anno_root}/scannet_pointgroup_train_attributes.pt",
# # f"{anno_root}/scanrefer_pointgroup_train_stage2_caption_iou50.json"
# # ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/nr3d_train_stage2_objxx.json"
# ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/sr3d_train_stage2_objxx.json"
# # ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scene_align_train.json",
# ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/obj_align_train.json",
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scanqa_train_stage2_objxx.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scanqa_train_stage2_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/nr3d_train_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scanrefer_train_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/sr3d_train_stage2_grounding_new.json"
# # ],
# # [
# # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# # f"{anno_root}/scannet_pointgroup_train_attributes.pt",
# # f"{anno_root}/scanrefer_pointgroup_train_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/nr3d_train_stage2_multichoice0.01.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scene_dataset_train_stage2.json"
# # ]
# ]
# val_file_s2 = [
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/scanrefer_val_stage2_objxx.json"
# # ],
# # [
# # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# # f"{anno_root}/scannet_pointgroup_val_attributes.pt",
# # f"{anno_root}/scanrefer_pointgroup_val_stage2_caption_iou25.json"
# # ],
# [
# feat_file,
# f"{anno_root}/scannet_val_attributes.pt",
# f"{anno_root}/stage2_val400.json"
# ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/nr3d_val_stage2_objxx.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/scene_align_val.json",
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/obj_align_val.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/scanqa_val_stage2_objxx.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/scanqa_val_stage2_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/sr3d_val_stage2_grounding_new.json"
# # ],
# # [
# # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# # f"{anno_root}/scannet_pointgroup_val_attributes.pt",
# # f"{anno_root}/scanrefer_pointgroup_val_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/nr3d_val_stage2_multichoice0.01.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/scene_dataset_val_stage2.json"
# # ],
# ]
# train_file_s3 = [
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanqa_train_stage3.json",
# 1
# ],
# # [
# # feat_file,
# # attribute_file,
# # f"{anno_root}/scanrefer_train_conversation.json",
# # 3
# # ],
# # [
# # feat_file,
# # attribute_file,
# # f"{anno_root}/scanrefer_train_detail.json",
# # 1
# # ],
# # [
# # feat_file,
# # attribute_file,
# # f"{anno_root}/nr3d_train_tf.json",
# # 1
# # ]
# ]
# # val_file_s1 = [
# # # [
# # # feat_file,
# # # f"{anno_root}/scannet_val_attributes.pt",
# # # f"{anno_root}/scanrefer_val_stage1.json",
# # # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/scannet_val_stage1.json",
# # ]
# # ]
# val_file_s3 = [
# [
# feat_file,
# f"{anno_root}/scannet_val_attributes.pt",
# f"{anno_root}/scanqa_val_predobj.json"
# ],
# # [
# # feat_file,
# # attribute_file,
# # f"{anno_root}/scanrefer_val_conversation100.json"
# # ],
# ]
# NOTE(review): this is the SECOND assignment of train_file_s2 in this file. It rebinds the
# name, so the obj_align_train.json list defined earlier is discarded and training runs on
# the scanrefer / nr3d / scene_align files listed here instead.
# Per the maintainer's reply in this thread, the object alignment stage should use only
# obj_align_train.json, so this block should be removed or commented out for that stage.
train_file_s2=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/scanrefer_train_stage2_objxx.json"
],
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/nr3d_train_stage2_objxx.json"
],
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/scene_align_train.json"
]
]
# NOTE(review): second assignment of val_file_s2; it replaces the obj_align_val.json list
# defined earlier with stage2_val400.json.
val_file_s2=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_val_attributes.pt",
"data/annotations/stage2_val400.json"
]
]
# Empty list here; how test_types is consumed is not visible in this config.
test_types = []
# Presumably the number of data-loading worker processes — confirm against the training code.
num_workers = 32
# ========================= input ==========================
# One batch size per training stage (s1 / s2 / s3); stages 2 and 3 use a batch of 1.
s1_batch_size = 64
s2_batch_size = 1
s3_batch_size = 1
# max_txt_l = 32
pre_text = False
# ========================= model ==========================
# Model settings. The launch script posted in this thread overrides model.stage,
# model.max_txt_len and model.add_scene_token on the command line, so the values
# written here for those three keys are only defaults.
model = dict(
llama_model_path="ckpts/vicuna-7b-v0",
# Input feature width follows the encoder: 1024 for "uni3d", 512 for anything else.
input_dim=1024 if pc_encoder == "uni3d" else 512,
attr_dim=512,
encoder_num_layers=1,
mlp_dropout=0.1,
low_resource=False,
system_path="prompts/system.txt",
# "{}" is the placeholder that gets filled in (presumably with the user prompt — confirm).
prompt_template="\n### Human: {}\n### Assistant: ",
max_txt_len=512,
end_sym="\n###",
stage=1,
add_scene_token=True,
debug=False,
# Scale constants; their exact use is not visible here. The training log in this thread
# reports "obj_norm" and "scene_norm" values, which presumably relate to these — confirm.
obj_norm_scale=200,
scene_norm_scale=50,
grad_scale=1,
)
# Optimizer settings. optimizer.lr is overridden on the command line by the launch
# script posted in this thread, so lr=5e-3 below is only the default.
optimizer = dict(
opt="adamW",
lr=5e-3,
opt_betas=[0.9, 0.999], # default
weight_decay=0.02,
max_grad_norm=-1, # requires a positive float, use -1 to disable
# use a different lr for some modules, e.g., larger lr for new modules
# NOTE(review): with the values below, the two listed modules get lr 1e-5, which is
# smaller than the base lr above, not larger.
different_lr=dict(
enable=True,
module_names=["module.llama_model", "module.relation_module"],
lr=[1e-5, 1e-5],
wd=[0.02, 0.02]
),
)
# Cosine schedule; scheduler.epochs is also overridden by the launch script in this thread.
scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.01, warmup_epochs=0.2)
# Overridden on the command line by the launch script in this thread.
evaluate = False
deep_fusion = False
# NOTE(review): fp16 is on while max_grad_norm is -1 (disabled, per the comment above);
# worth re-checking if NaN losses persist after the data lists are corrected — unverified.
fp16 = True
gradient_checkpointing = True
# ========================= wandb ==========================
# Weights & Biases logging; switched off in this config.
wandb = dict(
enable=False,
entity="huanghaifeng", # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
project="Scene-LLM",
)
# "env://" presumably means the distributed setup is read from environment variables — confirm.
dist_url = "env://"
device = "cuda"
# ========================= others ==========================
# output_dir and pretrained_path are both overridden on the command line by the launch
# script posted in this thread.
output_dir = "outputs/tmp" # output dir
resume = False # if True, load optimizer and scheduler states as well
debug = False
# The training log line quoted in this thread is at step 2400, consistent with logging every 100 steps.
log_freq = 100
# eval_freq = 500
seed = 42
save_latest = False
do_save = True
auto_resume = True
pretrained_path = ""
This is my training bash script:
# Launches tasks/train.py through torchrun on a single node with a single GPU,
# overriding selected config.py values from the command line.
export MASTER_PORT=6007
export OMP_NUM_THREADS=1
echo "PYTHONPATH: ${PYTHONPATH}"
which_python=$(which python)
echo "which python: ${which_python}"
# NOTE(review): this appends the path of the python executable itself to PYTHONPATH,
# not a directory of modules — confirm this is intended.
export PYTHONPATH=${PYTHONPATH}:${which_python}
# Adds the current directory to PYTHONPATH.
export PYTHONPATH=${PYTHONPATH}:.
echo "PYTHONPATH: ${PYTHONPATH}"
# Restrict the run to GPU index 6.
export CUDA_VISIBLE_DEVICES=6
NNODE=1
NUM_GPUS=1
MASTER_NODE='localhost'
# stage=1
# epoch=6
# add_scene_token=False
# evaluate=False
# pretrained_path=""
# stage=2
# epoch=3
# add_scene_token=False
# evaluate=False
# pretrained_path="outputs/2024-04-10-111416_dp_lr2e-4_sta1_ep6/ckpt_05.pth"
# Active settings: stage 2, starting from a stage-1 checkpoint.
stage=2
epoch=3
max_txt_len=32
lr=1e-4
add_scene_token=False
evaluate=False
pretrained_path="outputs2/2024-04-10-161007_dp_lr1e-4_sta1_ep6/ckpt_05.pth"
# Output directory name: timestamp (colons removed) plus the run settings.
# NOTE(review): $dp is never assigned in this script, so unless it is set in the calling
# environment the "_dp" part of the name is followed by an empty string.
OUTPUT_DIR=outputs/"$(date +"%Y-%m-%d-%T" | tr -d ':')"_dp"$dp"_lr"$lr"_sta"$stage"_ep"$epoch"
# The config file is taken from the same directory as this script ($(dirname $0)/config.py);
# the "key value" pairs after it override the corresponding entries of that config.
torchrun --nnodes=${NNODE} --nproc_per_node=${NUM_GPUS} \
--rdzv_endpoint=${MASTER_NODE}:${MASTER_PORT} \
--rdzv_backend=c10d \
tasks/train.py \
$(dirname $0)/config.py \
output_dir ${OUTPUT_DIR} \
model.stage "$stage" \
scheduler.epochs "$epoch" \
optimizer.lr "$lr" \
model.max_txt_len "$max_txt_len" \
model.add_scene_token "$add_scene_token" \
pretrained_path "$pretrained_path" \
evaluate "$evaluate"
I think you forgot to comment out these lines. The object alignment stage only uses `obj_align_train.json`.
Sorry, I'm still confused. I followed the stage 2 training instructions in the README and commented out the following:
# train_file_s2 = [
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanrefer_train_stage2_objxx.json",
# ],
# # [
# # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# # f"{anno_root}/scannet_pointgroup_train_attributes.pt",
# # f"{anno_root}/scanrefer_pointgroup_train_stage2_caption_iou50.json"
# # ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/nr3d_train_stage2_objxx.json"
# ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/sr3d_train_stage2_objxx.json"
# # ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scene_align_train.json",
# ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/obj_align_train.json",
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scanqa_train_stage2_objxx.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scanqa_train_stage2_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/nr3d_train_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scanrefer_train_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/sr3d_train_stage2_grounding_new.json"
# # ],
# # [
# # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# # f"{anno_root}/scannet_pointgroup_train_attributes.pt",
# # f"{anno_root}/scanrefer_pointgroup_train_stage2_grounding_new.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_val_attributes.pt",
# # f"{anno_root}/nr3d_train_stage2_multichoice0.01.json"
# # ],
# # [
# # feat_file,
# # f"{anno_root}/scannet_train_attributes.pt",
# # f"{anno_root}/scene_dataset_train_stage2.json"
# # ]
# ]
I added the configuration from the README as follows:
# Stage-2 training list containing a single entry: features, attributes and the
# obj_align_train.json annotations.
train_file_s2=[
[
"data/annotations/scannet_uni3d_feats.pt",
"data/annotations/scannet_train_attributes.pt",
"data/annotations/obj_align_train.json"
]
]
The loss is NaN. I may have misunderstood your README.
Shouldn't I comment out `train_file_s2` in the original config.py?
What is the correct config for stage 2? Is it like the following?
# Candidate stage-2 list the poster is asking about. Four entries are active:
# scanrefer_train_stage2_objxx, nr3d_train_stage2_objxx, scene_align_train and
# obj_align_train; every other entry is commented out.
# Relies on feat_file, anno_root and pc_encoder being defined earlier in config.py.
train_file_s2 = [
[
feat_file,
f"{anno_root}/scannet_train_attributes.pt",
f"{anno_root}/scanrefer_train_stage2_objxx.json",
],
# [
# f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# f"{anno_root}/scannet_pointgroup_train_attributes.pt",
# f"{anno_root}/scanrefer_pointgroup_train_stage2_caption_iou50.json"
# ],
[
feat_file,
f"{anno_root}/scannet_train_attributes.pt",
f"{anno_root}/nr3d_train_stage2_objxx.json"
],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/sr3d_train_stage2_objxx.json"
# ],
[
feat_file,
f"{anno_root}/scannet_train_attributes.pt",
f"{anno_root}/scene_align_train.json",
],
[
feat_file,
f"{anno_root}/scannet_train_attributes.pt",
f"{anno_root}/obj_align_train.json",
],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanqa_train_stage2_objxx.json"
# ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanqa_train_stage2_new.json"
# ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/nr3d_train_stage2_grounding_new.json"
# ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scanrefer_train_stage2_grounding_new.json"
# ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/sr3d_train_stage2_grounding_new.json"
# ],
# [
# f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt",
# f"{anno_root}/scannet_pointgroup_train_attributes.pt",
# f"{anno_root}/scanrefer_pointgroup_train_stage2_grounding_new.json"
# ],
# [
# feat_file,
# f"{anno_root}/scannet_val_attributes.pt",
# f"{anno_root}/nr3d_train_stage2_multichoice0.01.json"
# ],
# [
# feat_file,
# f"{anno_root}/scannet_train_attributes.pt",
# f"{anno_root}/scene_dataset_train_stage2.json"
# ]
]
This is my config:
# ========================= data ========================== anno_root = "data/annotations" # annotation dir pc_encoder = "uni3d" feat_file = f"{anno_root}/scannet_{pc_encoder}_feats.pt" # attribute_file = f"{anno_root}/scannet_attributes.json" # train_file_s1 = [ # [ # feat_file, # f"{anno_root}/scannet_train_attributes.pt", # f"{anno_root}/scanrefer_train_stage1.json", # ], # # [ # # feat_file, # # attribute_file, # # f"{anno_root}/nr3d_train_stage1.json", # # ], # [ # feat_file, # f"{anno_root}/scannet_train_attributes.pt", # f"{anno_root}/scannet_train_stage1.json", # ], # # [ # # f"{anno_root}/objaverse_{pc_encoder}_feats.pt", # # f"{anno_root}/objaverse_attributes.json", # # f"{anno_root}/objaverse_stage1.json" # # ] # ] train_file_s1=[ [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_train_attributes.pt", "data/annotations/scanrefer_train_stage1.json" ], [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_train_attributes.pt", "data/annotations/scannet_train_stage1.json" ] ] val_file_s1=[ [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_val_attributes.pt", "data/annotations/scannet_val_stage1.json" ] ] train_file_s2=[ [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_train_attributes.pt", "data/annotations/obj_align_train.json" ] ] val_file_s2=[ [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_val_attributes.pt", "data/annotations/obj_align_val.json" ] ] # train_file_s2 = [ # [ # feat_file, # f"{anno_root}/scannet_train_attributes.pt", # f"{anno_root}/scanrefer_train_stage2_objxx.json", # ], # # [ # # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt", # # f"{anno_root}/scannet_pointgroup_train_attributes.pt", # # f"{anno_root}/scanrefer_pointgroup_train_stage2_caption_iou50.json" # # ], # [ # feat_file, # f"{anno_root}/scannet_train_attributes.pt", # f"{anno_root}/nr3d_train_stage2_objxx.json" # ], # # [ # # feat_file, # # 
f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/sr3d_train_stage2_objxx.json" # # ], # [ # feat_file, # f"{anno_root}/scannet_train_attributes.pt", # f"{anno_root}/scene_align_train.json", # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/obj_align_train.json", # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/scanqa_train_stage2_objxx.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/scanqa_train_stage2_new.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/nr3d_train_stage2_grounding_new.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/scanrefer_train_stage2_grounding_new.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/sr3d_train_stage2_grounding_new.json" # # ], # # [ # # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt", # # f"{anno_root}/scannet_pointgroup_train_attributes.pt", # # f"{anno_root}/scanrefer_pointgroup_train_stage2_grounding_new.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/nr3d_train_stage2_multichoice0.01.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_train_attributes.pt", # # f"{anno_root}/scene_dataset_train_stage2.json" # # ] # ] # val_file_s2 = [ # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/scanrefer_val_stage2_objxx.json" # # ], # # [ # # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt", # # f"{anno_root}/scannet_pointgroup_val_attributes.pt", # # f"{anno_root}/scanrefer_pointgroup_val_stage2_caption_iou25.json" # # ], # [ # feat_file, # f"{anno_root}/scannet_val_attributes.pt", # f"{anno_root}/stage2_val400.json" # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # 
f"{anno_root}/nr3d_val_stage2_objxx.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/scene_align_val.json", # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/obj_align_val.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/scanqa_val_stage2_objxx.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/scanqa_val_stage2_new.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/sr3d_val_stage2_grounding_new.json" # # ], # # [ # # f"{anno_root}/scannet_pointgroup_{pc_encoder}_feats.pt", # # f"{anno_root}/scannet_pointgroup_val_attributes.pt", # # f"{anno_root}/scanrefer_pointgroup_val_stage2_grounding_new.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/nr3d_val_stage2_multichoice0.01.json" # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/scene_dataset_val_stage2.json" # # ], # ] # train_file_s3 = [ # [ # feat_file, # f"{anno_root}/scannet_train_attributes.pt", # f"{anno_root}/scanqa_train_stage3.json", # 1 # ], # # [ # # feat_file, # # attribute_file, # # f"{anno_root}/scanrefer_train_conversation.json", # # 3 # # ], # # [ # # feat_file, # # attribute_file, # # f"{anno_root}/scanrefer_train_detail.json", # # 1 # # ], # # [ # # feat_file, # # attribute_file, # # f"{anno_root}/nr3d_train_tf.json", # # 1 # # ] # ] # # val_file_s1 = [ # # # [ # # # feat_file, # # # f"{anno_root}/scannet_val_attributes.pt", # # # f"{anno_root}/scanrefer_val_stage1.json", # # # ], # # [ # # feat_file, # # f"{anno_root}/scannet_val_attributes.pt", # # f"{anno_root}/scannet_val_stage1.json", # # ] # # ] # val_file_s3 = [ # [ # feat_file, # f"{anno_root}/scannet_val_attributes.pt", # f"{anno_root}/scanqa_val_predobj.json" # ], # # [ # # feat_file, # # attribute_file, # # 
f"{anno_root}/scanrefer_val_conversation100.json" # # ], # ] train_file_s2=[ [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_train_attributes.pt", "data/annotations/scanrefer_train_stage2_objxx.json" ], [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_train_attributes.pt", "data/annotations/nr3d_train_stage2_objxx.json" ], [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_train_attributes.pt", "data/annotations/scene_align_train.json" ] ] val_file_s2=[ [ "data/annotations/scannet_uni3d_feats.pt", "data/annotations/scannet_val_attributes.pt", "data/annotations/stage2_val400.json" ] ] test_types = [] num_workers = 32 # ========================= input ========================== s1_batch_size = 64 s2_batch_size = 1 s3_batch_size = 1 # max_txt_l = 32 pre_text = False # ========================= model ========================== model = dict( llama_model_path="ckpts/vicuna-7b-v0", input_dim=1024 if pc_encoder == "uni3d" else 512, attr_dim=512, encoder_num_layers=1, mlp_dropout=0.1, low_resource=False, system_path="prompts/system.txt", prompt_template="\n### Human: {}\n### Assistant: ", max_txt_len=512, end_sym="\n###", stage=1, add_scene_token=True, debug=False, obj_norm_scale=200, scene_norm_scale=50, grad_scale=1, ) optimizer = dict( opt="adamW", lr=5e-3, opt_betas=[0.9, 0.999], # default weight_decay=0.02, max_grad_norm=-1, # requires a positive float, use -1 to disable # use a different lr for some modules, e.g., larger lr for new modules different_lr=dict( enable=True, module_names=["module.llama_model", "module.relation_module"], lr=[1e-5, 1e-5], wd=[0.02, 0.02] ), ) scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.01, warmup_epochs=0.2) evaluate = False deep_fusion = False fp16 = True gradient_checkpointing = True # ========================= wandb ========================== wandb = dict( enable=False, entity="huanghaifeng", # username or team name to store the runs, see 
https://docs.wandb.ai/ref/python/init project="Scene-LLM", ) dist_url = "env://" device = "cuda" # ========================= others ========================== output_dir = "outputs/tmp" # output dir resume = False # if True, load optimizer and scheduler states as well debug = False log_freq = 100 # eval_freq = 500 seed = 42 save_latest = False do_save = True auto_resume = True pretrained_path = ""
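In this config, you first set `train_file_s2` to contain only `obj_align_train.json` (and `val_file_s2` to contain only `obj_align_val.json`). This is right.
But later you set `train_file_s2` again, this time to the `scanrefer_train_stage2_objxx.json`, `nr3d_train_stage2_objxx.json` and `scene_align_train.json` entries.
This replaces the former `train_file_s2` and is wrong. So I meant that you may have forgotten to delete this part.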
Train Epoch: [0] [ 2400/102750] eta: 5:20:57 lr: 0.000012 stage2-loss: nan stage2-cosine_loss: No data stage2-l2_loss: No data stage2-obj_norm: nan stage2-scene_norm: 0.0000 stage2-target_norm: No data time: 0.1851 data: 0.0030 max mem: 28326 res mem: 29120
I followed the README to train the second stage, and the loss became NaN. How can I solve this problem?