ApolloRay opened this issue 3 weeks ago
I used a training framework based on llamafactory for training. Some training logs are attached; I hope they are helpful to you. train2.log
I followed your steps, but I always get lr=0 and loss=0 (on an A800).
Could you share the baseline's training_loss.png and training_eval_loss.png plots?
{"current_steps": 1, "total_steps": 42, "loss": 1.4653, "learning_rate": 0.0, "epoch": 0.06896551724137931, "percentage": 2.38, "elapsed_time": "0:00:15", "remaining_time": "0:10:23"} {"current_steps": 2, "total_steps": 42, "loss": 1.5467, "learning_rate": 0.0, "epoch": 0.13793103448275862, "percentage": 4.76, "elapsed_time": "0:00:25", "remaining_time": "0:08:35"} {"current_steps": 3, "total_steps": 42, "loss": 1.5921, "learning_rate": 0.0, "epoch": 0.20689655172413793, "percentage": 7.14, "elapsed_time": "0:00:35", "remaining_time": "0:07:45"} {"current_steps": 4, "total_steps": 42, "loss": 1.5006, "learning_rate": 0.0, "epoch": 0.27586206896551724, "percentage": 9.52, "elapsed_time": "0:00:46", "remaining_time": "0:07:19"} {"current_steps": 5, "total_steps": 42, "loss": 1.497, "learning_rate": 0.0, "epoch": 0.3448275862068966, "percentage": 11.9, "elapsed_time": "0:00:56", "remaining_time": "0:06:56"} {"current_steps": 5, "total_steps": 42, "eval_loss": 1.633915901184082, "epoch": 0.3448275862068966, "percentage": 11.9, "elapsed_time": "0:00:59", "remaining_time": "0:07:23"} {"current_steps": 6, "total_steps": 42, "loss": 1.4125, "learning_rate": 1.6666666666666667e-06, "epoch": 0.41379310344827586, "percentage": 14.29, "elapsed_time": "0:01:15", "remaining_time": "0:07:30"} {"current_steps": 7, "total_steps": 42, "loss": 1.8342, "learning_rate": 3.3333333333333333e-06, "epoch": 0.4827586206896552, "percentage": 16.67, "elapsed_time": "0:01:27", "remaining_time": "0:07:16"} {"current_steps": 8, "total_steps": 42, "loss": 0.9277, "learning_rate": 5e-06, "epoch": 0.5517241379310345, "percentage": 19.05, "elapsed_time": "0:01:39", "remaining_time": "0:07:03"} {"current_steps": 9, "total_steps": 42, "loss": 0.4277, "learning_rate": 4.991893270335526e-06, "epoch": 0.6206896551724138, "percentage": 21.43, "elapsed_time": "0:01:52", "remaining_time": "0:06:53"} {"current_steps": 10, "total_steps": 42, "loss": 0.555, "learning_rate": 4.967625656594782e-06, "epoch": 0.6896551724137931, "percentage": 23.81, "elapsed_time": "0:02:59", "remaining_time": "0:09:35"} {"current_steps": 10, "total_steps": 42, "eval_loss": 0.5451239347457886, "epoch": 0.6896551724137931, "percentage": 23.81, "elapsed_time": "0:03:02", "remaining_time": "0:09:45"} {"current_steps": 11, "total_steps": 42, "loss": 0.5934, "learning_rate": 4.927354543565131e-06, "epoch": 0.7586206896551724, "percentage": 26.19, "elapsed_time": "0:03:16", "remaining_time": "0:09:14"} {"current_steps": 12, "total_steps": 42, "loss": 0.3716, "learning_rate": 4.8713411048678635e-06, "epoch": 0.8275862068965517, "percentage": 28.57, "elapsed_time": "0:03:30", "remaining_time": "0:08:46"} {"current_steps": 13, "total_steps": 42, "loss": 0.3841, "learning_rate": 4.799948609147061e-06, "epoch": 0.896551724137931, "percentage": 30.95, "elapsed_time": "0:03:44", "remaining_time": "0:08:19"} {"current_steps": 14, "total_steps": 42, "loss": 0.3145, "learning_rate": 4.7136400641330245e-06, "epoch": 0.9655172413793104, "percentage": 33.33, "elapsed_time": "0:03:56", "remaining_time": "0:07:53"} {"current_steps": 15, "total_steps": 42, "loss": 0.2931, "learning_rate": 4.612975213859487e-06, "epoch": 1.0344827586206897, "percentage": 35.71, "elapsed_time": "0:04:10", "remaining_time": "0:07:31"} {"current_steps": 15, "total_steps": 42, "eval_loss": 0.2773745656013489, "epoch": 1.0344827586206897, "percentage": 35.71, "elapsed_time": "0:04:13", "remaining_time": "0:07:36"} {"current_steps": 16, "total_steps": 42, "loss": 0.2822, "learning_rate": 
4.498606908508754e-06, "epoch": 1.103448275862069, "percentage": 38.1, "elapsed_time": "0:04:26", "remaining_time": "0:07:13"} {"current_steps": 17, "total_steps": 42, "loss": 0.1852, "learning_rate": 4.3712768704277535e-06, "epoch": 1.1724137931034484, "percentage": 40.48, "elapsed_time": "0:04:39", "remaining_time": "0:06:51"} {"current_steps": 18, "total_steps": 42, "loss": 0.2736, "learning_rate": 4.231810883773999e-06, "epoch": 1.2413793103448276, "percentage": 42.86, "elapsed_time": "0:04:53", "remaining_time": "0:06:30"} {"current_steps": 19, "total_steps": 42, "loss": 0.2119, "learning_rate": 4.081113438988443e-06, "epoch": 1.3103448275862069, "percentage": 45.24, "elapsed_time": "0:05:55", "remaining_time": "0:07:10"} {"current_steps": 20, "total_steps": 42, "loss": 0.2181, "learning_rate": 3.92016186682789e-06, "epoch": 1.3793103448275863, "percentage": 47.62, "elapsed_time": "0:06:09", "remaining_time": "0:06:46"} {"current_steps": 20, "total_steps": 42, "eval_loss": 0.30065488815307617, "epoch": 1.3793103448275863, "percentage": 47.62, "elapsed_time": "0:06:12", "remaining_time": "0:06:50"} {"current_steps": 21, "total_steps": 42, "loss": 0.3093, "learning_rate": 3.7500000000000005e-06, "epoch": 1.4482758620689655, "percentage": 50.0, "elapsed_time": "0:06:26", "remaining_time": "0:06:26"} {"current_steps": 22, "total_steps": 42, "loss": 0.21, "learning_rate": 3.5717314035076355e-06, "epoch": 1.5172413793103448, "percentage": 52.38, "elapsed_time": "0:06:39", "remaining_time": "0:06:03"} {"current_steps": 23, "total_steps": 42, "loss": 0.1676, "learning_rate": 3.386512217606339e-06, "epoch": 1.5862068965517242, "percentage": 54.76, "elapsed_time": "0:06:52", "remaining_time": "0:05:40"} {"current_steps": 24, "total_steps": 42, "loss": 0.1236, "learning_rate": 3.195543659791132e-06, "epoch": 1.6551724137931034, "percentage": 57.14, "elapsed_time": "0:07:07", "remaining_time": "0:05:20"} {"current_steps": 25, "total_steps": 42, "loss": 0.1185, "learning_rate": 3.0000642344401115e-06, "epoch": 1.7241379310344827, "percentage": 59.52, "elapsed_time": "0:07:19", "remaining_time": "0:04:59"} {"current_steps": 25, "total_steps": 42, "eval_loss": 0.19051770865917206, "epoch": 1.7241379310344827, "percentage": 59.52, "elapsed_time": "0:07:23", "remaining_time": "0:05:01"} {"current_steps": 26, "total_steps": 42, "loss": 0.1742, "learning_rate": 2.8013417006383078e-06, "epoch": 1.793103448275862, "percentage": 61.9, "elapsed_time": "0:07:36", "remaining_time": "0:04:40"} {"current_steps": 27, "total_steps": 42, "loss": 0.2009, "learning_rate": 2.6006648502735384e-06, "epoch": 1.8620689655172413, "percentage": 64.29, "elapsed_time": "0:07:48", "remaining_time": "0:04:20"} {"current_steps": 28, "total_steps": 42, "loss": 0.1278, "learning_rate": 2.399335149726463e-06, "epoch": 1.9310344827586206, "percentage": 66.67, "elapsed_time": "0:08:51", "remaining_time": "0:04:25"} {"current_steps": 29, "total_steps": 42, "loss": 0.0778, "learning_rate": 2.1986582993616926e-06, "epoch": 2.0, "percentage": 69.05, "elapsed_time": "0:09:04", "remaining_time": "0:04:04"} {"current_steps": 30, "total_steps": 42, "loss": 0.0696, "learning_rate": 1.9999357655598894e-06, "epoch": 2.0689655172413794, "percentage": 71.43, "elapsed_time": "0:09:18", "remaining_time": "0:03:43"} {"current_steps": 30, "total_steps": 42, "eval_loss": 0.1704787164926529, "epoch": 2.0689655172413794, "percentage": 71.43, "elapsed_time": "0:09:21", "remaining_time": "0:03:44"} {"current_steps": 31, "total_steps": 42, "loss": 0.0474, 
"learning_rate": 1.8044563402088686e-06, "epoch": 2.1379310344827585, "percentage": 73.81, "elapsed_time": "0:09:35", "remaining_time": "0:03:24"} {"current_steps": 32, "total_steps": 42, "loss": 0.0381, "learning_rate": 1.613487782393661e-06, "epoch": 2.206896551724138, "percentage": 76.19, "elapsed_time": "0:09:49", "remaining_time": "0:03:04"} {"current_steps": 33, "total_steps": 42, "loss": 0.0138, "learning_rate": 1.4282685964923643e-06, "epoch": 2.2758620689655173, "percentage": 78.57, "elapsed_time": "0:10:02", "remaining_time": "0:02:44"} {"current_steps": 34, "total_steps": 42, "loss": 0.0627, "learning_rate": 1.2500000000000007e-06, "epoch": 2.344827586206897, "percentage": 80.95, "elapsed_time": "0:10:15", "remaining_time": "0:02:24"} {"current_steps": 35, "total_steps": 42, "loss": 0.0278, "learning_rate": 1.079838133172111e-06, "epoch": 2.413793103448276, "percentage": 83.33, "elapsed_time": "0:10:28", "remaining_time": "0:02:05"} {"current_steps": 35, "total_steps": 42, "eval_loss": 0.17023761570453644, "epoch": 2.413793103448276, "percentage": 83.33, "elapsed_time": "0:10:31", "remaining_time": "0:02:06"} {"current_steps": 36, "total_steps": 42, "loss": 0.0669, "learning_rate": 9.188865610115572e-07, "epoch": 2.4827586206896552, "percentage": 85.71, "elapsed_time": "0:10:44", "remaining_time": "0:01:47"} {"current_steps": 37, "total_steps": 42, "loss": 0.0391, "learning_rate": 7.681891162260016e-07, "epoch": 2.5517241379310347, "percentage": 88.1, "elapsed_time": "0:11:47", "remaining_time": "0:01:35"} {"current_steps": 38, "total_steps": 42, "loss": 0.0539, "learning_rate": 6.28723129572247e-07, "epoch": 2.6206896551724137, "percentage": 90.48, "elapsed_time": "0:12:01", "remaining_time": "0:01:15"} {"current_steps": 39, "total_steps": 42, "loss": 0.0453, "learning_rate": 5.013930914912477e-07, "epoch": 2.689655172413793, "percentage": 92.86, "elapsed_time": "0:12:14", "remaining_time": "0:00:56"} {"current_steps": 40, "total_steps": 42, "loss": 0.0265, "learning_rate": 3.8702478614051353e-07, "epoch": 2.7586206896551726, "percentage": 95.24, "elapsed_time": "0:12:28", "remaining_time": "0:00:37"} {"current_steps": 40, "total_steps": 42, "eval_loss": 0.1742357462644577, "epoch": 2.7586206896551726, "percentage": 95.24, "elapsed_time": "0:12:32", "remaining_time": "0:00:37"} {"current_steps": 41, "total_steps": 42, "loss": 0.074, "learning_rate": 2.8635993586697555e-07, "epoch": 2.8275862068965516, "percentage": 97.62, "elapsed_time": "0:12:46", "remaining_time": "0:00:18"} {"current_steps": 42, "total_steps": 42, "loss": 0.0251, "learning_rate": 2.0005139085293945e-07, "epoch": 2.896551724137931, "percentage": 100.0, "elapsed_time": "0:13:00", "remaining_time": "0:00:00"} {"current_steps": 42, "total_steps": 42, "epoch": 2.896551724137931, "percentage": 100.0, "elapsed_time": "0:13:43", "remaining_time": "0:00:00"}
That's the training log.
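Note that learning_rate: 0.0 at the very first steps is expected with a warmup schedule; it only indicates a problem if it never leaves 0. A minimal sketch (not the exact llamafactory internals, and the 3-step warmup count is an assumption) of how a cosine schedule with warmup produces the 0.0 → 1.67e-6 → 3.33e-6 → 5e-6 ramp seen above:

```python
# Minimal sketch, assuming a 3-step warmup; llamafactory/DeepSpeed may differ in detail.
import torch
from transformers import get_cosine_schedule_with_warmup

optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=5.0e-6)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=42
)

for step in range(1, 9):
    lr = scheduler.get_last_lr()[0]       # lr applied at this optimizer step
    print(f"step {step}: lr = {lr:.3e}")  # step 1 prints 0.000e+00, then the warmup ramp to 5e-6
    optimizer.step()
    scheduler.step()
```

In the log above the reported rate stays at 0.0 for five steps rather than one; one possible reason under DeepSpeed fp16 is that the dynamic loss scaler skips the first overflowed steps, so the warmup scheduler does not advance. The thing to check is whether learning_rate ever rises: if it stays 0 for the entire run, that points to a configuration problem rather than warmup.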
```yaml
model_name_or_path: Qwen2-VL-7B-Instruct

stage: sft
do_train: true
finetuning_type: full

dataset: mire_train  # video: mllm_video_demo
template: qwen2_vl
cutoff_len: 4096
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
val_size: 0.1

output_dir: saves/qwen2_vl-7b-mire/full/sft
logging_steps: 1
save_steps: 0.2
plot_loss: true
overwrite_output_dir: true

per_device_train_batch_size: 4
gradient_accumulation_steps: 2
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.05
fp16: true
ddp_timeout: 180000000

val_size: 0.1
per_device_eval_batch_size: 8
eval_strategy: steps
eval_steps: 5

flash_attn: fa2
deepspeed: examples/deepspeed/ds_z3_offload_config.json
```

Running on 8 × A800.
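As a rough sanity check that this config produces the 42 total steps in the log (the training-set size below is an assumption inferred from the log, not stated anywhere in the thread, and exact rounding depends on the dataloader):

```python
# Back-of-the-envelope check that the config above yields total_steps ≈ 42.
# Assumptions: 8 GPUs and ~890 samples left in mire_train after val_size: 0.1.
import math

per_device_train_batch_size = 4
gradient_accumulation_steps = 2
num_gpus = 8
num_train_epochs = 3

effective_batch = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch)                      # 64 samples per optimizer step

train_samples = 890                         # assumed; inferred from 42 steps over 3 epochs
steps_per_epoch = math.ceil(train_samples / effective_batch)
print(steps_per_epoch)                      # 14
print(steps_per_epoch * num_train_epochs)   # 42, matching the log
```

With warmup_ratio: 0.05 over 42 steps, that works out to only about 2 warmup steps.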