Open clareliu1234 opened 2 months ago
num_gpus=2
output_dir=./bge-sft-output
model_path=./bge-m3/bge-m3
train_data=./newdata
batch_size=4
query_max_len=256 # max 8192
passage_max_len=256 # max 8192

torchrun --nproc_per_node $num_gpus \
-m FlagEmbedding.BGE_M3.run \
--output_dir $output_dir \
--model_name_or_path $model_path \
--train_data $train_data \
--learning_rate 1e-5 \
--fp16 \
--num_train_epochs 5 \
--per_device_train_batch_size $batch_size \
--dataloader_drop_last True \
--normlized True \
--temperature 0.02 \
--query_max_len $query_max_len \
--passage_max_len $passage_max_len \
--train_group_size 6 \
--negatives_cross_device \
--logging_steps 10 \
--same_task_within_batch True \
--save_steps 10000 \
--unified_finetuning True \
--deepspeed ./df_config.json \
--per_device_eval_batch_size 4
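For context, the FlagEmbedding fine-tuning examples expect the files under --train_data to be JSONL, one example per line with a query, a list of positive passages, and a list of negative passages. A minimal sketch of writing one such line (the text and file name are made up for illustration):

```python
import json

# Illustrative only: one training example in the JSONL format that the
# FlagEmbedding fine-tuning scripts read from --train_data (query / pos / neg).
example = {
    "query": "What does the paper propose?",                        # made-up text
    "pos": ["The paper proposes a multilingual text embedding model."],
    "neg": ["An unrelated sentence taken from a different document."],
}

with open("./newdata/example.jsonl", "w") as f:  # file name is hypothetical
    f.write(json.dumps(example, ensure_ascii=False) + "\n")
```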
Take a look at your training data. You can score it with an existing model to check whether the negatives are simply too easy for the model to distinguish from the positives.
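A minimal sketch of that check using the FlagEmbedding API (the model name and data file are the ones mentioned in this thread; everything else is illustrative):

```python
import json
from FlagEmbedding import FlagModel

# Any reasonably strong off-the-shelf embedding model works for this sanity check.
model = FlagModel("BAAI/bge-large-en-v1.5", use_fp16=True)

pos_scores, neg_scores = [], []
with open("./newdata/train_title_abstract_new.jsonl") as f:  # file name taken from the training log
    for line in f:
        ex = json.loads(line)
        q_emb = model.encode_queries([ex["query"]])
        p_emb = model.encode(ex["pos"] + ex["neg"])
        scores = (q_emb @ p_emb.T)[0]          # embeddings are L2-normalized, so this is cosine similarity
        pos_scores.extend(scores[: len(ex["pos"])])
        neg_scores.extend(scores[len(ex["pos"]) :])

print("mean positive score:", sum(pos_scores) / len(pos_scores))
print("mean negative score:", sum(neg_scores) / len(neg_scores))
# If the negatives score far below the positives, the contrastive loss is
# already close to zero and the model has little left to learn from this data.
```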
Thanks for the reply. My positives are built by randomly picking a sentence from a document and then having a large language model generate the positive from it; the negatives are just sentences randomly sampled from other documents. I also scored them with bge-large-en-v1.5, and the negatives come out below 0.4. Is the loss 0 because the negatives and positives are too easy to tell apart? How should I fix this? Thanks again for your reply.
That way of constructing the data is indeed too easy. You can try a hard-negative mining strategy: https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune#hard-negatives
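The linked example documents the official mining script; the rough idea, sketched here with FlagModel and plain numpy (the sampling window, negative count, and output file name are illustrative choices, not the script's defaults):

```python
import json
import numpy as np
from FlagEmbedding import FlagModel

model = FlagModel("BAAI/bge-large-en-v1.5", use_fp16=True)

# Reuse the existing training file (same query / pos / neg format).
examples = [json.loads(l) for l in open("./newdata/train_title_abstract_new.jsonl")]
corpus = sorted({p for ex in examples for p in ex["pos"]})  # pool of candidate passages

q_emb = model.encode_queries([ex["query"] for ex in examples])
c_emb = model.encode(corpus)
scores = q_emb @ c_emb.T  # (num_queries, corpus_size) similarity matrix

for ex, row in zip(examples, scores):
    ranked = np.argsort(-row)  # corpus indices, most similar first
    # Skip the very top hits (likely true positives), then keep the next most
    # similar passages as hard negatives; 10-60 and 15 are illustrative numbers.
    candidates = [corpus[i] for i in ranked[10:60] if corpus[i] not in ex["pos"]]
    ex["neg"] = candidates[:15]

with open("./newdata/train_hard_negatives.jsonl", "w") as f:  # output name is hypothetical
    for ex in examples:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")
```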
[2024-08-31 12:49:22,685] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-08-31 12:49:22,685] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-08-31 12:49:23,762] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-08-31 12:49:23,762] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-08-31 12:49:23,765] [INFO] [comm.py:652:init_distributed] cdb=None
08/31/2024 12:49:23 - WARNING - main - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
08/31/2024 12:49:23 - INFO - main - Training/evaluation parameters RetrieverTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
colbert_dim=-1,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=./df_config.json,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
enable_sub_batch=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=False,
evaluation_strategy=None,
fix_encoder=False,
fix_position_embedding=False,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=1e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=./bge-sft-output/runs/Aug31_12-49-22_autodl-container-bdf4448313-f2394cf4,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_kwargs={},
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
negatives_cross_device=True,
no_cuda=False,
normlized=True,
num_train_epochs=5.0,
optim=OptimizerNames.ADAMW_TORCH,
optim_args=None,
optim_target_modules=None,
output_dir=./bge-sft-output,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=4,
per_device_train_batch_size=4,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=,
ray_scope=last,
remove_unused_columns=True,
report_to=[],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=./bge-sft-output,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=10000,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
self_distill_start_step=-1,
sentence_pooling_method=cls,
skip_memory_metrics=True,
split_batches=None,
temperature=0.02,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
unified_finetuning=True,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
use_self_distill=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
08/31/2024 12:49:23 - INFO - main - Model parameters ModelArguments(model_name_or_path='./bge-m3/bge-m3', config_name=None, tokenizer_name=None, cache_dir=None)
08/31/2024 12:49:23 - INFO - main - Data parameters DataArguments(knowledge_distillation=False, train_data=['./newdata'], cache_path=None, train_group_size=6, query_max_len=256, passage_max_len=256, max_example_num_per_dataset=None, query_instruction_for_retrieval=None, passage_instruction_for_retrieval=None, same_task_within_batch=True, shuffle_ratio=0.0, small_threshold=0, drop_threshold=0)
08/31/2024 12:49:24 - WARNING - main - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, 16-bits training: True
08/31/2024 12:49:25 - INFO - main - Config: XLMRobertaConfig {
  "_name_or_path": "./bge-m3/bge-m3",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8194,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
08/31/2024 12:49:25 - INFO - FlagEmbedding.BGE_M3.modeling - loading existing colbert_linear and sparse_linear---------
/root/miniconda3/envs/py310/lib/python3.11/site-packages/FlagEmbedding/BGE_M3/modeling.py:335: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
/root/miniconda3/envs/py310/lib/python3.11/site-packages/FlagEmbedding/BGE_M3/modeling.py:336: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')
Batch Size Dict: ['0-500: 4', '500-1000: 4', '1000-2000: 4', '2000-3000: 4', '3000-4000: 4', '4000-5000: 4', '5000-6000: 4', '6000-7000: 4', '7000-inf: 4']
loading data from ./newdata/train_title_abstract_new.jsonl ...
Generating train split: 6991 examples [00:00, 96413.01 examples/s]
---------------------------Rank 0: refresh data---------------------------
---------------------------Rank 1: refresh data---------------------------
Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...
/root/miniconda3/envs/py310/lib/python3.11/site-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module fused_adam...
Time to load fused_adam op: 0.09208106994628906 seconds
Loading extension module fused_adam...
Time to load fused_adam op: 0.10176944732666016 seconds
{'loss': 0.0, 'grad_norm': 1.0933278645097744e-05, 'learning_rate': 9.98397435897436e-06, 'epoch': 0.01}
{'loss': 0.0, 'grad_norm': 2.4871298592188396e-05, 'learning_rate': 9.961080586080587e-06, 'epoch': 0.02}
{'loss': 0.0, 'grad_norm': 6.943765583855566e-06, 'learning_rate': 9.938186813186814e-06, 'epoch': 0.03}
{'loss': 0.0, 'grad_norm': 0.00051667116349563, 'learning_rate': 9.915293040293041e-06, 'epoch': 0.05}
{'loss': 0.0, 'grad_norm': 0.0002898364909924567, 'learning_rate': 9.892399267399268e-06, 'epoch': 0.06}
{'loss': 0.0, 'grad_norm': 6.137516379567387e-07, 'learning_rate': 9.869505494505496e-06, 'epoch': 0.07}
{'loss': 0.0088, 'grad_norm': 5.524310836335644e-06, 'learning_rate': 9.846611721611723e-06, 'epoch': 0.08}
{'loss': 0.0, 'grad_norm': 7.275876669154968e-06, 'learning_rate': 9.823717948717948e-06, 'epoch': 0.09}
{'loss': 0.0, 'grad_norm': 1.3697450640393072e-06, 'learning_rate': 9.800824175824177e-06, 'epoch': 0.1}
{'loss': 0.0007, 'grad_norm': 0.006467437371611595, 'learning_rate': 9.777930402930404e-06, 'epoch': 0.11}
{'loss': 0.0002, 'grad_norm': 6.234044121811166e-06, 'learning_rate': 9.75503663003663e-06, 'epoch': 0.13}
{'loss': 0.0, 'grad_norm': 6.946007943042787e-06, 'learning_rate': 9.732142857142858e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 8.99842125363648e-05, 'learning_rate': 9.709249084249084e-06, 'epoch': 0.15}
{'loss': 0.0086, 'grad_norm': 5.538483619689941, 'learning_rate': 9.686355311355313e-06, 'epoch': 0.16}
{'loss': 0.0, 'grad_norm': 2.8746483167196857e-06, 'learning_rate': 9.66346153846154e-06, 'epoch': 0.17}
{'loss': 0.0, 'grad_norm': 3.209952046745457e-05, 'learning_rate': 9.640567765567767e-06, 'epoch': 0.18}
{'loss': 0.0003, 'grad_norm': 0.00031322549330070615, 'learning_rate': 9.617673992673993e-06, 'epoch': 0.19}
{'loss': 0.0128, 'grad_norm': 0.0017342653591185808, 'learning_rate': 9.59478021978022e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 0.0001642707793507725, 'learning_rate': 9.571886446886449e-06, 'epoch': 0.22}
{'loss': 0.0006, 'grad_norm': 0.01790499873459339, 'learning_rate': 9.548992673992676e-06, 'epoch': 0.23}
{'loss': 0.0011, 'grad_norm': 0.00010154028859687969, 'learning_rate': 9.5260989010989e-06, 'epoch': 0.24}
{'loss': 0.0021, 'grad_norm': 0.00018275361799169332, 'learning_rate': 9.50320512820513e-06, 'epoch': 0.25}
{'loss': 0.0002, 'grad_norm': 0.006220698356628418, 'learning_rate': 9.480311355311356e-06, 'epoch': 0.26}
{'loss': 0.0, 'grad_norm': 0.0004315146943554282, 'learning_rate': 9.457417582417583e-06, 'epoch': 0.27}
{'loss': 0.0054, 'grad_norm': 0.00035386779927648604, 'learning_rate': 9.43452380952381e-06, 'epoch': 0.29}