Open dreamkillers666 opened 5 months ago
Accelerator
accelerate.DataLoaderConfiguration
Failures:
OOM 问题,超出显存。可以通过降低 per_device_train_batch_size 和 train_group_size 来降低显存占用。
(ft_emb) b405@b405-CVN-Z790-GAMING-FROZEN:/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding$ torchrun --nproc_per_node 1 \ -m FlagEmbedding.reranker.run \ --output_dir /media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft \ --model_name_or_path /media/b405/新加卷1/Workspace_linux/b405/ZH/embeddingModels/bge-reranker-large/ \ --train_data /media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/post_process_embedding_finetune_dataset.jsonl \ --learning_rate 6e-5 \ --fp16 \ --num_train_epochs 5 \ --per_device_train_batch_size 10 \ --gradient_accumulation_steps 4 \ --dataloader_drop_last True \ --train_group_size 16 \ --max_len 512 \ --weight_decay 0.01 \ --logging_steps 10 04/20/2024 18:56:35 - WARNING - main - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True 04/20/2024 18:56:35 - INFO - main - Training/evaluation parameters TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=True, dataloader_num_workers=0, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, eval_accumulation_steps=None, eval_delay=0, eval_steps=None, evaluation_strategy=no, fp16=True, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, 
gradient_accumulation_steps=4, gradient_checkpointing=False, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=6e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft/runs/Apr20_18-56-35_b405-CVN-Z790-GAMING-FROZEN,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=5.0,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=10,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=steps,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.01,
)
04/20/2024 18:56:35 - INFO - __main__ - Model parameters ModelArguments(model_name_or_path='/media/b405/新加卷1/Workspace_linux/b405/ZH/embeddingModels/bge-reranker-large/', config_name=None, tokenizer_name=None, cache_dir=None)
04/20/2024 18:56:35 - INFO - __main__ - Data parameters DataArguments(train_data='/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/post_process_embedding_finetune_dataset.jsonl', train_group_size=16, max_len=512)
/home/b405/.local/lib/python3.10/site-packages/accelerate/accelerator.py:436: FutureWarning: Passing the following arguments to
main()
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/run.py", line 90, in main
trainer.train()
File "/home/b405/.local/lib/python3.10/site-packages/transformers/trainer.py", line 1780, in train
return inner_training_loop(
File "/home/b405/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2118, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3036, in training_step
loss = self.compute_loss(model, inputs)
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/trainer.py", line 31, in compute_loss
return model(inputs)['loss']
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1523, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1359, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/accelerate/utils/operations.py", line 825, in forward
return model_forward(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/accelerate/utils/operations.py", line 813, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/home/b405/.local/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
return func(*args, **kwargs)
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/modeling.py", line 34, in forward
ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 1208, in forward
outputs = self.roberta(
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 837, in forward
encoder_outputs = self.encoder(
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 525, in forward
layer_outputs = layer_module(
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 456, in forward
layer_output = apply_chunking_to_forward(
File "/home/b405/.local/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 237, in apply_chunking_to_forward
return forward_fn(*input_tensors)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 468, in feed_forward_chunk
intermediate_output = self.intermediate(attention_output)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 367, in forward
hidden_states = self.intermediate_act_fn(hidden_states)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/activations.py", line 78, in forward
return self.act(input)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 210.94 MiB is free. Including non-PyTorch memory, this process has 22.93 GiB memory in use. Of the allocated memory 21.74 GiB is allocated by PyTorch, and 628.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
0%| | 0/25 [00:00<?, ?it/s]
[2024-04-20 18:56:38,931] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 358104) of binary: /usr/bin/python3
Traceback (most recent call last):
File "/home/b405/.local/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Accelerator
is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an accelerate.DataLoaderConfiguration
instead: dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True) warnings.warn( 0%| | 0/25 [00:00<?, ?it/s] Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/run.py", line 95, in <module> FlagEmbedding.reranker.run FAILED
Failures: