----------------Environment: the same as https://huggingface.co/edbeeching/gpt-neo-125M-imdb-lora
Transformers 4.27.0.dev0
PyTorch 1.13.1+cu116
Datasets 2.9.0
Tokenizers 0.13.2
trl 0.4.1.dev0 / 0.4.0
peft 0.2.0 / 0.3.0.dev0
----------------Run the following commands:
cd trl/examples/sentiment/scripts/gpt-neox-20b_peft
accelerate launch clm_finetune_peft_imdb.py --output_dir chk
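For reference, a minimal sketch of the model setup the log below implies (an assumption on my side, not the exact example script): opt-125m loaded in 8-bit via bitsandbytes, then wrapped with LoRA adapters through peft. The trainable-parameter count reported in the log (589,824) would match rank-16 adapters on q_proj/v_proj; the concrete hyperparameters here are guesses.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_name = "/home/bmb/models/facebook/opt-125m"  # local path taken from the log

# Loading in 8-bit forces fp16 weights, which is what the torch_dtype override
# warning further down in the log refers to.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assumed LoRA settings: r=16 on q_proj/v_proj reproduces 589,824 trainable params.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints the "trainable params" line seen below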
----------------The error log:
The following values were not passed to accelerate launch and had defaults used instead:
--dynamo_backend was set to a value of 'no'
To avoid this warning pass in values for each of the problematic parameters or run accelerate config.
===================================BUG REPORT===================================
Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /data/anaconda3/envs/MyENV/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /data/anaconda3/envs/MyENV/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so...
--------------------model_args------------------
ModelArguments(model_name_or_path='/home/bmb/models/facebook/opt-125m')
--------------------data_args------------------
DataTrainingArguments(dataset_name='imdb', block_size=1024)
--------------------training_args------------------
TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=chk/runs/Mar14_16-56-10_meetyou-g2,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=steps,
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
optim=adamw_hf,
optim_args=None,
output_dir=chk,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['wandb'],
resume_from_checkpoint=None,
run_name=chk,
save_on_each_node=False,
save_steps=500,
save_strategy=steps,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
Overriding torch_dtype=None with torch_dtype=torch.float16 due to requirements of bitsandbytes to enable model loading in mixed int8. Either pass torch_dtype=torch.float16 or don't pass this argument at all to remove this warning.
trainable params: 589824 || all params: 125829120 || trainable%: 0.46875
Found cached dataset imdb (/home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 883.45it/s]
Loading cached processed dataset at /home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-3066d04213395f11.arrow
Loading cached processed dataset at /home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-cbc9fa5ce7478c47.arrow
Loading cached processed dataset at /home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-e5a5583605ffc5c8.arrow
Loading cached processed dataset at /home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-76ee6f4f562a6026.arrow
Loading cached processed dataset at /home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5ae97da8d5504da3.arrow
Loading cached processed dataset at /home/bmb/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-ba8185ba9646bd60.arrow
/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning
warnings.warn(
0%| | 0/2748 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the __call__ method is faster than using a method to encode the text followed by a call to the pad method to get a padded encoding.
Traceback (most recent call last):
File "/home/bmb/projects/trl/examples/sentiment/scripts/gpt-neox-20b_peft/clm_finetune_peft_imdb.py", line 135, in
trainer.train()
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/trainer.py", line 1631, in train
return inner_training_loop(
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/trainer.py", line 1900, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/trainer.py", line 2643, in training_step
loss = self.compute_loss(model, inputs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/trainer.py", line 2675, in compute_loss
outputs = model(inputs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, *kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/peft/peft_model.py", line 530, in forward
return self.base_model(
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(input, kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.py", line 930, in forward
outputs = self.model.decoder(
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, *kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(args, kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.py", line 688, in forward
layer_outputs = torch.utils.checkpoint.checkpoint(
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
return CheckpointFunction.apply(function, preserve, args)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 107, in forward
outputs = run_function(args)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.py", line 684, in custom_forward
return module(inputs, output_attentions, None)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(input, kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, *kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.py", line 323, in forward
hidden_states = self.self_attn_layer_norm(hidden_states)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(input, kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 190, in forward
return F.layer_norm(
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/torch/nn/functional.py", line 2515, in layer_norm
return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: expected scalar type Half but found Float
Traceback (most recent call last):
File "/data/anaconda3/envs/MyENV/bin/accelerate", line 8, in
sys.exit(main())
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
args.func(args)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/commands/launch.py", line 915, in launch_command
simple_launcher(args)
File "/data/anaconda3/envs/MyENV/lib/python3.10/site-packages/accelerate/commands/launch.py", line 578, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
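For what it's worth, the final RuntimeError can be reproduced in isolation: it is the generic dtype-mismatch error PyTorch raises when half-precision activations are passed through a LayerNorm whose weights are float32. The sketch below only illustrates that mechanism; whether this is exactly what happens inside the checkpointed OPT layer here is my assumption, not something confirmed by the log.

import torch
import torch.nn as nn

ln = nn.LayerNorm(768).cuda()                     # fp32 LayerNorm weights
x = torch.randn(1, 4, 768, device="cuda").half()  # fp16 hidden states
ln(x)  # RuntimeError: expected scalar type Half but found Float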