hiyouga / LLaMA-Factory

A WebUI for Efficient Fine-Tuning of 100+ LLMs (ACL 2024)
https://arxiv.org/abs/2403.13372
Apache License 2.0

triton.runtime.autotuner.OutOfResources #4688

Open · GitIgnoreMaybe opened this issue 3 days ago

GitIgnoreMaybe commented 3 days ago

System Info

llamafactory-0.8.3.dev0, Ubuntu 22.04.3 LTS, py3.10, cuda11.8.0

Reproduction

Model config (config.json of microsoft/Phi-3-small-8k-instruct):

{
  "_name_or_path": "microsoft/Phi-3-small-8k-instruct",
  "architectures": [
    "Phi3SmallForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-small-8k-instruct--configuration_phi3_small.Phi3SmallConfig",
    "AutoModelForCausalLM": "microsoft/Phi-3-small-8k-instruct--modeling_phi3_small.Phi3SmallForCausalLM",
    "AutoModelForSequenceClassification": "microsoft/Phi-3-small-8k-instruct--modeling_phi3_small.Phi3SmallForSequenceClassification",
    "AutoTokenizer": "microsoft/Phi-3-small-8k-instruct--tokenization_phi3_small.Phi3SmallTokenizer"
  },
  "blocksparse_block_size": 64,
  "blocksparse_homo_head_pattern": false,
  "blocksparse_num_local_blocks": 16,
  "blocksparse_triton_kernel_block_size": 64,
  "blocksparse_vert_stride": 8,
  "bos_token_id": 100257,
  "dense_attention_every_n_layers": 2,
  "embedding_dropout_prob": 0.1,
  "eos_token_id": 100257,
  "ff_dim_multiplier": null,
  "ff_intermediate_size": 14336,
  "ffn_dropout_prob": 0.1,
  "gegelu_limit": 20.0,
  "gegelu_pad_to_256": true,
  "hidden_act": "gegelu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "phi3small",
  "mup_attn_multiplier": 1.0,
  "mup_embedding_multiplier": 10.0,
  "mup_use_scaling": true,
  "mup_width_multiplier": 8.0,
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_sequence_to_multiple_of_64": true,
  "reorder_and_upcast_attn": false,
  "rope_embedding_base": 1000000,
  "rope_position_scale": 1.0,
  "rope_scaling": null,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.3",
  "use_cache": true,
  "vocab_size": 100352
}
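
For reference, the blocksparse_* fields above are what parameterize the custom Triton attention kernel in modeling_phi3_small.py. A quick way to inspect them outside of training (a sketch, assuming transformers can fetch the remote code from the Hub; the inline comments echo the values above):

import torch
from transformers import AutoConfig

# trust_remote_code is required: Phi-3-small registers its config via auto_map.
cfg = AutoConfig.from_pretrained("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True)
print(cfg.blocksparse_block_size)                # 64
print(cfg.blocksparse_triton_kernel_block_size)  # 64
print(cfg.blocksparse_num_local_blocks)          # 16
print(cfg.blocksparse_vert_stride)               # 8
print(cfg.dense_attention_every_n_layers)        # 2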

Error

Traceback (most recent call last):
  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
    sys.exit(main())
  File "/root/LLaMA-Factory/src/llamafactory/cli.py", line 111, in main
    run_exp()
  File "/root/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
    run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
  File "/root/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 90, in run_sft
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1932, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2268, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3307, in training_step
    loss = self.compute_loss(model, inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3338, in compute_loss
    outputs = model(**inputs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py", line 819, in forward
    return model_forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py", line 807, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/peft_model.py", line 1430, in forward
    return self.base_model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py", line 179, in forward
    return self.model.forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 956, in forward
    outputs = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 849, in forward
    layer_outputs = self._gradient_checkpointing_func(
  File "/root/LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py", line 65, in custom_gradient_checkpointing_func
    return gradient_checkpointing_func(func, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_compile.py", line 24, in inner
    return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 328, in _fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/external_utils.py", line 17, in inner
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 451, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 539, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 230, in forward
    outputs = run_function(*args)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 671, in forward
    hidden_states, self_attn_weights, present_key_values = self.self_attn(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 616, in forward
    attn_function_output = self._apply_blocksparse_attention(
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 382, in _apply_blocksparse_attention
    context_layer = self._blocksparse_layer(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/triton_blocksparse_attention_layer.py", line 165, in forward
    return blocksparse_flash_attn_padded_fwd(
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/triton_flash_blocksparse_attn.py", line 994, in blocksparse_flash_attn_padded_fwd
    _fwd_kernel_batch_inference[grid](
  File "/usr/local/lib/python3.10/dist-packages/triton/runtime/autotuner.py", line 232, in run
    return self.fn.run(*args, **kwargs)
  File "<string>", line 65, in _fwd_kernel_batch_inference
  File "/usr/local/lib/python3.10/dist-packages/triton/compiler/compiler.py", line 579, in __getattribute__
    self._init_handles()
  File "/usr/local/lib/python3.10/dist-packages/triton/compiler/compiler.py", line 568, in _init_handles
    raise OutOfResources(self.shared, max_shared, "shared memory")
triton.runtime.autotuner.OutOfResources: out of resource: shared memory, Required: 246272, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.
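A side note on reading the error: the "Hardware limit: 101376" is the GPU's per-block shared-memory ceiling, which tells you what card class the cloud provider assigned. A small sketch to check it (the byte values are NVIDIA's documented opt-in shared-memory limits per compute capability, not anything LLaMA-Factory exposes; treat the table as an assumption):

import torch

# Documented per-block opt-in shared memory by compute capability (bytes).
OPTIN_SMEM_PER_BLOCK = {
    (7, 0): 96 * 1024,   # V100
    (7, 5): 64 * 1024,   # T4, RTX 20-series
    (8, 0): 163 * 1024,  # A100
    (8, 6): 99 * 1024,   # A10/A40, RTX 30-series -> 101376, matching the error above
    (8, 9): 99 * 1024,   # L4/L40, RTX 40-series
    (9, 0): 227 * 1024,  # H100
}

props = torch.cuda.get_device_properties(0)
cc = (props.major, props.minor)
print(props.name, cc, OPTIN_SMEM_PER_BLOCK.get(cc, "unknown"))

Note that the kernel reports needing 246272 bytes, which exceeds even the 232448-byte H100 entry above, so simply moving to a bigger GPU may not be enough here.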

Expected behavior

Hello, I'm really not sure whether this is a LLaMA-Factory issue or a problem with the cloud GPU provider. Does anyone know what to do?


codemayq commented 3 days ago

The reproduction command is not posted, so we don't know what process you are running.

GitIgnoreMaybe commented 3 days ago

Hey @codemayq,

Thanks for the help.

llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path microsoft/Phi-3-small-8k-instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --quantization_bit 4 \
    --quantization_method bitsandbytes \
    --template phi \
    --flash_attn fa2 \
    --dataset_dir data \
    --dataset custom_instruct_training_data.json \
    --cutoff_len 1024 \
    --learning_rate 1.0e-04 \
    --num_train_epochs 1.0 \
    --max_samples 1000 \
    --per_device_train_batch_size 5 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --optim adamw_torch \
    --packing False \
    --report_to none \
    --output_dir saves/Phi3-7B-8k-Chat/lora/train_2024-07-05-13-47-27 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --lora_rank 256 \
    --lora_alpha 512 \
    --lora_dropout 0 \
    --lora_target all \
    --val_size 0.1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --per_device_eval_batch_size 5

GitIgnoreMaybe commented 3 days ago

~I think my LoRA rank and LoRA alpha were wrong.~

hiyouga commented 2 days ago

Decrease the train batch size.
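
For instance, keeping the effective batch size of 40 (5 × 8) from the command above, an assumed adjustment to those two flags would be:

    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 40 \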

GitIgnoreMaybe commented 1 day ago

@hiyouga Thanks for the help.

This didn't work either, but I figured out that quantization causes the issue: training works when I'm not quantizing. Sounds like a bug, right?

Failing with this:

llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path microsoft/Phi-3-small-8k-instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --quantization_bit 4 \
    --quantization_method bitsandbytes \
    --template phi \
    --flash_attn fa2 \
    --dataset_dir data \
    --dataset data_query_expansion.json \
    --cutoff_len 512 \
    --learning_rate 0.0001 \
    --num_train_epochs 8.0 \
    --max_samples 1000 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --optim adamw_torch \
    --packing False \
    --report_to none \
    --output_dir saves/Phi3-7B-8k-Chat/lora/output-q4 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all \
    --val_size 0.1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --per_device_eval_batch_size 1

This worked:

llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path microsoft/Phi-3-small-8k-instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --quantization_method bitsandbytes \
    --template phi \
    --flash_attn fa2 \
    --dataset_dir data \
    --dataset data_query_expansion.json \
    --cutoff_len 512 \
    --learning_rate 0.0001 \
    --num_train_epochs 8.0 \
    --max_samples 1000 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --optim adamw_torch \
    --packing False \
    --report_to none \
    --output_dir saves/Phi3-7B-8k-Chat/lora/output-q4 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all \
    --val_size 0.1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --per_device_eval_batch_size 1
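
For anyone triaging this later, a minimal sketch to check whether the crash reproduces outside LLaMA-Factory (an assumption: a single forward pass on the GPU should already hit the blocksparse Triton kernel; the 4-bit settings mirror the failing command above):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit load, mirroring --quantization_bit 4 / --quantization_method bitsandbytes.
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-small-8k-instruct",
    quantization_config=bnb,
    attn_implementation="flash_attention_2",  # matches --flash_attn fa2
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
inputs = tok("Hello world", return_tensors="pt").to(model.device)
out = model(**inputs)  # if the bug is in the model's kernel, OutOfResources should surface here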