FlagOpen / FlagEmbedding

Retrieval and Retrieval-augmented LLMs
MIT License

Training with unsloth #919

Open ZetangForward opened 1 week ago

ZetangForward commented 1 week ago

Currently, Unsloth only supports single-GPU training. How can I implement 8-GPU training with it? Thanks.

namespace-Pt commented 6 days ago

Hi, it's strange, since unsloth was able to use DDP two months ago :anguished:. You may need to wait for a newer version of unsloth, or use an LLM framework like Megatron for efficient training. I'll update the README...

We chose unsloth because it can efficiently tune the Llama-3 model despite its huge vocabulary size, which the native huggingface implementation cannot do. An alternative is to split the loss computation for long sequences into chunks and manually implement the backward pass, so that the full seq_len x vocab_size logits table is never instantiated in GPU memory at once, as sketched below.
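
As a rough illustration of that idea (not unsloth's actual implementation, which is a fused kernel): chunk the sequence and use gradient checkpointing so each chunk's logits are recomputed during backward instead of being kept around. The function name, shapes, and `chunk_size` below are assumptions for illustration only.

```python
import torch
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint


def chunked_lm_loss(hidden_states, lm_head, labels, chunk_size=1024):
    """Token-level cross entropy without materializing the full
    (seq_len x vocab_size) logits tensor at once.

    hidden_states: (seq_len, hidden_size), already shifted so position i
        predicts labels[i]; labels uses -100 for ignored positions.
    lm_head: the output projection module (hidden_size -> vocab_size).
    """

    def chunk_loss(h_chunk, y_chunk):
        # Logits exist only for this chunk: (chunk_len x vocab_size).
        logits = lm_head(h_chunk).float()
        return F.cross_entropy(logits, y_chunk, ignore_index=-100, reduction="sum")

    total_loss = hidden_states.new_zeros(())
    total_tokens = 0
    for start in range(0, hidden_states.size(0), chunk_size):
        h = hidden_states[start:start + chunk_size]
        y = labels[start:start + chunk_size]
        n_valid = (y != -100).sum().item()
        if n_valid == 0:
            continue
        # checkpoint() frees this chunk's logits after the forward pass and
        # recomputes them during backward, so at most one chunk_size x
        # vocab_size slice lives in GPU memory at any time.
        total_loss = total_loss + checkpoint(chunk_loss, h, y, use_reentrant=False)
        total_tokens += n_valid

    return total_loss / max(total_tokens, 1)
```

This trades some extra compute (one recomputation of the lm_head projection per chunk during backward) for memory, which is usually acceptable for very long sequences.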

muratsilahtaroglu commented 5 days ago

            "name": "longllm_qlora_train",
            "type": "debugpy",
            "request": "launch",
            "program": "/home/name/.conda/envs/unsloth2/bin/torchrun",
            "console": "integratedTerminal",
            "cwd": "${workspaceFolder}/Long_LLM/longllm_qlora",
            "justMyCode": false,
            "env": {
                "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "--nproc_per_node","1",
                "-m", "my_train",
                "--data_root", "/data/long-llm",
                "--output_dir", "Long_LLM/longllm_qlora/outputs/llama3-8B",
                "--model_name_or_path", "meta-llama/Meta-Llama-3-8B-Instruct",
                "--train_data", "data/long-llm/infbench/math_find.jsonl",
                "--max_length", "8920",
                "--group_by_length",
                "--rope_theta", "200e6",
                "--attn_impl", "flash_attention_2",
                "--gradient_checkpointing",
                "--use_reentrant", "True",
                "--learning_rate", "5e-5",
                "--num_train_epochs", "1",

                "--save_strategy", "epoch",
                "--logging_steps", "5",
                "--bf16",
                "--lora_tune",
                "--lora_extra_params", "embed_tokens",
                //"--load_in_4_bit",
                "--chat_template", "llama-3"
            ]
        } ```

**Above are my running args and below are my training args.**

```
TrainingArgs(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
colossal_mp=bf16,
colossal_plugin=gemini,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=False,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_beacon_ratio=[32],
eval_beacon_ratio_mix=adapt-1024,
eval_delay=0,
eval_do_concat_batches=True,
eval_max_length=4096,
eval_method=perplexity,
eval_min_length=512,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
gradient_checkpointing_kwargs={'use_reentrant': True},
greater_is_better=None,
group_by_length=True,
group_by_stride=None,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
log_path=data/outputs/metrics.log,
logging_dir=Long_LLM/longllm_qlora/outputs/llama3-8B/runs/Jul02_11-39-46_tga-gpu-a6000,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=5,
logging_strategy=steps,
lora_alpha=16,
lora_dropout=0.0,
lora_extra_params=['embed_tokens'],
lora_rank=32,
lora_targets=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
lora_tune=True,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_eval_num=None,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
metrics=[],
min_length=0,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=1.0,
only_train_beacon=True,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=Long_LLM/longllm_qlora/outputs/llama3-8B,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=1,
per_device_train_batch_size=1,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=False,
report_to=[],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=Long_LLM/longllm_qlora/outputs/llama3-8B,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=epoch,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
sort_by_stride=None,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_colossal=False,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
use_reentrant=True,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
```
**I am using 1 GPU (or CPU), but I get the error below.**

```
Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so enabling it will require much more work, so we have to prioritize. Please understand!
We do have a separate beta version, which you can contact us about!
Thank you for your understanding and we appreciate it immensely!
```
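
For what it's worth, that message appears to be unsloth's multi-GPU guard, so it may help to confirm how many CUDA devices the training process actually sees before unsloth initializes. A small sanity check (assuming GPU 0 is the device you want, as in the launch config above):

```python
# Sanity check: make sure only one CUDA device is visible to the process.
# CUDA_VISIBLE_DEVICES must be set before CUDA is initialized (i.e. before
# the first torch.cuda call), otherwise it has no effect.
import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")  # same value as the launch config

import torch
print(torch.cuda.is_available(), torch.cuda.device_count())  # expect: True 1
```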
namespace-Pt commented 3 days ago

Try the following?

```
CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node 1 -m main.train \
--data_root /data/long-llm \
--output_dir data/outputs/$output_name \
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
--train_data long-llm:gpt/one_detail_book.train.64K.json long-llm:gpt/one_detail_paper.train.64K.json long-llm:gpt/multi_detail_book.train.json long-llm:gpt/multi_detail_paper_short.train.json long-llm:gpt/multi_detail_paper_long.train.json long-llm:gpt/bio_book.train.json long-llm:longalpaca/train.json long-llm:redpajama/train.json[5000] \
--max_length 81920 \
--group_by_length \
--rope_theta 200e6 \
--attn_impl flash_attention_2 \
--gradient_checkpointing \
--use_reentrant True \
--learning_rate 5e-5 \
--num_train_epochs 1 \
--save_only_model \
--save_strategy epoch \
--logging_steps 5 \
--bf16 \
--lora_tune \
--lora_extra_params embed_tokens \
--load_in_4_bit \
--chat_template llama-3
```