meta-llama/llama3

The official Meta Llama 3 GitHub site

LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32 #188

Closed: SidneyLann closed this issue 3 weeks ago

SidneyLann commented 4 weeks ago


Describe the bug

Fine-tuning Llama 3 with unsloth + trl on a GTX 1080 Ti crashes with `LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32`.

Minimal reproducible example

```python
import os
import json
import torch
from datasets import load_from_disk
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

DATA_HOME = "/home/sidney/app"

# Defining the configuration for the base model, LoRA and training
config = {
    "hugging_face_username": "Shekswess",
    "model_config": {
        "base_model": os.path.join(DATA_HOME, "model_cn"),        # The base model
        "finetuned_model": os.path.join(DATA_HOME, "model_out"),  # The fine-tuned model
        "max_seq_length": 8192,   # The maximum sequence length
        "dtype": torch.float16,   # The data type
        "load_in_4bit": True,     # Load the model in 4-bit
    },
    "lora_config": {
        "r": 16,  # The LoRA rank (8, 16, 32, 64)
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],  # The target modules
        "lora_alpha": 16,     # The alpha value for LoRA
        "lora_dropout": 0,    # The dropout value for LoRA
        "bias": "none",       # The bias for LoRA
        "use_gradient_checkpointing": True,  # Use gradient checkpointing
        "use_rslora": False,  # Use RSLoRA
        "use_dora": False,    # Use DoRA
        "loftq_config": None  # The LoftQ configuration
    },
    "training_dataset": {
        "name": os.path.join(DATA_HOME, "llama3_instruct_dataset"),  # The dataset name (huggingface/datasets)
        "split": "train",         # The dataset split
        "input_field": "prompt",  # The input field
    },
    "training_config": {
        "per_device_train_batch_size": 2,  # The batch size
        "gradient_accumulation_steps": 4,  # The gradient accumulation steps
        "warmup_steps": 5,      # The warmup steps
        "max_steps": 0,         # The maximum steps (0 if the epochs are defined)
        "num_train_epochs": 1,  # The number of training epochs (0 if the maximum steps are defined)
        "learning_rate": 2e-4,  # The learning rate
        "fp16": not torch.cuda.is_bf16_supported(),  # Use fp16 when bf16 is unavailable
        "bf16": torch.cuda.is_bf16_supported(),      # Use bf16 when supported
        "logging_steps": 1,                # The logging steps
        "optim": "adamw_8bit",             # The optimizer
        "weight_decay": 0.01,              # The weight decay
        "lr_scheduler_type": "linear",     # The learning rate scheduler
        "seed": 42,                        # The seed
        "output_dir": "outputs",           # The output directory
    }
}

# Loading the model and the tokenizer for the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config.get("model_config").get("base_model"),
    max_seq_length = config.get("model_config").get("max_seq_length"),
    dtype = config.get("model_config").get("dtype"),
    load_in_4bit = config.get("model_config").get("load_in_4bit"),
)

# Setup for QLoRA/LoRA peft of the base model
model = FastLanguageModel.get_peft_model(
    model,
    r = config.get("lora_config").get("r"),
    target_modules = config.get("lora_config").get("target_modules"),
    lora_alpha = config.get("lora_config").get("lora_alpha"),
    lora_dropout = config.get("lora_config").get("lora_dropout"),
    bias = config.get("lora_config").get("bias"),
    use_gradient_checkpointing = config.get("lora_config").get("use_gradient_checkpointing"),
    random_state = 42,
    use_rslora = config.get("lora_config").get("use_rslora"),
    use_dora = config.get("lora_config").get("use_dora"),
    loftq_config = config.get("lora_config").get("loftq_config"),
)

# Loading the training dataset
dataset_train = load_from_disk(config.get("training_dataset").get("name"))['train']
print(dataset_train)

# Setting up the trainer for the model
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = config.get("training_dataset").get("input_field"),
    max_seq_length = config.get("model_config").get("max_seq_length"),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = config.get("training_config").get("per_device_train_batch_size"),
        gradient_accumulation_steps = config.get("training_config").get("gradient_accumulation_steps"),
        warmup_steps = config.get("training_config").get("warmup_steps"),
        max_steps = config.get("training_config").get("max_steps"),
        num_train_epochs = config.get("training_config").get("num_train_epochs"),
        learning_rate = config.get("training_config").get("learning_rate"),
        fp16 = config.get("training_config").get("fp16"),
        bf16 = config.get("training_config").get("bf16"),
        logging_steps = config.get("training_config").get("logging_steps"),
        optim = config.get("training_config").get("optim"),
        weight_decay = config.get("training_config").get("weight_decay"),
        lr_scheduler_type = config.get("training_config").get("lr_scheduler_type"),
        seed = 42,
        output_dir = config.get("training_config").get("output_dir"),
    ),
)

# Memory statistics before training
gpu_statistics = torch.cuda.get_device_properties(0)
reserved_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 2)  # bytes -> GB
max_memory = round(gpu_statistics.total_memory / 1024**3, 2)            # bytes -> GB
print(f"Reserved Memory: {reserved_memory}GB")
print(f"Max Memory: {max_memory}GB")

# Training the model
trainer_stats = trainer.train()
```

Output

The failure is `LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32`; the full output is in the screenshot under Additional context.

Runtime Environment

Additional context

[Screenshot of the error output attached.]

Same error here. As reported in https://github.com/state-spaces/mamba/issues/173, the RTX 3080 Ti works, but the GTX 1080 Ti does not support the architecture needed to run the shfl.sync.bfly intrinsics.
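A quick way to check whether a card is affected is to query its compute capability before training. A minimal sketch, assuming (following the mamba issue above) that the Triton-compiled kernels behind this intrinsic need a GPU newer than the GTX 1080 Ti's sm_61; the sm_70 threshold is an assumption based on Triton's stated minimum, not something confirmed in this thread:

```python
# Minimal sketch: check whether the local GPU is likely to support the
# Triton-compiled kernels that emit shfl.sync.bfly. The (7, 0) threshold
# is an assumption based on Triton's documented minimum.
import torch

major, minor = torch.cuda.get_device_capability(0)
name = torch.cuda.get_device_name(0)
print(f"{name}: compute capability {major}.{minor}")

if (major, minor) < (7, 0):
    # GTX 1080 Ti reports (6, 1); RTX 3080 Ti reports (8, 6)
    print("This GPU predates sm_70 and will likely fail with "
          "'Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32'.")
```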

SidneyLann commented 3 weeks ago

Why is TensorFlow showing up in the log? Shouldn't this be using PyTorch instead?

```
2024-05-06 11:38:03.478993: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
```
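That line is most likely transformers probing for an installed TensorFlow at import time, not training actually running on TensorFlow. A minimal sketch, assuming that probe is the source of the message: transformers reads the USE_TF/USE_TORCH environment variables, so setting them before the first import skips TensorFlow entirely.

```python
# Minimal sketch: tell transformers not to probe/import TensorFlow.
# USE_TF and USE_TORCH are environment switches read by transformers at import.
import os
os.environ["USE_TF"] = "0"      # must be set before importing transformers
os.environ["USE_TORCH"] = "1"   # prefer the PyTorch backend

from transformers import TrainingArguments  # no TensorFlow banner now
```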