I decodes the prediction samples of Llama but it seems not right, am I training it well?

Hello, I was training Llama 3.1 (instruct) on my dataset. I am using this code I created:

import torch
from trl import SFTTrainer
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, get_cosine_schedule_with_warmup
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
import seaborn as sns
import matplotlib.pyplot as plt
import evaluate
from transformers import TrainerCallback, EarlyStoppingCallback
import nltk
import numpy as np

nltk.download('punkt')

max_seq_length = 2048

model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "up_proj",
        "down_proj", "o_proj", "gate_proj"
    ],
    bias="none",
    use_gradient_checkpointing="unsloth",
    use_rslora=True,
    loftq_config=None,
)

tokenizer = get_chat_template(
    tokenizer,
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    chat_template="llama-3",
)

def format_synthesis(example):
    prompt = example['prompt']
    completion = example['completion']
    if completion.strip() in ["NA", "NO"]:
        print(f"Invalid entry skipped: {example}")
        return None
    cleaned_prompt = ' '.join(prompt.strip().replace('\\n', ' ').replace('\\', '').replace('"', '').replace("'", '').split())
    cleaned_completion = ' '.join(completion.strip().replace('\\n', ' ').replace('\\', '').replace('"', '').replace("'", '').split())
    messages = [
        {'from': 'human', 'value': cleaned_prompt},
        {'from': 'gpt', 'value': cleaned_completion}
    ]
    return {'messages': messages}

dataset = load_dataset('json', data_files='finetuning_file_cleaned.jsonl', split='train')

print("Muestra del dataset original:")
print(dataset[0])

dataset = dataset.map(
    format_synthesis,
    remove_columns=dataset.column_names,
    num_proc=2
)

dataset = dataset.filter(lambda example: example is not None)

print("\nMuestra después del formateo:")
print(dataset[0])

def apply_template(example):
    text = tokenizer.apply_chat_template(
        example['messages'],
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text": text}

dataset = dataset.map(
    apply_template,
    num_proc=2
)

print("\nMuestra después de aplicar el template:")
print(dataset[0]['text'])

dataset = dataset.filter(lambda example: 'text' in example and example['text'])

train_valid = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_valid['train']
eval_dataset = train_valid['test']

print("\nEjemplos del dataset de entrenamiento:")
for i in range(2):
    print(f"\nEjemplo {i}:")
    print(train_dataset[i]['text'])

training_args = TrainingArguments(
    learning_rate=5e-6,  
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  
    num_train_epochs=3,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=10,
    optim="adamw_torch",
    weight_decay=0.01,
    warmup_steps=50,
    output_dir="output",
    seed=42,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    logging_dir="logs",
    report_to=["none"],
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,
    eval_accumulation_steps=8,
    max_grad_norm=1.0,
)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay
)

num_training_steps = int(
    len(train_dataset) / (
        training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
    ) * training_args.num_train_epochs
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps
)

rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    print(f"Predictions type: {type(preds)}, shape: {preds.shape}")
    print(f"Labels type: {type(labels)}, shape: {labels.shape}")

    if isinstance(preds, torch.Tensor):
        preds = preds.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    print(f"Predictions after argmax (if applied), shape: {preds.shape}")

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    print(f"Predictions after tolist: {type(preds)}, length: {len(preds)}")
    print(f"First prediction sample: {preds[0][:10]}") 

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    print(f"First few decoded predictions: {decoded_preds[:2]}")
    print(f"First few decoded labels: {decoded_labels[:2]}")

    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    print("Calculated metrics:", result)
    return result

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    dataset_text_field="text",
    args=training_args,
)

class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, **kwargs):
        if 'loss' in state.log_history[-1]:
            print(f"Paso {state.global_step}: pérdida={state.log_history[-1]['loss']}")
        if 'eval_loss' in state.log_history[-1]:
            print(f"Pérdida de validación en el paso {state.global_step}: {state.log_history[-1]['eval_loss']}")

    def on_evaluate(self, args, state, control, **kwargs):
        metrics = kwargs['metrics']
        print(f"Métricas en el paso {state.global_step}: {metrics}")

trainer.add_callback(CustomCallback())
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

trainer.train()

model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")

import seaborn as sns
import matplotlib.pyplot as plt

training_loss = []
validation_loss = []
rouge1 = []
rouge2 = []
rougeL = []
rougeLsum = []
eval_steps = []

for log in trainer.state.log_history:
    if 'loss' in log:
        training_loss.append(log['loss'])
    if 'eval_loss' in log:
        validation_loss.append(log['eval_loss'])
        eval_steps.append(log['step'])
    if 'eval_rouge1' in log:
        rouge1.append(log['eval_rouge1'])
    if 'eval_rouge2' in log:
        rouge2.append(log['eval_rouge2'])
    if 'eval_rougeL' in log:
        rougeL.append(log['eval_rougeL'])
    if 'eval_rougeLsum' in log:
        rougeLsum.append(log['eval_rougeLsum'])

sns.set(style='white', context='talk', palette='pastel')

def add_value_labels(ax, spacing=5):
    lines = ax.get_lines()
    for line in lines:
        x_data = line.get_xdata()
        y_data = line.get_ydata()
        if len(x_data) > 0 and len(y_data) > 0:
            ax.annotate(
                f'{y_data[-1]:.2f}',
                xy=(x_data[-1], y_data[-1]),
                xytext=(5, 0),
                textcoords='offset points',
                ha='left',
                va='center',
                fontsize=10
            )

plt.figure(figsize=(12, 6))
steps = list(range(len(training_loss)))
plt.plot(steps, training_loss, label='Pérdida de Entrenamiento', linewidth=2)
if validation_loss:
    validation_steps = eval_steps
    plt.plot(
        validation_steps,
        validation_loss,
        label='Pérdida de Validación',
        linewidth=2
    )

plt.xlabel('Pasos')
plt.ylabel('Pérdida')
plt.title('Pérdida de Entrenamiento y Validación')
plt.legend()
ax = plt.gca()
add_value_labels(ax)
plt.tight_layout()
plt.savefig('loss_curve2.png', dpi=300, format='png')
plt.close()

if rouge1:
    plt.figure(figsize=(12, 6))
    plt.plot(eval_steps, rouge1, label='ROUGE-1', linewidth=2)
    plt.plot(eval_steps, rouge2, label='ROUGE-2', linewidth=2)
    plt.plot(eval_steps, rougeL, label='ROUGE-L', linewidth=2)
    plt.plot(eval_steps, rougeLsum, label='ROUGE-Lsum', linewidth=2)

    plt.xlabel('Pasos de Evaluación')
    plt.ylabel('Puntuación ROUGE')
    plt.title('Puntuaciones ROUGE durante la Evaluación')
    plt.legend()
    ax = plt.gca()
    add_value_labels(ax)
    plt.tight_layout()
    plt.savefig('rouge_scores2.png', dpi=300, format='png')
    plt.close()

Unfortunately, I when I print the decoded labels form the model to compute my metrics, I get just nonsense: Predictions type: <class 'numpy.ndarray'>, shape: (179, 1064, 128256) Labels type: <class 'numpy.ndarray'>, shape: (179, 1064) Predictions after argmax (if applied), shape: (179, 1064) Predictions after tolist: <class 'numpy.ndarray'>, length: 179 First prediction sample: [ 14924 128006 128007 271 40 315 220 11 320 27560] First few decoded predictions: ['Question\n\nI of, ( Cu Type: Alloy, Crystal Motif: Face, Crystal Structure: Face, Morphology: NAocrar, Size:, Shape: NA-shaped,assistant\n\nA 1: Synolve 0 mg of Auachecyl trim (HDA) in 10 mL of toionized water. a 10 ml glass-bottom flaskial. Step 2: Addicate the H for 30 minutes to theDA is fully dissolved. Step 3: Add 1.5 ml of uCl4 (50.1 M) to 0.3 ml of CuCl2 (0.1 M) toous solutions to the vial. son stirring. Step 4: Continue the mixture at to 100°CC for a oil bath under Step 5: Maintain few minutes, the mixture turns dark color to blue yellow, indicating the to turnss0.3 ml of prepared DT+)-glucose (0 M) intoous solution into the v mixture. Step 6: Continue the vial and let it mixture for 30 minutes. Step 7: Remove the temperature to 120 °C and maintain heat the 30 minutes minutes. Step 8: Remove the mixture to cool down to room temperature. Step 9: Centify the product nan solutionrown precipCu nan by times by deone and ethanol to centrifugation. 100 rpm for 10 minutes. Step 10: Collect-disperse the Auitate Au in. Step TEM preparation of Au Auostars, Step 1: Diss 1.5 ml of HAuCl4 (0.1 M) and 0.3 ml of CuCl2 (0.05 M) aque. a 10 ml round bottom vial. Step 2: Add 0 ml of ethanolleylamine (OLA) to the vial under Step 3: Heat the mixture up an oil bath at 150 °C for 30 hours. Step 4: After the mixtureidal solution to room temperature. Step 5: Pur few of oform and ethanol ( the v and centrif centrifuge at 6000 rpm for remove the imp OLA. Step 6: Re-disperse the precipitated particles in ethanol. Step!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!', 'Question\n\nIassistant\n\nIt is no words instructions reactions for. the prompt text. I be used into a instructions concise-by-step instructions. the catals.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'] First few decoded labels: ['user\n\nComposition: AuCu, Material Type: Metal, Structural Motif: NA, Crystal Structure: NA, Morphology: nanostars, Size: NA, Shape: star-shapedassistant\n\nStep 1: Dissolve 45 mg of hexadecylamine (HDA) into 4 ml of deionized water in a 40 ml round bottom vial. Step 2: Sonicate the mixture for 30 minutes until HDA is completely dissolved. Step 3: Add 0.3 ml of HAuCl4 (0.1 M) and 0.3 ml of CuCl2 (0.1 M) aqueous solutions to the vial under magnetic stirring. Step 4: Heat the mixture up to 100 °C in an oil bath. Step 5: After a few minutes, the solution turns from green to light-blue, and then quickly inject 0.3 ml of freshly prepared D-(+)-glucose (1 M) aqueous solution into the hot mixture. Step 6: Cap the vial and stir the mixture for 30 minutes. Step 7: Increase the temperature to 150 °C and further stir for 10 more minutes. Step 8: Allow the mixture to cool down to room temperature. Step 9: Purify the resulting purple-brown AuCu solution several times with acetone and ethanol by centrifugation at 6000 rpm for 5 minutes. Step 10: Re-disperse the precipitated particles in ethanol. For the synthesis of rounded nanostars: Step 1: Mix 0.3 ml of HAuCl4 (0.05 M) and 0.3 ml of CuCl2 (0.1 M) in ethanol in a 40 ml round bottom vial. Step 2: Add 4 ml of oleylamine (OLA) to the vial. Step 3: Heat the mixture in an oil bath at 130 °C for 2 hours. Step 4: Cool the colloidal solution to room temperature. Step 5: Add a mixture of chloroform and ethanol to the solution and then centrifuge at 3000 rpm to remove any residual OLA. Step 6: Re-disperse the precipitated nanoparticles in ethanol.', 'user\n\nassistant\n\nThere are no specific synthesis procedures mentioned in the provided text that can be converted into clear, step-by-step instructions for heterogeneous catalyst synthesis.'] Calculated metrics: {'rouge1': 68.79999377501494, 'rouge2': 38.45706763890132, 'rougeL': 58.38013880947452, 'rougeLsum': 59.16003761551416} {'eval_loss': 1.4974863529205322, 'eval_rouge1': 68.79999377501494, 'eval_rouge2': 38.45706763890132, 'eval_rougeL': 58.38013880947452, 'eval_rougeLsum': 59.16003761551416, 'eval_runtime': 266.1983, 'eval_samples_per_second': 0.672, 'eval_steps_per_second': 0.672, 'epoch': 0.56} Pérdida de validación en el paso 50: 1.4974863529205322.

Any of you can help me? Do you know how can I Finetune the base model?

Thank you.

unslothai / unsloth

I decodes the prediction samples of Llama but it seems not right, am I training it well? #1114