unslothai / unsloth

Finetune Llama 3, Mistral, Phi & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai

Where are the logs going?? #691

Open whymusticode opened 1 week ago

whymusticode commented 1 week ago

I cannot get logging to work with Unsloth. What am I doing wrong?

I'm doing the following:

from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

args = TrainingArguments(
    per_device_train_batch_size = batchSize,
    per_device_eval_batch_size = batchSize,
    gradient_accumulation_steps = gradAccum,
    warmup_steps = 50,
    learning_rate = 10e-5,
    num_train_epochs = 5,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    output_dir = saveFolder,
    optim = "adamw_8bit",
    seed = 3407,
    save_strategy='steps',
    save_steps = saveSteps,

    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",  
    evaluation_strategy = "steps",
    eval_steps = saveSteps,

    logging_steps = 1,

    logging_dir = saveFolder+'/log',
    log_level = "info",  
)

I believe https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py is the correct documentation. I've spent a few hours trying different combinations of arguments.

Nothing shows up in saveFolder + '/log', and I can't find any log files being generated anywhere.
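
For completeness, the args above go into a plain SFTTrainer roughly like this (trimmed from my actual script, so model / tokenizer / dataSubSplit are stand-ins for my real objects):

from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataSubSplit['train'],
    eval_dataset = dataSubSplit['test'],
    dataset_text_field = "transcript",
    max_seq_length = max_seq_length,
    args = args,
)
trainer.train()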

whymusticode commented 1 week ago
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

class CustomTrainer(SFTTrainer):
    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        # run the normal evaluation, then append a one-line summary to a text file
        results = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        validation_loss = results[f"{metric_key_prefix}_loss"]
        lr = self.optimizer.param_groups[0]['lr']
        out = f'Step: {self.state.global_step}, Validation Loss: {validation_loss:.4f}, Learning Rate: {lr:.6f}'
        # logging_dir here is a variable defined earlier in my script
        with open(logging_dir + '/eval_log.txt', 'a') as logSave:
            logSave.write(out + '\n')
        return results

    def log(self, logs):
        # keep the default logging behaviour, and also mirror each logs dict to a file
        super().log(logs)
        with open(logging_dir + '/train_log.txt', 'a') as logSave:
            logSave.write(str(logs) + '\n')

trainer = CustomTrainer(
    label_weights=None,
    model=model,
    train_dataset=dataSubSplit['train'],
    eval_dataset=dataSubSplit['test'],
    dataset_text_field="transcript",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback],
    args=TrainingArguments(
        per_device_train_batch_size=batchSize,
        per_device_eval_batch_size=batchSize,
        gradient_accumulation_steps=gradAccum,
        warmup_steps=50,
        learning_rate=10e-5,
        num_train_epochs=5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        output_dir=saveFolder,
        optim="adamw_8bit",
        seed=3407,
        save_strategy='steps',
        save_steps=saveSteps,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        evaluation_strategy="steps",
        eval_steps=saveSteps,
        logging_steps=1,
        logging_dir=logging_dir,
        log_level="info",
    ),
)

^ This works just fine, but I figure it shouldn't be necessary; I just want to know what the right way to do it is.
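
If the idiomatic route is a TrainerCallback instead of subclassing, I'd guess at something like the sketch below (untested, and FileLoggingCallback plus the file path are just names I made up):

from transformers import TrainerCallback

class FileLoggingCallback(TrainerCallback):
    """Append every logs dict the Trainer emits to a plain text file."""
    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(self.log_path, 'a') as f:
                f.write(f"step {state.global_step}: {logs}\n")

# then pass it alongside the existing early-stopping callback:
# callbacks = [early_stopping_callback, FileLoggingCallback(saveFolder + '/log/train_log.txt')]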

danielhanchen commented 4 days ago

Many apologies for the delay - my bro and I relocated to the US, hence the slowness.

TBH I never bothered to check where the logs are going lol - it should be in logging_dir. Is it an absolute or relative path? Maybe try using an absolute path.
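
Something like this maybe (untested sketch - the report_to bit is just my guess at why nothing gets written, since as far as I know logging_dir is only read by integrations like TensorBoard):

import os
from transformers import TrainingArguments

saveFolder = "outputs"  # placeholder - use your actual folder

args = TrainingArguments(
    output_dir = saveFolder,
    logging_dir = os.path.abspath(os.path.join(saveFolder, "log")),  # absolute path
    logging_steps = 1,
    report_to = "tensorboard",  # needed for anything to land in logging_dir
)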