Closed gabrielcolab closed 2 weeks ago
I wonder why the following works with the Zephyr syntax: I only used Unsloth for converting to GGUF, and it works perfectly with the Zephyr syntax (referenced above).
# -*- coding: utf-8 -*-
Automatically generated by Colaboratory.
# Commented out IPython magic to ensure Python compatibility.
# %%capture
# import torch
# major_version, minor_version = torch.cuda.get_device_capability()
# if major_version >= 8:
# # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
# !pip install "unsloth[colab_ampere] @ git+"
# else:
# # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
# !pip install "unsloth[colab] @ git+"
# pass
from google.colab import files
# Open Colab's file-upload prompt and keep what it returns.
# NOTE(review): presumably this is how "bible.csv" (read further down) gets into the runtime — confirm.
uploaded = files.upload()
import os
import pandas as pd
import json
import torch
import shutil
from os import system
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig
from peft.utils.other import prepare_model_for_kbit_training
from transformers import (
from trl import SFTTrainer
# Build the fine-tuning dataset: read the verse table from "bible.csv", turn
# every row into a {"label": "<book> <chapter>:<verse>", "text": <verse text>}
# record and write all records to "data.json" under a top-level "json" key.
print("Loading the dataset...")
df = pd.read_csv("bible.csv")

# Select rows starting from the 2nd row
# NOTE(review): this drops the first data row of the CSV — confirm it is intentional.
df = df.iloc[1:, :]

# Select relevant columns and rename them
df = df[['book', 'chapter', 'verse', 'text']]
df.columns = ['label', 'chapter', 'verse', 'text']

# Concatenate 'label', 'chapter', and 'verse' into a single 'label' column
df['label'] = df['label'].astype(str) + ' ' + df['chapter'].astype(str) + ':' + df['verse'].astype(str)

# Select relevant columns
df = df[['label', 'text']]

# Wrap the records in a {"json": [...]} object. Round-tripping through
# DataFrame.to_json keeps pandas' own serialisation rules (e.g. NaN -> null),
# while building the wrapper as a dict avoids splicing characters into the
# JSON text by hand, which left the object without its closing brace.
result = {"json": json.loads(df.to_json(orient="records"))}

with open('data.json', 'w') as json_file:
    json.dump(result, json_file)
# Load the base checkpoint, configure a LoRA adapter on it and load the
# matching tokenizer.
# NOTE(review): several calls in this section are visibly truncated in this
# copy (no closing ")" and, for from_pretrained, no model-name argument);
# restore the missing lines from the original notebook before running.
modelName = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# NOTE(review): no checkpoint argument and no closing parenthesis are visible for this call.
model = AutoModelForCausalLM.from_pretrained(
device_map = 'auto',
trust_remote_code = True,
token = False,
model = prepare_model_for_kbit_training( model )
# LoRA hyper-parameters; NOTE(review): the closing parenthesis of LoraConfig( is not visible.
peft_config = LoraConfig(
r = 32,
lora_alpha = 16,
bias = "none",
lora_dropout = 0.05, # Conventional
task_type = "CAUSAL_LM",
model.add_adapter( peft_config )
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained( modelName, trust_remote_code = True, use_fast=False )
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Remove a leftover "./temp" directory if one exists.
# NOTE(review): the body of this `if` has lost its indentation in this copy.
if os.path.isdir( "./temp" ):
shutil.rmtree( "./temp" )
model.config.use_cache = False
# def formatting_func( example ):
# text = f"<|system|></s>\n<|user|>Question: { example['text'] }\n</s><|assistant|>Answer: { example['index'] }</s>\n"
# return text
def formatting_func(example):
    """Render one dataset record as a Zephyr-style chat transcript.

    Args:
        example: Mapping with a ``'label'`` entry (used as the user question)
            and a ``'text'`` entry (used as the assistant answer).

    Returns:
        The training prompt: an empty system turn, the user turn and the
        assistant turn. Every role tag is followed by a newline and every
        turn is closed with ``</s>`` plus a newline.
    """
    # The Zephyr layout puts a newline after each role tag and ends each turn
    # with "</s>\n"; the previous string omitted the newline after the tags
    # and inserted one before the user turn's "</s>".
    text = (
        "<|system|>\n</s>\n"
        f"<|user|>\nQuestion: {example['label']}</s>\n"
        f"<|assistant|>\nAnswer: {example['text']}</s>\n"
    )
    return text
def generate_and_tokenize_prompt( prompt ):
    """Format a dataset record into prompt text and tokenize it.

    The record is rendered with ``formatting_func`` and the resulting string
    is passed to the module-level ``tokenizer`` with truncation enabled and a
    maximum length of 2048.
    """
    rendered = formatting_func( prompt )
    return tokenizer( rendered, truncation = True, max_length = 2048 )
# Load the records stored under the "json" field of data.json as the train split.
dataset = load_dataset("json", data_files="data.json", field='json', split="train")
# NOTE(review): this line is garbled in this copy (unbalanced ")"); presumably it
# applied generate_and_tokenize_prompt to every record of the dataset — confirm.
dataset = generate_and_tokenize_prompt )
# Trainer hyper-parameters.
# NOTE(review): the closing parenthesis of TrainingArguments( is not visible.
training_arguments = TrainingArguments(
per_device_train_batch_size = 2,
gradient_accumulation_steps = 4,
warmup_ratio = 0.1,
num_train_epochs = 1,
learning_rate = 2e-5,
# Exactly one of fp16 / bf16 is enabled, chosen by whether CUDA reports bf16 support.
fp16 = not torch.cuda.is_bf16_supported(),
bf16 = torch.cuda.is_bf16_supported(),
logging_steps = 1,
optim = "adamw_8bit",
weight_decay = 0.1,
lr_scheduler_type = "linear",
seed = 3407,
output_dir = "outputs",
# NOTE(review): the SFTTrainer arguments (and any subsequent train call) are missing from this copy.
trainer = SFTTrainer(
# Clear directories left over from earlier runs ("./outputs" first, then the
# save target), then write the model and tokenizer to the save target.
directory = "trained"
for stale_dir in ( "./outputs", directory ):
    if os.path.isdir( stale_dir ):
        shutil.rmtree( stale_dir )
model.save_pretrained( directory )
tokenizer.save_pretrained( directory )
print( f"Model saved '{directory}'." )
import os
import shutil
import torch
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Second script: load the base model, apply the adapter saved in "./trained",
# call merge_and_unload on it and save the result plus tokenizer to "./tinyKJV".
# Silence all Python warnings for the rest of the run.
warnings.filterwarnings( "ignore" )
baseModel = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
peft = "./trained"
outdir = "./tinyKJV"
print( f"Loading base model: {baseModel}")
# NOTE(review): this call is truncated in this copy — no checkpoint argument
# and no closing parenthesis are visible.
model = AutoModelForCausalLM.from_pretrained(
torch_dtype = torch.float16,
device_map = "cuda"
print( f"Loading PEFT: {peft}" )
model = PeftModel.from_pretrained( model, peft )
print( "Running merge_and_unload" )
model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained( baseModel )
# Start from an empty output directory.
# NOTE(review): the body of this `if` has lost its indentation in this copy.
if os.path.isdir( outdir ):
shutil.rmtree( outdir )
model.save_pretrained( outdir )
tokenizer.save_pretrained( outdir )
print( f"Model saved to {outdir}" )
from unsloth import FastLanguageModel
# Third script: load a model through Unsloth, stream a sample generation and
# export the model as a q4_k_m GGUF.
# NOTE(review): the from_pretrained call, the `messages` list and the
# apply_chat_template call below are all truncated in this copy (missing
# arguments, braces and closing brackets); restore them before running.
model, tokenizer = FastLanguageModel.from_pretrained(
load_in_4bit = True,
max_seq_length = 2048,
messages = [
"role": "system",
"content": "You a helpful assistant of the Bible named Christian.",
{"role": "user", "content": "What is Genesis 1:1 about?"},
prompt = tokenizer.apply_chat_template(
tokenize = True,
add_generation_prompt = True,
return_tensors = "pt",
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
from transformers import TextStreamer
# Stream generated tokens as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(prompt, streamer = text_streamer, max_new_tokens = 120)
model.save_pretrained("tinyKJV") # Local saving
# Token passed to push_to_hub_gguf below; it is an empty string here.
# Keep real access tokens out of shared notebooks.
token = ""
#model.push_to_hub("oliverbob/bibleai", token = token) # Online saving
# Save to q4_k_m GGUF
# The `if False:` / `if True:` prefixes are manual on/off switches: local GGUF
# saving is disabled, pushing the GGUF to the hub is enabled.
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if True: model.push_to_hub_gguf("oliverbob/biblegpt", tokenizer, quantization_method="q4_k_m", token=token)
But when I tried to make changes to the "length of the dataset", it didn't seem to work well.
Was gonna mention first maybe your Zephyr template is partially a bit off :))
texts = f"<|system|></s>\n<|user|>Question: {example['context']}\n</s><|assistant|>Answer: {example['response']}</s>\n"
should rather be
texts = f"<|system|>\n</s>\n<|user|>\nQuestion: {example['context']}</s>\n<|assistant|>\nAnswer: {example['response']}</s>\n"
The Ollama model file has
TEMPLATE """<|system|>
{{ .System }}</s>
{{ .Prompt }}</s>
so you need to append newlines to the user / assistant.
But if normal HF works, I'm guessing packing = True
is breaking things maybe? Have you tried maybe turning it off? (After using the correct format as well :) )
Was gonna mention just added chat templates to Unsloth a few minutes ago :) for ChatML. Zephyr, Vicuna etc are all supported - maybe these can help?
Thanks for this discussion. I have recently released OpenBible finetuned with Tinyllama. Like what Gab has said, the unsloth base model (tinyllama-bnb-4bit) results are not good, or doesn't make any sensible chat responses (ignores zephyr template). But the TinyLlama/TinyLlama-1.1B-Chat-v1.0 does fine. The zephyr syntax is also working good.
The only downside is that even if you feed TinyLlama a large dataset like this one, it hallucinates often.
Is there any way we can improve the model during the finetune phase?
The only significant setting I modified was: weight_decay = 0.0001. Here is the actual notebook for the OpenBible to play around the code for finetuning research purposes.
Oh was gonna say link was right :)) No need for models :)
Oh TinyLlama was pretrained by the TinyLlama team - I just quantized it for easier use :) I just added unsloth/tinyllama-chat
and unsloth/tinyllama-chat-bnb-4bit
for 4x faster downloading as well!
Sadly small models can hallucinate and cause issues - I recommend upping weight decay to 0.01, reducing the learning rate to 2e-5, increasing the lora rank to say 32 or 64, setting lora alpha to 2x the lora rank, and I would increase the batch size to 4 or 8. This can make the model less likely to hallucinate.
Again great work on the model though :)
I will try this on the scripts shared. We'll see how it goes.
I have a question about merging:
If I have saved/pushed model.push_to_hub("user/lora_model"); previously saved.
Then I finetune again and If I do
if True: model.save_pretrained_merged("saved_model", tokenizer, save_method = "lora",)
if True: model.push_to_hub_merged("user/lora_model", tokenizer, save_method = "lora", token = token)
so that when I do a fresh train() on the lora_model (and not unsloth, or tinyllama) for the following model:
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "user/lora_model",
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
trainer.train() again new data will be augmented into the old model if I do push_to_hub_merged("user/lora_model")?
I guess my question is, will it augment the new training data into the model everytime this function or changes is pushed?
@bethelangela Cool - tell me how it goes! :)
@oliverbob So when you save a LoRA model that you just finetuned, then load your LoRA model for further finetuning, you're altering the LoRA model you just changed on new data - so yes, every time you reload it, new data will be added to it.
There might be catastrophic forgetting, so I suggest with new finetunes with your LoRA model, I would maybe add a few examples from your first training run over to the new training run.
@oliverbob hey, I just went through the Colab you shared before. I tried to find a way to send a private message, but there was no way to do it. You have a Hugging Face token in the notebook. Please check.
What could be wrong with this notebook?
It trained successfully but it didn't follow the Zephyr syntax in Ollama. Am I missing something?
I have not gotten my first successful/useful project in Unsloth.
Any help is appreciated. Happy hearts to everyone.