Closed mobassir94 closed 2 months ago
solved :
import pandas as pd
import dspy
from dspy.retrieve import faiss_rm
from dsp.modules.sentence_vectorizer import SentenceTransformersVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate
from dspy.evaluate import answer_exact_match
from dspy.retrieve.faiss_rm import FaissRM
# Sentence-embedding model used to vectorize text for the FAISS retriever.
vectorizer = SentenceTransformersVectorizer(
    model_name_or_path="l3cube-pune/bengali-sentence-similarity-sbert",
)

# Load the validation split of the Bangla SQuAD dataset into a DataFrame.
dataset_name = "csebuetnlp/squad_bn"
ds = load_dataset(dataset_name, trust_remote_code=True)
df = pd.DataFrame(ds["validation"])


def _first_answer_text(answer):
    """Return the first entry of ``answer['text']``, or '' when it is empty."""
    texts = answer["text"]
    if texts:
        return texts[0]
    return ""


# Collapse each answer record to a single reference string.
df["answers"] = df["answers"].apply(_first_answer_text)

# Keep only rows where question, answer and context are all present and
# non-empty, then renumber the rows from 0.
_qa_columns = ["question", "answers", "context"]
df = df[_qa_columns].dropna()
_all_non_empty = (df[_qa_columns] != "").all(axis=1)
df = df[_all_non_empty].reset_index(drop=True)

# NOTE: an interesting problem was observed with these generators:
# BanglaLLM/BanglaLLama-3-8b-BnWiki-Instruct and BanglaLLM/bangla-llama-7b-base-v0.1
generator_model = "unsloth/gemma-2-9b-it-bnb-4bit"
def evaluate_multilingual_model(retriever_model="l3cube-pune/bengali-sentence-similarity-sbert",
                                generator_model=generator_model,
                                df=None,
                                debug_mode=False):
    """Compile a Bangla RAG program with DSPy and evaluate it on a held-out split.

    The contexts in ``df`` are indexed with a FAISS retriever, a Hugging Face
    generator is configured as the language model, a retrieve-then-answer
    module is compiled with ``BootstrapFewShot`` on 85% of the rows, and the
    compiled program is scored with exact match on the remaining 15%.

    Args:
        retriever_model: Sentence-transformers model name or path used to
            embed the contexts for retrieval.
        generator_model: Hugging Face model name passed to ``dspy.HFModel``.
        df: DataFrame with ``question``, ``answers`` and ``context`` columns.
            Required, despite the ``None`` default kept for compatibility.
        debug_mode: When true, only the first 5 rows of ``df`` are used.

    Returns:
        Whatever the ``Evaluate`` call returns for the compiled program
        (presumably the aggregate exact-match score; confirm against the
        installed DSPy version).

    Raises:
        ValueError: If ``df`` is not provided.
    """
    if df is None:
        raise ValueError(
            "df is required: pass a DataFrame with 'question', 'answers' "
            "and 'context' columns"
        )
    if debug_mode:
        df = df.head(5)

    # Build the vectorizer from the requested model so that `retriever_model`
    # is actually honoured, and hand FaissRM a plain list of passages rather
    # than a pandas Series.
    retriever_vectorizer = SentenceTransformersVectorizer(model_name_or_path=retriever_model)
    frm = FaissRM(df["context"].tolist(), vectorizer=retriever_vectorizer)

    model = dspy.HFModel(model=generator_model)
    # Keep only the newly generated text. The reported symptom was predictions
    # that carried along extra "Question: ... Answer: ..." text; this flag was
    # the fix (presumably by stripping the echoed prompt; see HFModel).
    model.drop_prompt_from_output = True
    dspy.settings.configure(lm=model, rm=frm)

    # Select columns by name so the result does not depend on column order.
    dataset = [
        dspy.Example(question=question, answer=answer).with_inputs("question")
        for question, answer in zip(df["question"], df["answers"])
    ]
    train, val = train_test_split(dataset, test_size=0.15, random_state=42)
    print(len(train), len(val))

    class GenerateAnswer(dspy.Signature):
        """Answer questions with short factoid answers in bangla."""

        context = dspy.InputField(desc="may contain relevant facts")
        question = dspy.InputField()
        answer = dspy.OutputField(desc="often between 1 and 5 words")

    class BanglaRAG(dspy.Module):
        # Retrieve-then-answer program: fetch passages for the question, then
        # generate an answer with chain-of-thought prompting.

        def __init__(self, num_passages=3):
            super().__init__()
            self.retrieve = dspy.Retrieve(k=num_passages)
            # dspy.Predict(GenerateAnswer) is the simpler alternative here.
            self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

        def forward(self, question):
            context = self.retrieve(question).passages
            prediction = self.generate_answer(context=context, question=question)
            return dspy.Prediction(context=context, answer=prediction.answer)

    def validate_context_and_answer(example, pred, trace=None):
        # Bootstrapping metric: accept a demonstration only when both the
        # exact-match and the passage-match checks pass.
        answer_EM = dspy.evaluate.answer_exact_match(example, pred)
        answer_PM = dspy.evaluate.answer_passage_match(example, pred)
        return answer_EM and answer_PM

    bangla_rag_model = BanglaRAG()

    # NOTE: dspy.teleprompt.MIPROv2 (num_candidates=50, init_temperature=1.0)
    # was considered as an alternative optimizer.
    teleprompter = BootstrapFewShot(metric=validate_context_and_answer, max_labeled_demos=50)
    compiled_rag = teleprompter.compile(bangla_rag_model, trainset=train)

    evaluate_on_dataset = Evaluate(devset=val, num_threads=1, display_progress=True, display_table=25)
    # Debugging tip: model.inspect_history(n=3) shows the latest prompts.
    # Return the evaluation result instead of discarding it.
    return evaluate_on_dataset(compiled_rag, metric=answer_exact_match)
# Script entry point: run the evaluation on the prepared DataFrame.
evaluate_multilingual_model(
    df=df,
    debug_mode=False,
)
Hi, I've already seen a lot of DSPy tutorials for English, and I wanted to give it a try for Bangla. The code below gives me wrong answers for all the questions that I ask my DSPy-based RAG system. I need your assistance with this issue, please:
Example: for the question "বিশ্বের প্রথম চলচ্চিত্রের পরিচালক কে ছিলেন ?", the predicted answer almost always looks like this: চুমকি Question: প্রিন্স দ্বারকানাথ ঠাকুরের বাবার নাম কী ? Answer: রামলোচনে Question: ঔপন্যাসিক ও গল্পকার জহির রায়হান পরিচালিত প্রথম চলচ্চিত্রের নাম কী ? Answer: কখনো
As you can see, questions and answers are repeated inside the predicted answer. The actual answer is "লুমিয়ের ভ্রাতৃদ্বয়", but I get a completely different result, as shown above.