Closed ananthanarayanan431 closed 4 days ago
Hey there, @ananthanarayanan431! How can I assist you today? If you have any bugs, questions, or want to become a contributor, I'm here to help while we wait for a human maintainer.
I found similar issues that might help you resolve the `ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Sequence'>`.
Issue #543:
Issue #286:
contexts
field is a list of strings. For example:
"contexts": DF["chain_context"].apply(lambda x: [x])
Issue #413:
Solution: The error is due to the context
data not being in the format of list[str]
. The following code snippet was provided to resolve the issue:
from datasets import Dataset
# Example data
questions = ["question 1", "question 2", "question 3"]
ground_truths = [["ground_truth to the question 1"], ["ground_truth to the question 2"], ["ground_truth to the question 3"]]
answers = ["answer to the question 1", "answer to the question 2", "answer to the question 3"]
contexts = [["context 1"], ["context 2"], ["context 3"]]
data = {
"question": questions,
"answer": answers,
"contexts": contexts,
"ground_truths": ground_truths
}
dataset = Dataset.from_dict(data)
To resolve your issue, ensure that the contexts
field in your dataset is a list of strings. Here is an updated version of your Query
method:
def Query(self):
    """Build (or reload) the vector index, answer every generated test
    question, and score the answers with ragas.

    Side effects: writes per-sample scores to ./val1.csv and the
    aggregate scores to metrics.json.

    Returns:
        The ragas evaluation result object.
    """
    # Build the index once and persist it; later runs reload from disk
    # so the documents are not re-embedded every time.
    if not os.path.exists(PATH):
        vector_index = VectorStoreIndex.from_documents(documents=self.document)
        vector_index.storage_context.persist(persist_dir=PATH)
    else:
        storage_context = StorageContext.from_defaults(persist_dir=PATH)
        vector_index = load_index_from_storage(storage_context=storage_context)

    # Lazily generate the synthetic test set if it is missing.
    if self.testset is None or len(self.testset) == 0:
        self.testset = self.Building()

    query_engine = vector_index.as_query_engine()
    testset_df = self.testset.to_pandas()
    questions = testset_df["question"].tolist()
    ground_truth = testset_df["ground_truth"].tolist()

    # BUG FIX: to_pandas() returns the contexts column as numpy arrays,
    # not Python lists, so the old `isinstance(x, list)` check fell
    # through and stringified the WHOLE array into a single string —
    # which is exactly what trips ragas' Sequence[string] validation.
    # Treat every non-string iterable as a sequence of context strings.
    def _as_str_list(x):
        if isinstance(x, str):
            return [x]
        try:
            return [str(item) for item in x]
        except TypeError:  # non-iterable scalar
            return [str(x)]

    contexts = testset_df["contexts"].apply(_as_str_list).tolist()

    data = {
        'question': questions,
        'answer': [query_engine.query(item).response for item in questions],
        'contexts': contexts,
        'ground_truth': ground_truth,
    }
    dataset = Dataset.from_dict(data)
    metrics = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_utilization,
        context_recall,
        harmfulness,
        answer_correctness,
    ]
    result = evaluate(
        query_engine=query_engine,
        metrics=metrics,
        dataset=dataset,
        raise_exceptions=False,
    )
    ans = result.to_pandas()
    ans.to_csv('./val1.csv')
    # ragas' result is dict-like; materialise a plain dict of scores so
    # json.dump serialises it reliably.
    with open("metrics.json", "w") as json_file:
        json.dump(dict(result), json_file, indent=4)
    print(type(result))
    return result
This should ensure that the contexts
field is correctly formatted as a list of strings, resolving the ValueError
[1][2][3].
To continue talking to Dosu, mention @dosu.
@dosu @shahules786
code:
# Module setup for the evaluation script. (The original paste was
# flattened by markdown into multi-statement lines; restored here to
# valid, conventionally grouped Python.)
import os
import json

import constant

# API keys must be set before the llama_index / ragas clients are built.
os.environ['GROQ_API_KEY'] = constant.GROQ_API_KEY
os.environ['OPENAI_API_KEY'] = constant.OPENAI_API_KEY

from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.llms.groq import Groq
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext

import ragas
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    context_utilization,
)
from datasets import Dataset, Value, Sequence
from ragas.metrics.critique import harmfulness
from ragas.integrations.llama_index import evaluate

DATASET = "./data.csv"
PATH = "./STORE"

embeddings = OpenAIEmbedding()
Settings.embed_model = embeddings
class Evaluation: def init(self): self.generator_llm=OpenAI(model="gpt-4o-mini") self.critics_llm=OpenAI(model="gpt-4o-mini") self.evalutor_llm=OpenAI(model="gpt-4o-mini") self.embeddings=OpenAIEmbedding() self.testset="" self.document=""
def Building(self):
    """Load the source PDF and generate a synthetic ragas test set.

    Caches the loaded documents on ``self.document`` and the generated
    test set on ``self.testset``, then returns the test set.
    """
    docs = SimpleDirectoryReader(
        input_files=[r"E:\GitaGPT\Backend\gita.pdf"]
    ).load_data()
    self.document = docs

    test_generator = TestsetGenerator.from_llama_index(
        generator_llm=self.generator_llm,
        critic_llm=self.critics_llm,
        embeddings=self.embeddings,
    )
    generated = test_generator.generate_with_llamaindex_docs(
        documents=docs,
        test_size=14,
        distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    )
    self.testset = generated
    return generated
def Query(self):
    """Build (or reload) the vector index, answer the generated test
    questions, and run the ragas evaluation over the results.

    Side effects: writes per-sample scores to ./val1.csv and aggregate
    scores to metrics.json.

    Returns:
        The ragas evaluation result object.
    """
    # Persist the index on first run; reload it afterwards so the
    # documents are not re-embedded (and re-billed) every time.
    if not os.path.exists(PATH):
        vector_index = VectorStoreIndex.from_documents(documents=self.document)
        vector_index.storage_context.persist(persist_dir=PATH)
    else:
        storage_context = StorageContext.from_defaults(persist_dir=PATH)
        vector_index = load_index_from_storage(storage_context=storage_context)

    if self.testset is None or len(self.testset) == 0:
        self.testset = self.Building()

    query_engine = vector_index.as_query_engine()
    testset_df = self.testset.to_pandas()
    questions = testset_df["question"].tolist()
    ground_truth = testset_df["ground_truth"].tolist()

    # BUG FIX: to_pandas() stores the contexts column as numpy arrays,
    # so `isinstance(x, list)` was always False and the whole array got
    # collapsed into one string — producing the Sequence[string]
    # validation error. Coerce any non-string iterable element-wise.
    def _as_str_list(x):
        if isinstance(x, str):
            return [x]
        try:
            return [str(item) for item in x]
        except TypeError:  # non-iterable scalar
            return [str(x)]

    contexts = testset_df["contexts"].apply(_as_str_list).tolist()

    data = {
        'question': questions,
        'answer': [query_engine.query(item).response for item in questions],
        'contexts': contexts,
        'ground_truth': ground_truth,
    }
    dataset = Dataset.from_dict(data)
    metrics = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_utilization,
        context_recall,
        harmfulness,
        answer_correctness,
    ]
    result = evaluate(
        query_engine=query_engine,
        metrics=metrics,
        dataset=dataset,
        raise_exceptions=False,
    )
    ans = result.to_pandas()
    ans.to_csv('./val1.csv')
    # Materialise a plain dict of scores for reliable JSON serialisation.
    with open("metrics.json", "w") as json_file:
        json.dump(dict(result), json_file, indent=4)
    print(type(result))
    return result
if name=="main": obj=Evaluation() print(ragas.version) print(obj.Query())
Error: ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Sequence'>
I'm wasting money on every run, so I need a proper solution to this.
To resolve the ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Sequence'>
, ensure that the contexts
field in your dataset is correctly formatted as a list of strings. Here is the updated Query
method with the necessary adjustments:
def Query(self):
    """Build (or reload) the vector index, answer the generated test
    questions, and score the answers with ragas.

    Side effects: writes per-sample scores to ./val1.csv and aggregate
    scores to metrics.json.

    Returns:
        The ragas evaluation result object.
    """
    # Persist the index on first run; reload afterwards to avoid
    # re-embedding the documents on every invocation.
    if not os.path.exists(PATH):
        vector_index = VectorStoreIndex.from_documents(documents=self.document)
        vector_index.storage_context.persist(persist_dir=PATH)
    else:
        storage_context = StorageContext.from_defaults(persist_dir=PATH)
        vector_index = load_index_from_storage(storage_context=storage_context)

    if self.testset is None or len(self.testset) == 0:
        self.testset = self.Building()

    query_engine = vector_index.as_query_engine()
    testset_df = self.testset.to_pandas()
    questions = testset_df["question"].tolist()
    ground_truth = testset_df["ground_truth"].tolist()

    # BUG FIX: the contexts column comes out of to_pandas() as numpy
    # arrays, so the old `isinstance(x, list)` test always failed and
    # str(x) collapsed each array into one string — the root cause of
    # the Sequence[string] ValueError. Coerce element-wise instead.
    def _as_str_list(x):
        if isinstance(x, str):
            return [x]
        try:
            return [str(item) for item in x]
        except TypeError:  # non-iterable scalar
            return [str(x)]

    contexts = testset_df["contexts"].apply(_as_str_list).tolist()

    data = {
        'question': questions,
        'answer': [query_engine.query(item).response for item in questions],
        'contexts': contexts,
        'ground_truth': ground_truth,
    }
    dataset = Dataset.from_dict(data)
    metrics = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_utilization,
        context_recall,
        harmfulness,
        answer_correctness,
    ]
    result = evaluate(
        query_engine=query_engine,
        metrics=metrics,
        dataset=dataset,
        raise_exceptions=False,
    )
    ans = result.to_pandas()
    ans.to_csv('./val1.csv')
    # Materialise a plain dict of scores for reliable JSON serialisation.
    with open("metrics.json", "w") as json_file:
        json.dump(dict(result), json_file, indent=4)
    print(type(result))
    return result
This ensures that the contexts
field is correctly formatted as a list of strings, which should resolve the ValueError
[1].
It looks like this issue needs your specialized knowledge, @jjmachan, could you step in and help out?
To continue talking to Dosu, mention @dosu.
@dosu @antono @peterkaminski Same code as above and It's not working Please look into it
A maintainer has already been looped in.
To continue talking to Dosu, mention @dosu.
@dosu What's the solution?
To resolve the ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Sequence'>
, you need to ensure that the "contexts" feature in your dataset is of type Sequence[string]
. Here is a solution to convert the "contexts" feature to the correct type:
from datasets import Dataset, Sequence, Value
def convert_contexts_to_sequence_string(ds: Dataset) -> Dataset:
    """Normalise the "contexts" column of *ds* to type Sequence[string].

    ragas validates that "contexts" is a sequence of strings; any other
    inferred feature type raises a ValueError. Rows are rebuilt as
    list[str] and the column is cast explicitly.
    """
    if "contexts" in ds.features:
        feature = ds.features["contexts"]
        already_ok = isinstance(feature, Sequence) and feature.feature.dtype == "string"
        if not already_ok:
            # BUG FIX: the original wrapped bare strings but passed any
            # other row value (e.g. numpy arrays) through unchanged, so
            # the rebuilt column could still fail validation. Coerce
            # every element to str.
            contexts = [
                [context] if isinstance(context, str) else [str(c) for c in context]
                for context in ds["contexts"]
            ]
            ds = ds.remove_columns("contexts")
            ds = ds.add_column("contexts", contexts)
            # Declare the feature type explicitly instead of relying on
            # re-inference of the new column.
            ds = ds.cast_column("contexts", Sequence(Value("string")))
    return ds
# Example usage
ds = convert_contexts_to_sequence_string(ds)
This function checks if the "contexts" feature exists and if it is not already of type Sequence[string]
. If it is not, it converts the "contexts" feature to the correct type [1].
To continue talking to Dosu, mention @dosu.
# Module setup for the working variant of the script. (The original
# paste was flattened by markdown; restored to valid Python.)
# BUG FIX: this snippet used `os` and `json` below without importing
# them — added here so the script actually runs.
import os
import json
import ast

from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.core import Document

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    context_utilization,
)
from datasets import Dataset, Value, Sequence
from ragas.metrics.critique import harmfulness
from ragas.integrations.llama_index import evaluate

DATASET = "./data.csv"
PATH = "./STORE"

embeddings = OpenAIEmbedding()
Settings.embed_model = embeddings


class Evaluation:
    """RAG evaluation pipeline (the working variant from this thread)."""

    # NOTE: the paste showed `def init` — markdown stripped the dunder
    # underscores; restored to __init__.
    def __init__(self):
        self.generator_llm = OpenAI(model="gpt-4o-mini")
        self.critics_llm = OpenAI(model="gpt-4o-mini")
        self.evalutor_llm = OpenAI(model="gpt-4o-mini")
        self.embeddings = OpenAIEmbedding()
def _RAGAS_(self):
    """End-to-end ragas evaluation: generate a test set, answer each
    question through the vector index, and score the results.

    Side effects: writes answer.csv (raw test set), val1.csv (per-sample
    scores) and metrics.json (aggregate scores).

    Returns:
        The ragas evaluation result object.
    """
    document = SimpleDirectoryReader(input_files=[r"file"]).load_data()
    generator = TestsetGenerator.from_llama_index(
        generator_llm=self.generator_llm,
        critic_llm=self.critics_llm,
        embeddings=self.embeddings,
    )
    # BUG FIX: the original passed `documents=self.document`, but this
    # class variant never assigns self.document (its __init__ only sets
    # the LLMs and embeddings) — use the locally loaded documents.
    testset = generator.generate_with_llamaindex_docs(
        documents=document,
        test_size=15,
        distributions={simple: 0.25, reasoning: 0.5, multi_context: 0.25},
    )

    # Persist the index on first run; reload afterwards so the documents
    # are not re-embedded every invocation.
    if not os.path.exists(PATH):
        vector_index = VectorStoreIndex.from_documents(documents=document)
        vector_index.storage_context.persist(persist_dir=PATH)
    else:
        storage_context = StorageContext.from_defaults(persist_dir=PATH)
        vector_index = load_index_from_storage(storage_context=storage_context)
    query_engine = vector_index.as_query_engine()

    testset_df = testset.to_pandas()
    testset_df.to_csv('answer.csv')
    questions = testset_df["question"].tolist()
    print(type(questions))
    print(questions)
    print("-" * 100)
    ground_truth = testset_df["ground_truth"].tolist()
    print(type(ground_truth))
    # BUG FIX: the original printed `questions` again here; show the
    # ground-truth list that was just extracted.
    print(ground_truth)
    print("-" * 100)

    # Answer every generated question and capture the retrieved source
    # chunks as list[str] contexts — the Sequence[string] shape ragas
    # expects, which is why this variant works.
    data = {
        'question': [],
        'answer': [],
        'contexts': [],
        'ground_truth': ground_truth,
    }
    for val in questions:
        data['question'].append(val)
        answer = query_engine.query(val)
        data['answer'].append(answer.response)
        data["contexts"].append(
            [node.node.get_content() for node in answer.source_nodes]
        )
    dataset = Dataset.from_dict(data)

    metrics = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_utilization,
        context_recall,
        harmfulness,
        answer_correctness,
    ]
    result = evaluate(
        query_engine=query_engine,
        metrics=metrics,
        dataset=dataset,
        raise_exceptions=False,
    )
    ans = result.to_pandas()
    ans.to_csv('./val1.csv')
    # Materialise a plain dict of scores for reliable JSON serialisation.
    with open("metrics.json", "w") as json_file:
        json.dump(dict(result), json_file, indent=4)
    print(type(result))
    return result
if name=="main": obj=Evaluation() print(obj.RAGAS())
Complete working code and It's working fine!
[ ] I have checked the documentation and related resources and couldn't resolve my bug.
Describe the bug ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Sequence'>
Ragas version: 0.1.16 Python version: 3.12.2
Code to Reproduce
# "Code to Reproduce" section of the bug report. (The paste was
# flattened by markdown and its dunders stripped; restored to valid,
# conventionally grouped Python.)
import os
import json

import constant

os.environ['OPENAI_API_KEY'] = constant.OPENAI_API_KEY

from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext

import ragas
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    context_utilization,
)
from datasets import Dataset, Value, Sequence
from ragas.metrics.critique import harmfulness
from ragas.integrations.llama_index import evaluate

DATASET = "./data.csv"
PATH = "./STORE"

embeddings = OpenAIEmbedding()
Settings.embed_model = embeddings


class Evaluation:
    """Reproduction harness for the Sequence[string] ValueError."""

    def __init__(self):
        self.generator_llm = OpenAI(model="gpt-4o-mini")
        self.critics_llm = OpenAI(model="gpt-4o-mini")
        self.evalutor_llm = OpenAI(model="gpt-4o-mini")
        self.embeddings = OpenAIEmbedding()
        self.testset = ""
        self.document = ""


if __name__ == "__main__":
    obj = Evaluation()
    print(ragas.__version__)
    print(obj.Query())
Error trace ValueError due to Validation.py file in source code folder
Expected behavior
or
{ "faithfulness": 0.8262903762903762, "answer_relevancy": 0.8911953547432045, "context_precision": 0.9999999999499997, "context_recall": 0.9285714285714286, "harmfulness": 0.14285714285714285, "answer_correctness": 0.8267676575354559 }
Additional context: I need help solving this bug — I still can't sort it out!