Hi, I am trying to use TestsetGenerator to produce a synthetic dataset together with LlamaIndex and Ollama. It successfully completes the embedding process, but before the generation process starts the ValueError: a cannot be empty unless no samples are taken exception is raised. I suspect this happens because, for some reason, TestsetGenerator produces twice as many embeddings as there are LlamaIndex Documents, at least that is what the embeddings progress bar suggests (see the diagnostic check sketched after the traceback below).
Code:
import json
from pathlib import Path
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.core import Document
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from knowlege.collections import Document as MyDocument
from knowlege.chunker import chunk  # tried chunking to lower the documents' length
files_gen = Path("./data/").rglob("*")
files = [f.resolve() for f in files_gen]
json_files = [f for f in files if f.suffix.lower() == ".json"]
docs = []
with open(json_files[4], 'r', encoding='utf-8') as fp:
    data = json.load(fp)

for item in data:
    chunks = chunk(MyDocument(
        name=item['title'],
        content=item['content']
    ))
    for c in chunks:
        docs.append(Document(text=c))
generator_llm = Ollama(model='gemma:2b')
critic_llm = Ollama(model='gemma:2b')
embeddings = OllamaEmbedding(model_name='gemma:2b')
generator = TestsetGenerator.from_llama_index(
    generator_llm,
    critic_llm,
    embeddings
)
# tried with original documents
test_docs = docs[:5]
for d in test_docs:
    print(d.doc_id)
    print(len(d.text))
# e861670f-3439-405d-a287-a90dfc885f9e
# 219
# a44e3cc3-e498-4662-b3e7-52220145da05
# 227
# d21bd3e2-9d7d-41de-a99e-2acd538b46d3
# 274
# c1555996-fd90-436e-ab93-dbed31471551
# 596
# 7d8145b9-7d67-4d4d-a363-695e7b0f0f79
# 646
# tried with proof-of-concept documents
damn = [
    Document(text='Sensitive Data Exposure, which is more of a broad symptom rather than a root cause, the focus is on failures related to cryptography (or lack thereof). Which often lead to exposure of sensitive data.'),
    Document(text='For example, passwords, credit card numbers, health records, personal information, and business secrets require extra protection, mainly if that data falls under privacy laws'),
    Document(text='Secure software requires a secure development lifecycle, some form of secure design pattern, paved road methodology, secured component library, tooling, and threat modeling.')
]
test_set = generator.generate_with_llamaindex_docs(
    damn,
    test_size=1,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    raise_exceptions=False
)
Output:
Filename and doc_id are the same for all nodes.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[9], line 1
----> 1 test_set = generator.generate_with_llamaindex_docs(
2 damn,
3 test_size=1,
4 distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
5 raise_exceptions=False
6 )
File D:\Desktop\UNI\Tirocinio-Tesi\project\prototypes\.venv\Lib\site-packages\ragas\testset\generator.py:183, in TestsetGenerator.generate_with_llamaindex_docs(self, documents, test_size, distributions, with_debugging_logs, is_async, raise_exceptions, run_config)
178 # chunk documents and add to docstore
179 self.docstore.add_documents(
180 [Document.from_llamaindex_document(doc) for doc in documents]
181 )
--> 183 return self.generate(
184 test_size=test_size,
185 distributions=distributions,
186 with_debugging_logs=with_debugging_logs,
187 is_async=is_async,
188 run_config=run_config,
189 raise_exceptions=raise_exceptions,
190 )
File D:\Desktop\UNI\Tirocinio-Tesi\project\prototypes\.venv\Lib\site-packages\ragas\testset\generator.py:279, in TestsetGenerator.generate(self, test_size, distributions, with_debugging_logs, is_async, raise_exceptions, run_config)
268 patch_logger("ragas.llms.prompt", logging.DEBUG)
270 exec = Executor(
271 desc="Generating",
272 keep_progress_bar=True,
273 raise_exceptions=raise_exceptions,
274 run_config=run_config,
275 )
277 current_nodes = [
278 CurrentNodes(root_node=n, nodes=[n])
--> 279 for n in self.docstore.get_random_nodes(k=test_size)
280 ]
281 total_evolutions = 0
282 for evolution, probability in distributions.items():
File D:\Desktop\UNI\Tirocinio-Tesi\project\prototypes\.venv\Lib\site-packages\ragas\testset\docstore.py:328, in InMemoryDocumentStore.get_random_nodes(self, k, alpha)
325 prob = np.array(scores) * np.array(similarity_scores)
326 prob = prob / np.sum(prob)
--> 328 nodes = rng.choice(np.array(self.nodes), size=k, p=prob).tolist()
330 for node in nodes:
331 idx = self.nodes.index(node)
File numpy\\random\\_generator.pyx:803, in numpy.random._generator.Generator.choice()
ValueError: a cannot be empty unless no samples are taken
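To double-check my hypothesis before calling generate(), I was planning to add the same documents to the generator's docstore by hand and count how many nodes survive. This is only a diagnostic sketch: add_documents, from_llamaindex_document and the docstore's nodes list are the names visible in the traceback above, while the import path for the ragas Document class is my assumption:

from ragas.testset.docstore import Document as RagasDocument  # import path is my guess

# diagnostic only: mirror what generate_with_llamaindex_docs does internally
generator.docstore.add_documents(
    [RagasDocument.from_llamaindex_document(d) for d in damn]
)

# if the second number is 0, get_random_nodes() has an empty population to
# sample from, which would explain the ValueError raised by rng.choice
print(len(damn), len(generator.docstore.nodes))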