[x] I have checked the documentation and related resources and couldn't resolve my bug.
Describe the bug
I have a code using the package ragas that works in a notebook but doesn't work in a script. I am trying to have it as a script. The error is inside the package, so I have no idea how to fix it. The version of the package is the same in the script and notebook '0.1.4'
Ragas version: 0.1.4
Python version: 3.11.6
Code to Reproduce
`
import importlib.resources
import os
import pandas as pd
import yaml
from dotenv import load_dotenv
from google.cloud import storage
from google.cloud.storage import Blob
from langchain.docstore.document import Document
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator
from tqdm.notebook import tqdm
load_dotenv()
def get_pdf_files(collection_name, filegroup_name):
"""Get the pdf files from the collection and filegroup from the ds-librairie-genai-provisioning"""
def combine_all_page(docs):
"""Combine all pages from one PDF into one Document object"""
try:
page_content = "/n".join([doc.page_content for doc in docs])
page_metadata = docs[0].metadata
if "page" in page_metadata:
page_metadata.pop("page")
return [Document(page_content=page_content, metadata=page_metadata)]
except Exception as e:
print(e)
def generate_testset(configs_file):
"""Generate RAGAS synthetic test set"""
with open(configs_file) as f:
configs = yaml.safe_load(f)
collection_name = configs["collection_name"]
filegroup_name = configs["filegroup_name"]
version = configs["version"]
files = get_pdf_files(collection_name, filegroup_name)
print(f"Number of files: {len(files)}")
docs = []
for file in tqdm(files):
doc_ = PDFPlumberLoader(
str(file),
text_kwargs={
"x_tolerance": configs["x_tolerance"],
"y_tolerance": configs["y_tolerance"],
},
).load()
doc_ = combine_all_page(doc_)
docs.extend(doc_)
print(f"Number of documents: {len(docs)}")
for doc_ in docs:
doc_.metadata["filename"] = doc_.metadata["source"]
splitted_testset_docs = docs
# RAGAS to generate a synthetic testset
generator = TestsetGenerator.with_openai()
testset = generator.generate_with_langchain_docs(
splitted_testset_docs,
test_size=configs["test_size"],
with_debugging_logs=True,
distributions=configs["distributions"],
)
`
Error trace
`
AttributeError Traceback (most recent call last)
Cell In[2], line 3
1 from generate_testset import generate_testset
----> 3 generate_testset("generate-testset.yml")
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/generate_testset.py:86, in generate_testset(configs_file)
83 print("len(splitted_testset_docs)", len(splitted_testset_docs))
84 print("splitted_testset_docs[0]", splitted_testset_docs[0])
---> 86 testset = generator.generate_with_langchain_docs(
87 splitted_testset_docs,
88 test_size=configs["test_size"],
89 with_debugging_logs=True,
90 distributions=configs["distributions"],
91 )
93 # Remove nan and short ground truth
94 testset_improved = testset
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/venv/lib/python3.11/site-packages/ragas/testset/generator.py:179, in TestsetGenerator.generate_with_langchain_docs(self, documents, test_size, distributions, with_debugging_logs, is_async, raise_exceptions, run_config)
174 # chunk documents and add to docstore
175 self.docstore.add_documents(
176 [Document.from_langchain_document(doc) for doc in documents]
177 )
--> 179 return self.generate(
180 test_size=test_size,
181 distributions=distributions,
182 with_debugging_logs=with_debugging_logs,
183 is_async=is_async,
184 raise_exceptions=raise_exceptions,
185 run_config=run_config,
186 )
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/venv/lib/python3.11/site-packages/ragas/testset/generator.py:227, in TestsetGenerator.generate(self, test_size, distributions, with_debugging_logs, is_async, raise_exceptions, run_config)
225 # init filters and evolutions
226 for evolution in distributions:
--> 227 self.init_evolution(evolution)
228 evolution.init(is_async=is_async, run_config=run_config)
230 if with_debugging_logs:
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/venv/lib/python3.11/site-packages/ragas/testset/generator.py:189, in TestsetGenerator.init_evolution(self, evolution)
188 def init_evolution(self, evolution: Evolution) -> None:
--> 189 if evolution.generator_llm is None:
190 evolution.generator_llm = self.generator_llm
191 if evolution.docstore is None:
AttributeError: 'str' object has no attribute 'generator_llm'
`
Expected behavior
No error. Generate a test set.
Additional context
It does the generating till 100%, then fails.
In a notebook, the exact same code with the exact same arguments work.
evolution is inside the package and I do not touch it directly. I have no idea why it would be a string rather than the correct object.
The error happens both if I call the scrip inside a notebook after importing it or in a terminal. But the same code in a notebook is ok.
[x] I have checked the documentation and related resources and couldn't resolve my bug.
Describe the bug I have a code using the package ragas that works in a notebook but doesn't work in a script. I am trying to have it as a script. The error is inside the package, so I have no idea how to fix it. The version of the package is the same in the script and notebook '0.1.4'
Ragas version: 0.1.4 Python version: 3.11.6
Code to Reproduce ` import importlib.resources import os
import pandas as pd import yaml from dotenv import load_dotenv from google.cloud import storage from google.cloud.storage import Blob from langchain.docstore.document import Document from langchain_community.document_loaders import PDFPlumberLoader from langchain_experimental.text_splitter import SemanticChunker from langchain_openai.embeddings import OpenAIEmbeddings from ragas.testset.evolutions import multi_context, reasoning, simple from ragas.testset.generator import TestsetGenerator from tqdm.notebook import tqdm
load_dotenv()
def get_pdf_files(collection_name, filegroup_name): """Get the pdf files from the collection and filegroup from the ds-librairie-genai-provisioning"""
def combine_all_page(docs): """Combine all pages from one PDF into one Document object""" try: page_content = "/n".join([doc.page_content for doc in docs]) page_metadata = docs[0].metadata if "page" in page_metadata: page_metadata.pop("page") return [Document(page_content=page_content, metadata=page_metadata)] except Exception as e: print(e)
def generate_testset(configs_file): """Generate RAGAS synthetic test set""" with open(configs_file) as f: configs = yaml.safe_load(f)
`
Error trace `
AttributeError Traceback (most recent call last) Cell In[2], line 3 1 from generate_testset import generate_testset ----> 3 generate_testset("generate-testset.yml")
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/generate_testset.py:86, in generate_testset(configs_file) 83 print("len(splitted_testset_docs)", len(splitted_testset_docs)) 84 print("splitted_testset_docs[0]", splitted_testset_docs[0]) ---> 86 testset = generator.generate_with_langchain_docs( 87 splitted_testset_docs, 88 test_size=configs["test_size"], 89 with_debugging_logs=True, 90 distributions=configs["distributions"], 91 ) 93 # Remove nan and short ground truth 94 testset_improved = testset
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/venv/lib/python3.11/site-packages/ragas/testset/generator.py:179, in TestsetGenerator.generate_with_langchain_docs(self, documents, test_size, distributions, with_debugging_logs, is_async, raise_exceptions, run_config) 174 # chunk documents and add to docstore 175 self.docstore.add_documents( 176 [Document.from_langchain_document(doc) for doc in documents] 177 ) --> 179 return self.generate( 180 test_size=test_size, 181 distributions=distributions, 182 with_debugging_logs=with_debugging_logs, 183 is_async=is_async, 184 raise_exceptions=raise_exceptions, 185 run_config=run_config, 186 )
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/venv/lib/python3.11/site-packages/ragas/testset/generator.py:227, in TestsetGenerator.generate(self, test_size, distributions, with_debugging_logs, is_async, raise_exceptions, run_config) 225 # init filters and evolutions 226 for evolution in distributions: --> 227 self.init_evolution(evolution) 228 evolution.init(is_async=is_async, run_config=run_config) 230 if with_debugging_logs:
File ~/github_repos/ds-research-genai/common/src/rag_pipeline_evaluator/venv/lib/python3.11/site-packages/ragas/testset/generator.py:189, in TestsetGenerator.init_evolution(self, evolution) 188 def init_evolution(self, evolution: Evolution) -> None: --> 189 if evolution.generator_llm is None: 190 evolution.generator_llm = self.generator_llm 191 if evolution.docstore is None:
AttributeError: 'str' object has no attribute 'generator_llm' `
Expected behavior No error. Generate a test set.
Additional context It does the generating till 100%, then fails.
In a notebook, the exact same code with the exact same arguments work.
evolution is inside the package and I do not touch it directly. I have no idea why it would be a string rather than the correct object.
The error happens both if I call the scrip inside a notebook after importing it or in a terminal. But the same code in a notebook is ok.