Open zel2023 opened 1 week ago
Did you solve that?
I did not solve that.
I hit the same issue. Here is a tiny piece of code that reproduces it; this bug is really weird.
import numpy as np
import json
import pyarrow as pa
import lancedb
class TextEmbeder:
    """Stand-in embedder that produces random vectors instead of real embeddings."""

    def __init__(self) -> None:
        pass

    def encode(self, x):
        """Return a random 3-element vector as a list of Python floats.

        The argument ``x`` is not used. Each component is drawn with
        ``np.random.randn``, rounded to 3 decimals and made non-negative
        with ``np.abs``.
        """
        return np.abs(np.around(np.random.randn(3), 3)).tolist()
textembeder = TextEmbeder()
textembeder.encode("a")[:3]
def build_fake_data(count=1000, embedder=None):
    """Build a list of fake rows to insert into the test table.

    Args:
        count: Number of rows to generate (defaults to 1000).
        embedder: Object with an ``encode(text)`` method that returns the
            vector for a row. When omitted, the module-level ``textembeder``
            instance is used.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``extra_data`` and
        ``vector``. ``extra_data`` is a JSON string with non-ASCII
        characters escaped.
    """
    if embedder is None:
        # Looked up lazily so the default still follows the module global.
        embedder = textembeder

    res = []
    for index in range(count):
        text = f"hello test {index}"
        extra_data = json.dumps(
            {"attr1": index, "attr2": index * 2, "attr3": "B站"}, ensure_ascii=True
        )
        res.append(
            {
                "id": str(index),
                "text": text,
                "extra_data": extra_data,
                "vector": embedder.encode(text),
            }
        )
    return res
fake_data_list = build_fake_data()
fake_data_list[0].keys()
db_url = "data/database"
db_table_name = "smalltest"
db_connection = lancedb.connect(db_url)
db_connection.table_names()
# Arrow schema for the test table: one "vector" column plus three string columns.
# NOTE(review): the vector field is declared as pa.list_(pa.float64()) with no
# list_size. A later reply in this thread reports that declaring it with
# list_size=<vector dimension> resolved the error -- confirm against the
# LanceDB documentation.
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float64())),
pa.field("id", pa.string()),
pa.field("text", pa.string()),
pa.field("extra_data", pa.string()),
]
)
db_connection.create_table(name=db_table_name, schema=schema, mode="overwrite")
table = db_connection.open_table(db_table_name)
table.add(fake_data_list)
query_vector = textembeder.encode("hh")
query_vector[:4]
docs = (
table.search(query=query_vector, vector_column_name="vector", query_type="vector")
.limit(4)
.to_list()
)
print(docs)
env:
pip show lancedb
Name: lancedb
Version: 0.13.0
Summary: lancedb
Home-page:
Author:
Author-email: LanceDB Devs <dev@lancedb.com>
License: Apache-2.0
Location: /data2/miniconda3/envs/hz_grahrag/lib/python3.11/site-packages
Requires: attrs, cachetools, deprecation, overrides, packaging, pydantic, pylance, requests, retry, tqdm
Required-by: graphrag
Name: pyarrow
Version: 15.0.2
Summary: Python library for Apache Arrow
Home-page: https://arrow.apache.org/
Author:
Author-email:
License: Apache License, Version 2.0
Location: /data2/miniconda3/envs/hz_grahrag/lib/python3.11/site-packages
Requires: numpy
Required-by: datasets, datashaper, graphrag, pylance, streamlit
I fixed this issue: in the schema, the vector field needs a fixed size, e.g.
`pa.field("vector", pa.list_(pa.float32(), list_size=DIM_VALUE)),`
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), list_size=DIM_VALUE)),
pa.field("id", pa.string()),
pa.field("text", pa.string()),
pa.field("extra_data", pa.string()),
]
)
Congratulations! However, there is no “schema” in my code:
import os
import pandas as pd
import tiktoken
import asyncio
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
read_indexer_covariates,
read_indexer_entities,
read_indexer_relationships,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
INPUT_DIR = "xx"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
#COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
entities=entities, vectorstore=description_embedding_store
)
print(f"Entity count: {len(entity_df)}")
entity_df.head()
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
#covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
#claims = read_indexer_covariates(covariate_df)
#print(f"Claim records: {len(claims)}")
#covariates = {"claims": claims}
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
print(f"Report records: {len(report_df)}")
report_df.head()
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()
api_key = "xx"
llm_model = "deepseek-chat"
embedding_model = "text-embedding-3-small"
#embedding_model = os.environ["text-embedding-3-small"]
llm = ChatOpenAI(
api_key=api_key,
model=llm_model,
api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
max_retries=20,
api_base="https://api.agicto.cn/v1"
)
token_encoder = tiktoken.get_encoding("cl100k_base")
text_embedder = OpenAIEmbedding(
api_key=api_key,
api_base="https://api.agicto.cn/v1",
api_type=OpenaiApiType.OpenAI,
model=embedding_model,
deployment_name=embedding_model,
max_retries=20,
)
context_builder = LocalSearchMixedContext(
community_reports=reports,
text_units=text_units,
entities=entities,
relationships=relationships,
# if you did not run covariates during indexing, set this to None
covariates=None,
entity_text_embeddings=description_embedding_store,
embedding_vectorstore_key=EntityVectorStoreKey.TITLE, # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
text_embedder=text_embedder,
token_encoder=token_encoder,
)
local_context_params = {
"text_unit_prop": 0.5,
"community_prop": 0.1,
"conversation_history_max_turns": 5,
"conversation_history_user_turns_only": True,
"top_k_mapped_entities": 10,
"top_k_relationships": 10,
"include_entity_rank": True,
"include_relationship_weight": True,
"include_community_rank": False,
"return_candidate_context": False,
"embedding_vectorstore_key": EntityVectorStoreKey.TITLE, # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
"max_tokens": 12_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}
llm_params = {
"max_tokens": 2_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
"temperature": 0.0,
}
search_engine = LocalSearch(
llm=llm,
context_builder=context_builder,
token_encoder=token_encoder,
llm_params=llm_params,
context_builder_params=local_context_params,
response_type="multiple paragraphs", # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)
async def main():
    """Run a single local-search query and print the response."""
    result = await search_engine.asearch("what is the relationship between xiaozhang and xiaoming?")
    print(result.response)


if __name__ == "__main__":
    asyncio.run(main())
I am running into this problem too. Does anyone have any suggestions?
I had the same issue, and my problem was solved by adjusting the command. Previously, I used `graphrag.index --root ./ragtest` for indexing, and `graphrag.query --root ./ragtest --method local "explain the relationship between Jay and May."` for querying.
Then I used `python -m graphrag.index --root ./ragtest` and `python -m graphrag.query --root ./ragtest --method local "explain the relationship between Jay and May."`, and that solved my problem.
To sum up, in my particular case, I had not included `python -m` in my execution command, and that turned out to be the problem.
Do you need to file an issue?
Describe the bug
I attempted to refer to https://github.com/microsoft/graphrag/blob/main/docs/examples_notebooks/local_search.ipynb to write a Python file for running local search but failed. However, using https://github.com/microsoft/graphrag/blob/main/docs/examples_notebooks/global_search.ipynb as a reference, I successfully wrote a Python file to run global search. Additionally, I successfully ran the local search by referring to https://github.com/microsoft/graphrag/blob/94f1e62e5c06795fc8c361dba6580bb76d6e77ce/docs/get_started.md. Below is the error message:
Entity count: 3
Relationship count: 2
Report records: 1
Text unit records: 1
Traceback (most recent call last):
File "/data/zelongzheng/graphrag-main/local_search.py", line 172, in <module>
asyncio.run(main())
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/data/zelongzheng/graphrag-main/local_search.py", line 167, in main
result = await search_engine.asearch("what is the relationship between xiaozhang and xiaoming?")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/zelongzheng/graphrag-main/graphrag/query/structured_search/local_search/search.py", line 67, in asearch
context_text, context_records = self.context_builder.build_context(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/zelongzheng/graphrag-main/graphrag/query/structured_search/local_search/mixed_context.py", line 140, in build_context
selected_entities = map_query_to_entities(
^^^^^^^^^^^^^^^^^^^^^^
File "/data/zelongzheng/graphrag-main/graphrag/query/context_builder/entity_extraction.py", line 57, in map_query_to_entities
search_results = text_embedding_vectorstore.similarity_search_by_text(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/zelongzheng/graphrag-main/graphrag/vector_stores/lancedb.py", line 136, in similarity_search_by_text
return self.similarity_search_by_vector(query_embedding, k)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/zelongzheng/graphrag-main/graphrag/vector_stores/lancedb.py", line 115, in similarity_search_by_vector
.to_list()
^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/site-packages/lancedb/query.py", line 320, in to_list
return self.to_arrow().to_pylist()
^^^^^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/site-packages/lancedb/query.py", line 647, in to_arrow
return self.to_batches().read_all()
^^^^^^^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/site-packages/lancedb/query.py", line 678, in to_batches
result_set = self._table._execute_query(query, batch_size)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/site-packages/lancedb/table.py", line 1742, in _execute_query
return ds.scanner(
^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/site-packages/lance/dataset.py", line 369, in scanner
builder = builder.nearest(**nearest)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/zelongzheng/anaconda3/envs/graphrag/lib/python3.11/site-packages/lance/dataset.py", line 2449, in nearest
raise TypeError(
TypeError: Query column vector must be a vector. Got list.
Steps to reproduce
1. pip install graphrag==0.3.6
2. Build the graph and use the command
python -m graphrag.query --root ./ragtest --method local "what is the relationship between xiaozhang and xiaoming?"
to confirm successful execution.
3. Write a Python file using https://github.com/microsoft/graphrag/blob/main/docs/examples_notebooks/local_search.ipynb as a reference. Modify certain parts of this file: change INPUT_DIR, comment out all variables related to covariates since no related files were generated when building the graph, set API_KEY, llm_model (deepseek-chat), embedding_model (text-embedding-3-small), and api_base="https://api.agicto.cn/v1".
4. Run the file using Python.
Expected Behavior
I expect it to respond to the question I ask.
GraphRAG Config Used
Logs and screenshots
Additional Information