caretdev / langchain-iris

MIT License
2 stars 2 forks source link

Unable to use DOT_product for queries: "Neither 'Column' object nor 'comparator_factory' object has an attribute 'DOT_product'" #3

Open sapiriu opened 2 months ago

sapiriu commented 2 months ago

Hi,

I built a custom model and I'm trying to query IRIS with it, using the dot product on embeddings, but I get the following error (traceback below):

similarity_search_with_score_by_vector
    self.distance_strategy(embedding).label("distance")
    ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/User/path/venv/lib/python3.12/site-packages/langchain_iris/vectorstores.py", line 194, in distance_strategy
    return self.table.c.embedding.DOT_product
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/User/path/venv/lib/python3.12/site-packages/sqlalchemy/sql/elements.py", line 1498, in __getattr__
    raise AttributeError(
AttributeError: Neither 'Column' object nor 'comparator_factory' object has an attribute 'DOT_product'

My custom model that I defined:

class CustomIris(IRISVector):
    """``IRISVector`` subclass bound to a custom document table layout.

    Overrides the table definition, the row-to-document conversion and the
    vector similarity search so that dedicated columns (``company_id``,
    ``origin_id``, ``document_type``, ``is_hidden``) can be filtered directly.
    """

    def __init__(
            self, *args,
            collection_name="documents",
            distance_strategy="dot",
            dimension=1024,
            embedding_function=embeddings,
            **kwargs
    ):
        """Forward everything to ``IRISVector`` using this class's defaults.

        Args:
            *args: Positional arguments passed through to the parent.
            collection_name: Table name used by the ``table`` property.
            distance_strategy: Strategy identifier handed to the parent.
            dimension: Size of the embedding column.
            embedding_function: Embedding object handed to the parent.
            **kwargs: Extra keyword arguments passed through to the parent.
        """
        super().__init__(
            *args,
            collection_name=collection_name,
            embedding_function=embedding_function,
            distance_strategy=distance_strategy,
            dimension=dimension,
            **kwargs
        )

    @staticmethod
    def query_result_to_document(result):
        """Convert one query row into a ``Document``.

        The ``metadata`` text is decoded from JSON and the computed
        ``distance`` label, when present, is dropped before the remaining
        fields are passed to ``Document`` as keyword arguments.
        """
        fields = result._asdict()
        fields["metadata"] = json.loads(result.metadata)
        # "distance" is a query-time label, not a Document field.
        fields.pop("distance", None)
        return Document(**fields)  # Document is a custom Django ORM model

    @property
    def table(self) -> Table:
        """Return the SQLAlchemy ``Table`` describing the collection.

        The embedding column type depends on ``self.native_vector``:
        ``IRISVectorType`` when it is truthy, ``IRISListBuild`` otherwise.
        ``extend_existing=True`` lets the definition be declared again on
        the shared ``Base.metadata`` each time the property is read.
        """
        embedding_type = (
            IRISVectorType(self.dimension)
            if self.native_vector
            else IRISListBuild(self.dimension, float)
        )
        return Table(
            self.collection_name,
            Base.metadata,
            Column("id", INT, primary_key=True, nullable=False),
            Column("origin_id", VARCHAR(255), nullable=False),
            Column("company_id", INT, nullable=False),
            Column("url", VARCHAR(255), nullable=False),
            Column("title", VARCHAR(255), nullable=False),
            Column("text", TEXT, nullable=False),
            Column("embedding", embedding_type, nullable=False),
            Column("document_type", VARCHAR(255), nullable=False),
            Column("metadata", TEXT, nullable=False),
            Column("is_hidden", BOOLEAN, nullable=False),
            extend_existing=True,
        )

    def similarity_search_with_score_by_vector(
            self,
            embedding: List[float],
            k: int = 4,
            filter: Optional[dict] = None,
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` rows ranked first by the distance expression.

        Args:
            embedding: Query vector; every element is coerced to ``float``.
            k: Maximum number of rows to return.
            filter: Optional mapping of conditions combined with AND.
                ``company_id``, ``origin_id`` and ``is_hidden`` are compared
                for equality, ``document_type`` is matched with ``IN`` (so
                its value must be a collection), and any other key is
                matched as a ``"key": value`` JSON fragment inside the
                ``metadata`` text column.

        Returns:
            ``(document, score)`` pairs ordered by ascending ``distance``.
            ``score`` is the distance rounded to 15 decimal places, or
            ``None`` when ``self.embedding_function`` is ``None``.
        """
        filter_by = True
        if filter is not None:
            filter_clauses = []
            for key, value in filter.items():
                if key in ("company_id", "origin_id", "is_hidden"):
                    # Dedicated columns: plain equality on the named column.
                    filter_clauses.append(self.table.c[key] == value)
                elif key == "document_type":
                    filter_clauses.append(self.table.c.document_type.in_(value))
                else:
                    # Strip the surrounding braces of the one-item JSON
                    # object so the fragment can match anywhere inside the
                    # serialized metadata.
                    fragment = json.dumps({key: value})[1:-1]
                    filter_clauses.append(
                        self.table.c.metadata.like("%" + fragment + "%")
                    )
            # An empty filter dict yields no clauses; and_() without
            # arguments is deprecated in SQLAlchemy, so keep the
            # match-everything predicate in that case.
            if filter_clauses:
                filter_by = and_(*filter_clauses)

        embedding = [float(v) for v in embedding]

        # Execute the query and fetch the results
        with Session(self._conn) as session:
            results: Sequence[Row] = (
                session.query(
                    self.table,
                    (
                        self.distance_strategy(embedding).label("distance")
                        if self.native_vector
                        else self.table.c.embedding.func(
                            self.distance_strategy, embedding
                        ).label("distance")
                    ),
                )
                .filter(filter_by)
                .order_by(asc("distance"))
                .limit(k)
                .all()
            )

        has_embedder = self.embedding_function is not None
        return [
            (
                self.query_result_to_document(row),
                round(float(row.distance), 15) if has_embedder else None,
            )
            for row in results
        ]

How I call it:

# Build the store; every other setting comes from the CustomIris defaults.
iris = CustomIris(
    connection_string=CONNECTION_STRING,
)
# Text search restricted to rows whose company_id equals 45.
results = iris.similarity_search_with_score(
    "how to debug objectscript",
    filter={
        "company_id": 45,
    }
)
print(results)

Another small thing: I cannot import DistanceStrategy from vectorstores.py in order to use the same object, instead of hardcoding the string in my code.

If I use cosine distance, it works like a charm.

daimor commented 1 month ago

Sorry for the late response. Please check with the latest version.