googleapis / langchain-google-firestore-python

Apache License 2.0
21 stars 11 forks source link

Embeddings returned in payload from retriever #104

Open dqcloudadmin opened 1 month ago

dqcloudadmin commented 1 month ago

When searching with MMR through a LangChain retriever, the full embedding vectors are returned in the payload of each document.

Recommendation: pop the `embedding` field from the document data inside the `convert_firestore_document` function:

Working code:

def convert_firestore_document(
    document: DocumentSnapshot,
    page_content_fields: Optional[List[str]] = None,
    metadata_fields: Optional[List[str]] = None,
) -> Document:
    """Build a ``Document`` from a Firestore document snapshot.

    The snapshot's data is split into page content and metadata:

    * The ``'embedding'`` field, if present, is dropped so the raw vector is
      not carried along in the result.
    * Any remaining field whose value is a ``Vector`` is moved into the
      metadata.
    * ``metadata["reference"]`` always records the document's path.

    Args:
        document: Snapshot to convert. Its ``reference.path`` is stored in
            the metadata.
        page_content_fields: Field names to use as page content. When empty
            or ``None``, every remaining field that is not listed in
            ``metadata_fields`` is used.
        metadata_fields: Field names to copy into the metadata. When empty or
            ``None``, every remaining field that is not a page-content field
            is used.

    Returns:
        A ``Document`` whose ``page_content`` is the stringified value of the
        single page-content field, or a JSON object string when there are
        zero or several such fields.
    """
    # `to_dict()` yields None for a snapshot of a document that does not
    # exist; fall back to an empty mapping so the lookups below do not raise.
    data_doc = document.to_dict() or {}

    # Remove the 'embedding' field if it exists in data_doc
    data_doc.pop("embedding", None)

    metadata = {
        "reference": {
            "path": document.reference.path,
            FIRESTORE_TYPE: DOC_REF,
        }
    }

    # Check for vector fields and move them from the data_doc to the metadata.
    # The keys are collected first because data_doc is mutated in the loop.
    vector_keys = [k for k in data_doc if isinstance(data_doc[k], Vector)]
    for k in vector_keys:
        metadata[k] = _convert_from_firestore(data_doc.pop(k))

    # An explicit field list wins; otherwise each side defaults to "whatever
    # the other side did not claim".
    set_page_fields = set(
        page_content_fields or (data_doc.keys() - set(metadata_fields or []))
    )
    set_metadata_fields = set(metadata_fields or (data_doc.keys() - set_page_fields))

    page_content = {}

    # Sorted iteration keeps the key order (and so the JSON output) stable.
    # Requested fields that are absent from the document are skipped.
    for k in sorted(set_metadata_fields):
        if k in data_doc:
            metadata[k] = _convert_from_firestore(data_doc[k])
    for k in sorted(set_page_fields):
        if k in data_doc:
            page_content[k] = _convert_from_firestore(data_doc[k])

    if len(page_content) == 1:
        # A lone field is returned as its bare value rather than a JSON object.
        page_content = str(page_content.popitem()[1])  # type: ignore
    else:
        page_content = json.dumps(page_content)  # type: ignore

    return Document(page_content=page_content, metadata=metadata)  # type: ignore