collective / plone-nuclia

Implementing Nuclia AI for Plone search
GNU General Public License v2.0
4 stars 2 forks source link

Analyze methods to build a Nuclia knowledge box #3

Open stevepiercy opened 1 year ago

stevepiercy commented 1 year ago

Potential strategies to build the knowledge box index.

stevepiercy commented 1 year ago

upload.py

import nucliadb_sdk
from nucliadb_sdk.client import NucliaDBClient
from nucliadb_models.text import TextFormat
import json
import glob
import hashlib
import os

# Nuclia API key, read from the NUA_KEY environment variable (None when unset).
API_KEY = os.getenv("NUA_KEY")

def generate_nuclia_sync():
    """Build a content snapshot of every Markdown file under ``./docs``.

    Returns:
        A dict of the form ``{"docs": {path: md5_hex_digest}}``, where
        ``path`` is the path as returned by ``glob`` (relative to the
        current working directory) and the digest is computed over the
        raw bytes of the file.
    """
    digests = {}
    # ``**`` only descends into nested directories when ``recursive=True``;
    # without it the pattern behaves like ``./docs/*/*.md`` and silently
    # skips both top-level files and anything nested more than one level.
    # Sorting keeps the resulting mapping (and the JSON written from it)
    # stable across runs and filesystems.
    for doc in sorted(glob.glob("./docs/**/*.md", recursive=True)):
        if not os.path.isfile(doc):
            # A directory whose name happens to end in ".md".
            continue
        with open(doc, "rb") as md_file:
            # MD5 is used purely as a change detector, not for security.
            digests[doc] = hashlib.md5(md_file.read()).hexdigest()
    return {"docs": digests}

def upload_doc(kb: nucliadb_sdk.KnowledgeBox, doc: str):
    """Upload one Markdown file to the knowledge box.

    Args:
        kb: Knowledge box the document is uploaded to.
        doc: Path of the Markdown file to read and upload.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, which may fail or mis-decode non-ASCII text.
    # NOTE(review): assumes the docs are stored as UTF-8 - confirm.
    with open(doc, "r", encoding="utf-8") as md_file:
        text = md_file.read()
    # Derive the resource slug from the path: drop leading/trailing "." and
    # "/" characters, then turn the remaining "/" and "." into "-"
    # (e.g. "./docs/a/b.md" -> "docs-a-b-md").
    slug = doc.strip(".").strip("/").replace("/", "-").replace(".", "-")
    kb.upload(slug, text=text, format=TextFormat.MARKDOWN)

def sync():
    """Upload new or changed docs and refresh ``nuclia_sync.json``.

    Compares the digests recorded in ``nuclia_sync.json`` by the previous
    run against a fresh snapshot from ``generate_nuclia_sync()``, uploads
    every document that is new or whose digest changed, and then rewrites
    ``nuclia_sync.json`` with the fresh snapshot.
    """
    # Get all pages uploaded and last sync. On the very first run there is
    # no state file yet; treat that as "nothing uploaded so far".
    try:
        with open("nuclia_sync.json", "r") as sync_info:
            old_docs = json.load(sync_info).get("docs", {})
    except FileNotFoundError:
        old_docs = {}
    new_data = generate_nuclia_sync()
    new_docs = new_data["docs"]

    client = NucliaDBClient(
        url="https://europe-1.nuclia.cloud/api/v1/kb/df8b4c24-2807-4888-ad6c-ae97357a638b",
        api_key=API_KEY,
    )
    kb = nucliadb_sdk.KnowledgeBox(client)

    # Docs recorded by the previous run that no longer exist locally.
    to_delete = [doc for doc in old_docs if doc not in new_docs]

    # A doc that was never uploaded has no recorded digest (None), so one
    # comparison covers both the "new" and the "changed" case.
    for doc, digest in new_docs.items():
        if old_docs.get(doc) != digest:
            upload_doc(kb, doc)

    if to_delete:
        # NOTE(review): these resources are not removed from the knowledge
        # box here, and they disappear from the state file written below.
        # Report them so they are not silently forgotten.
        print("Docs removed locally but NOT deleted from the knowledge box:")
        for doc in to_delete:
            print(f"  {doc}")

    with open("nuclia_sync.json", "w") as sync_info:
        json.dump(new_data, sync_info)
    print("Remember to do a make upload-sync to make sure we update status")

# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    sync()