Open stevepiercy opened 1 year ago
upload.py
import nucliadb_sdk
from nucliadb_sdk.client import NucliaDBClient
from nucliadb_models.text import TextFormat
import json
import glob
import hashlib
import os
# API key handed to NucliaDBClient; read from the NUA_KEY environment
# variable and None when that variable is not set.
API_KEY = os.getenv("NUA_KEY")
def generate_nuclia_sync():
    """Build a snapshot of the Markdown docs and their content digests.

    Returns:
        A dict of the form ``{"docs": {path: md5_hexdigest}}`` with one
        entry per ``*.md`` file found under ``./docs`` at any depth.
    """
    snapshot = {"docs": {}}
    # ``**`` only spans directory levels when recursive=True; without it the
    # pattern behaves like ``*`` and matches files exactly one level deep,
    # missing both ./docs/x.md and anything nested further down.
    # Sorting keeps the snapshot (and the JSON written from it) stable.
    for doc in sorted(glob.glob("./docs/**/*.md", recursive=True)):
        # Use a context manager so the file handle is closed promptly.
        with open(doc, "rb") as md_file:
            digest = hashlib.md5(md_file.read()).hexdigest()
        snapshot["docs"][doc] = digest
    return snapshot
def upload_doc(kb: nucliadb_sdk.KnowledgeBox, doc: str):
    """Upload one Markdown file to the knowledge box.

    Args:
        kb: Knowledge box whose ``upload`` method receives the document.
        doc: Path of the Markdown file, e.g. ``./docs/intro.md``.
    """
    # Decode explicitly as UTF-8 so the uploaded text does not depend on the
    # platform's default locale encoding.
    with open(doc, "r", encoding="utf-8") as md_file:
        text = md_file.read()
    # Derive the slug from the path: trim "." and then "/" from both ends,
    # and turn the remaining separators and dots into "-"
    # (./docs/intro.md -> docs-intro-md). Kept as-is so slugs stay stable.
    slug = doc.strip(".").strip("/").replace("/", "-").replace(".", "-")
    kb.upload(slug, text=text, format=TextFormat.MARKDOWN)
def _load_previous_sync(path):
    """Return the saved sync state, or an empty one when none exists yet."""
    try:
        with open(path, "r", encoding="utf-8") as sync_info:
            return json.load(sync_info)
    except FileNotFoundError:
        # First run: nothing was uploaded before, so every doc counts as new.
        return {"docs": {}}


def sync():
    """Upload new or changed docs and record the new sync state.

    Compares the digests stored in ``nuclia_sync.json`` with a fresh
    snapshot from ``generate_nuclia_sync()``, uploads every doc that is new
    or whose digest changed, then overwrites ``nuclia_sync.json`` with the
    fresh snapshot.
    """
    # Get all pages uploaded and last sync
    old_docs = _load_previous_sync("nuclia_sync.json").get("docs", {})
    new_data = generate_nuclia_sync()
    client = NucliaDBClient(
        url="https://europe-1.nuclia.cloud/api/v1/kb/df8b4c24-2807-4888-ad6c-ae97357a638b",
        api_key=API_KEY,
    )
    kb = nucliadb_sdk.KnowledgeBox(client)

    # Docs recorded by the last sync that no longer exist locally.
    # TODO: these are not deleted from the knowledge box; wire up the SDK's
    # deletion call once confirmed. Report them here, because the state file
    # written below no longer mentions them.
    to_delete = [doc for doc in old_docs if doc not in new_data["docs"]]
    for doc in to_delete:
        print(f"Stale document (not removed from the knowledge box): {doc}")

    for doc, digest in new_data["docs"].items():
        # A missing entry yields None, which never equals a digest string,
        # so new and modified docs are both handled by this one comparison.
        if old_docs.get(doc) != digest:
            upload_doc(kb, doc)

    with open("nuclia_sync.json", "w", encoding="utf-8") as sync_info:
        json.dump(new_data, sync_info)
    print("Remember to do a make upload-sync to make sure we update status")
# Run the synchronisation only when executed as a script, not on import.
if __name__ == "__main__":
    sync()
Potential strategies to build the knowledge box index.
`make text` outputs plain text files in plone/documentation.
The `sitemap.xml` files, https://6.docs.plone.org/sitemap.xml, https://training.plone.org/sitemap.xml, and https://2022.training.plone.org/sitemap.xml, although they do not include modification times.