vliz-be-opsci / py-trav-harv

Python module that allows an end user to perform link traversal on a triple store.
0 stars 0 forks source link

provide proper prefix support in the travharv config #35

Open marc-portier opened 2 months ago

marc-portier commented 2 months ago

the prefix config in the yml should extend to

An updated test YAML file should show that all of this actually works (and, if needed, implementation fixes should make it work).

marc-portier commented 2 months ago

quick separate exercise showing how the config entries in the yml can be normalised using the prefix declarations

from typing import Dict, List
import validators
import re
from re import Match
from rdflib import Namespace, Graph, URIRef
from rdflib.namespace import NamespaceManager
# see https://rdflib.readthedocs.io/en/stable/namespaces_and_bindings.html

def makeNSM(pfx_declarations: Dict[str, str]) -> NamespaceManager:
    """Build a NamespaceManager holding the given prefix declarations.

    Args:
        pfx_declarations: mapping of prefix name to namespace base URI, as
            declared in the config.

    Returns:
        A NamespaceManager (backed by a fresh, empty Graph) to which every
        declared prefix has been bound.
    """
    # Wrap each base URI in a Namespace so it can be bound below.
    pfxs = {k: Namespace(v) for k, v in pfx_declarations.items()}
    print(f"{pfxs=}")

    # bind_namespaces="none": per the rdflib namespace docs this should start
    # the manager without rdflib's default bindings, leaving only ours.
    nsm = NamespaceManager(Graph(), bind_namespaces="none")
    for pf, ns in pfxs.items():
        # override=True asks rdflib to let this declaration replace any
        # existing binding for the same prefix.
        nsm.bind(pf, ns, override=True)
    print(f"{list(nsm.namespaces())=}")
    return nsm

def resolve_uri(uri: str, nsm: NamespaceManager) -> URIRef:
    """Turn a config entry into a URIRef.

    Entries for which ``validators.url`` gives a truthy result are taken as
    full URIs and wrapped as-is; everything else is handed to
    ``nsm.expand_curie`` to be expanded as a prefixed name.
    """
    # TODO reconsider the validators trick -- we might want to explicitly
    # demand <> surrounding the <uri>
    if validators.url(uri):
        return URIRef(uri)
    return nsm.expand_curie(uri)

def resolve_literals(literal_uris: List[str], nsm: NamespaceManager) -> List[URIRef]:
    """Resolve every entry of ``literal_uris`` through ``resolve_uri``.

    The result keeps the order of the input list.
    """
    resolved = []
    for entry in literal_uris:
        resolved.append(resolve_uri(entry, nsm))
    return resolved

def resolve_sparql(sparql, nsm):
    """Prepend a PREFIX declaration for every namespace known to ``nsm``.

    One ``PREFIX <name>: <n3-form>`` line is emitted per (prefix, namespace)
    pair yielded by ``nsm.namespaces()``; the given query text follows on the
    next line.
    """
    declarations = []
    for prefix, namespace in nsm.namespaces():
        declarations.append(f"PREFIX {prefix}: {namespace.n3()}")
    header = "\n".join(declarations)
    # The newline between header and query is always emitted, even when
    # there are no declarations at all.
    return "{}\n{}".format(header, sparql)

PPATH_RE: str = r'(([^<>\/\s]+)|<([^>]+)>)\s*\/'  # how to match parts of property-paths

def ppath_split(ppath: str) -> List[str]:
    """Split a property-path string into its "/"-separated parts.

    Each part is either a bare token containing no ``<``, ``>``, ``/`` or
    whitespace (regex group 2), or the text found between ``<`` and ``>``
    with the angle brackets removed (regex group 3).

    Args:
        ppath: the property path, e.g. ``"<https://demo.me/x> / ex:some"``.

    Returns:
        The parts in order of appearance, as a list (so the result can be
        indexed, measured and iterated more than once, as the annotation
        promises). An empty input yields an empty list.
    """
    # PPATH_RE requires every part to be followed by "/", so a trailing "/"
    # is appended to make the last part match as well.
    return [m.group(2) or m.group(3) for m in re.finditer(pattern=PPATH_RE, string=ppath + "/")]

def resolve_ppaths(ppaths: List[str], nsm: NamespaceManager):
    """Normalise property paths so every part is written in its n3 form.

    Each path is split with ``ppath_split``; every part is resolved with
    ``resolve_uri`` and rendered via ``.n3()``; the rendered parts are then
    joined back together with ``" / "``. Returns one string per input path,
    in input order.
    """
    normalised = []
    for path in ppaths:
        steps = [resolve_uri(step, nsm).n3() for step in ppath_split(path)]
        normalised.append(" / ".join(steps))
    return normalised

def do():
    """Run the exercise: normalise sample yml config entries via prefixes.

    Builds a NamespaceManager from sample prefix declarations, then resolves
    sample literal URIs, a SPARQL snippet and property paths against it,
    printing each result.
    """
    yml_pfx_declarations = dict(
        # Base URIs must end in their separator ("/" here): a prefixed name
        # is expanded by plain concatenation of base and local name, so
        # "https://schema.org" would turn schema:DataSet into
        # "https://schema.orgDataSet".
        schema="https://schema.org/",
        ex="https://example.org/",
    )
    yml_literals = [
        "ex:test",
        "schema:DataSet",
        "https://demo.me/whatever",
    ]
    # The WHERE clause needs braces around its pattern to be valid SPARQL.
    yml_sparql = """select * where { ?s schema:name ?n . }"""
    yml_ppaths = [
        "<https://demo.me/whatever> / ex:some",
        "ex:some",
        "<https://demo.me/whatever>",
        "schema:owner / schema:name",
    ]

    # make actual namespaces that can be used
    nsm: NamespaceManager = makeNSM(yml_pfx_declarations)

    literals = resolve_literals(yml_literals, nsm)
    print(f"{literals=}")
    sparql = resolve_sparql(yml_sparql, nsm)
    print(f"{sparql=}")
    ppaths = resolve_ppaths(yml_ppaths, nsm)
    print(f"{ppaths=}")
marc-portier commented 1 month ago

waiting for PR #51 to get merged with main branch