FIAF / fiafcore

FIAFcore is an ontology for film archives, based primarily on the FIAF Cataloguing Manual.
Creative Commons Attribution 4.0 International
5 stars 2 forks source link

Audit mandatory attributes #10

Open paulduchesne opened 10 months ago

paulduchesne commented 10 months ago

Small script to audit whether each FIAFcore class has a label, description and source in English, Spanish and French.

import pandas
import pathlib
import rdflib

# Audit every owl:Class in the local FIAFcore checkout: for each class, count
# how many rdfs:label, dc:description and dc:source values exist per language,
# then print the per-attribute totals across all classes.
fiafcore_path = pathlib.Path.home() / 'git' / 'FIAFcore' / 'classes'
frags = [x for x in fiafcore_path.rglob('*') if x.suffix == '.ttl']

# Parse each Turtle fragment straight into one shared graph.
graph = rdflib.Graph()
for frag in sorted(frags):
    graph.parse(frag, format='ttl')

fiafcore_classes = [s for s, p, o in graph.triples((None, rdflib.RDF.type, rdflib.OWL.Class))]

# Loop invariants: the attributes and language tags being audited.
audited_attributes = [
    rdflib.RDFS.label,
    rdflib.URIRef('http://purl.org/dc/elements/1.1/description'),
    rdflib.URIRef('http://purl.org/dc/elements/1.1/source'),
]
audited_languages = ['en', 'es', 'fr']

# Collect rows in a plain list and build the frame once; appending with
# df.loc[len(df)] re-allocates the frame on every row.
rows = []
for entity in sorted(fiafcore_classes):
    for attribute in audited_attributes:
        for language in audited_languages:
            # Only Literals carry a language tag; URIRef/BNode objects have no
            # `language` attribute, so read it defensively instead of crashing.
            matches = [
                o for o in graph.objects(entity, attribute)
                if getattr(o, 'language', None) == language
            ]
            rows.append([entity, f'{attribute}_{language}', len(matches)])

df = pandas.DataFrame(rows, columns=['entity', 'attribute', 'result'])

# Sanity check: each count should be 0 (missing) or 1 (present); any other
# value means an entity has several values for one attribute/language pair.
print(df.result.unique())
pivoted_df = df.pivot(index='entity', columns='attribute', values='result').reset_index()

# One line per attribute/language column: how many classes have it.
for column in pivoted_df.columns.values:
    if column != 'entity':
        print(column, pivoted_df[column].sum())
paulduchesne commented 10 months ago

Variant that charts attribute coverage per language, for the main branch and for the v1.1.0 release, to show v1.1 progress.

from IPython.display import display
import altair
import pandas
import pathlib
import rdflib

def plot_graph(source):
    """Chart how completely FIAFcore classes are labelled, described and sourced.

    Parses the Turtle document at ``source``, and for every ``owl:Class``
    counts the ``rdfs:label``, ``dc:description`` and ``dc:source`` values
    tagged 'en', 'es' and 'fr'. Prints ``source`` and displays a bar chart of
    the fraction of classes that have each attribute/language combination.

    Args:
        source: Location of the Turtle document, passed to
            ``rdflib.Graph.parse`` (the calls in this file pass URLs).

    Raises:
        Exception: If any class has more than one value for the same
            attribute/language pair.
    """
    graph = rdflib.Graph().parse(source, format='ttl')
    fiafcore_classes = [s for s, p, o in graph.triples((None, rdflib.RDF.type, rdflib.OWL.Class))]

    audited_attributes = [
        rdflib.RDFS.label,
        rdflib.URIRef('http://purl.org/dc/elements/1.1/description'),
        rdflib.URIRef('http://purl.org/dc/elements/1.1/source'),
    ]
    audited_languages = ['en', 'es', 'fr']

    # Collect rows in a list and build the frame once; appending with
    # df.loc[len(df)] re-allocates the frame on every row.
    rows = []
    for entity in sorted(fiafcore_classes):
        for attribute in audited_attributes:
            for language in audited_languages:
                # Only Literals carry a language tag; URIRef/BNode objects
                # have no `language` attribute, so read it defensively.
                matches = [
                    o for o in graph.objects(entity, attribute)
                    if getattr(o, 'language', None) == language
                ]
                rows.append([entity, f'{attribute}_{language}', len(matches)])
    df = pandas.DataFrame(rows, columns=['entity', 'attribute', 'result'])

    # Each count must be 0 (missing) or 1 (present); anything else would make
    # the coverage fractions below exceed 100%.
    duplicates = df.loc[~df.result.isin([1, 0])]
    if len(duplicates):
        raise Exception('Multiple attributes per entity found.')

    pivoted_df = df.pivot(index='entity', columns='attribute', values='result').reset_index()
    entity_count = len(pivoted_df.entity.unique())

    # The chart category is the last path segment of the predicate IRI plus
    # the language suffix (e.g. 'description_en').
    coverage = [
        [pathlib.Path(column).stem, pivoted_df[column].sum() / entity_count]
        for column in pivoted_df.columns.values
        if column != 'entity'
    ]
    cooked_df = pandas.DataFrame(coverage, columns=['category', 'count'])

    print(source)
    display(altair.Chart(cooked_df, title='').mark_bar().encode(
        y='category', x=altair.X('count', axis=altair.Axis(format='%'))).properties(width=800, height=300))

# Chart attribute coverage for the main branch first, then for the v1.1.0 tag.
for ttl_url in (
    'https://raw.githubusercontent.com/FIAF/FIAFcore/main/FIAFcore.ttl',
    'https://raw.githubusercontent.com/FIAF/FIAFcore/v1.1.0/FIAFcore.ttl',
):
    plot_graph(ttl_url)