dice-group / Ontolearn

Ontolearn is an open-source software library for explainable structured machine learning in Python. It learns OWL class expressions from positive and negative examples.
https://ontolearn-docs-dice-group.netlify.app/index.html
MIT License

Massive EvoLearner performance difference based on single knowledge base initialization #337

Closed. Demirrr closed this issue 8 months ago.

Demirrr commented 10 months ago

There is a bug when a single knowledge base instance is shared across multiple learners.

In the script below, a fresh knowledge base instance and learner are initialized for each learning problem:

# examples/concept_learning_evaluation.py
import json
import os
import time
import pandas as pd
from ontolearn.knowledge_base import KnowledgeBase
from ontolearn.concept_learner import CELOE, OCEL, EvoLearner
from ontolearn.learners import Drill, TDL
from ontolearn.learning_problem import PosNegLPStandard
from ontolearn.metrics import Accuracy, F1
from owlapy.model import OWLClass, OWLNamedIndividual, IRI
import argparse
from rdflib import Graph

pd.set_option("display.precision", 5)

def compute_f1_score(individuals, pos, neg):
    tp = len(pos.intersection(individuals))
    tn = len(neg.difference(individuals))

    fp = len(neg.intersection(individuals))
    fn = len(pos.difference(individuals))

    try:
        recall = tp / (tp + fn)
    except ZeroDivisionError:
        return 0

    try:
        precision = tp / (tp + fp)
    except ZeroDivisionError:
        return 0

    if precision == 0 or recall == 0:
        return 0

    f_1 = 2 * ((precision * recall) / (precision + recall))
    return f_1

def dl_concept_learning(args):
    with open(args.lps) as json_file:
        settings = json.load(json_file)

    kb = KnowledgeBase(path=args.kb)

    # dictionary to store the data
    data = dict()
    for str_target_concept, examples in settings['problems'].items():
        p = set(examples['positive_examples'])
        n = set(examples['negative_examples'])
        print('\n\n')

        print('Target concept: ', str_target_concept)
        data.setdefault("LP", []).append(str_target_concept)

        typed_pos = set(map(OWLNamedIndividual, map(IRI.create, p)))
        typed_neg = set(map(OWLNamedIndividual, map(IRI.create, n)))
        lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)

        print("OCEL starts..", end="\t")
        model = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
        start_time = time.time()
        pred_ocel = model.fit(lp).best_hypotheses(n=1)
        print("OCEL ends..", end="\t")
        rt_ocel = time.time() - start_time
        f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, pos=lp.pos, neg=lp.neg)
        data.setdefault("F1-OCEL", []).append(f1_ocel)
        data.setdefault("RT-OCEL", []).append(rt_ocel)
        print(f"OCEL Quality: {f1_ocel:.3f}", end="\t")
        print(f"OCEL Runtime: {rt_ocel:.3f}")

        print("CELOE starts..", end="\t")
        model = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
        start_time = time.time()
        pred_celoe = model.fit(lp).best_hypotheses(n=1)
        print("CELOE Ends..", end="\t")
        rt_celoe = time.time() - start_time
        f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, pos=lp.pos, neg=lp.neg)
        data.setdefault("F1-CELOE", []).append(f1_celoe)
        data.setdefault("RT-CELOE", []).append(rt_celoe)
        print(f"CELOE Quality: {f1_celoe:.3f}", end="\t")
        print(f"CELOE Runtime: {rt_celoe:.3f}")

        print("Evo starts..", end="\t")
        model = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
        start_time = time.time()
        pred_evo = model.fit(lp).best_hypotheses(n=1)
        print("Evo ends..", end="\t")
        rt_evo = time.time() - start_time
        f1_evo = compute_f1_score(individuals={i for i in kb.individuals(pred_evo.concept)}, pos=lp.pos, neg=lp.neg)
        data.setdefault("F1-Evo", []).append(f1_evo)
        data.setdefault("RT-Evo", []).append(rt_evo)
        print(f"Evo Quality: {f1_evo:.3f}", end="\t")
        print(f"Evo Runtime: {rt_evo:.3f}")

        print("DRILL starts..", end="\t")
        start_time = time.time()
        model = Drill(knowledge_base=KnowledgeBase(path=args.kb),
                      path_pretrained_kge=args.path_pretrained_kge,
                      quality_func=F1(),
                      max_runtime=args.max_runtime)
        pred_drill = model.fit(lp).best_hypotheses(n=1)
        print("DRILL ends..", end="\t")
        rt_drill = time.time() - start_time
        f1_drill = compute_f1_score(individuals=set(kb.individuals(pred_drill.concept)), pos=lp.pos, neg=lp.neg)
        data.setdefault("F1-DRILL", []).append(f1_drill)
        data.setdefault("RT-DRILL", []).append(rt_drill)
        print(f"DRILL Quality: {f1_drill:.3f}", end="\t")
        print(f"DRILL Runtime: {rt_drill:.3f}")

        print("TDL starts..", end="\t")
        start_time = time.time()
        model = TDL(knowledge_base=KnowledgeBase(path=args.kb), dataframe_triples=pd.DataFrame(
            data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)],
            columns=['subject', 'relation', 'object'], dtype=str).sort_values('subject'),
                    kwargs_classifier={"criterion": "gini", "random_state": 0},
                    max_runtime=args.max_runtime)
        pred_tdl = model.fit(lp).best_hypotheses(n=1)
        print("TDL ends..", end="\t")
        rt_tdl = time.time() - start_time
        # Compute quality of best prediction
        f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)}, pos=lp.pos, neg=lp.neg)
        data.setdefault("F1-TDL", []).append(f1_tdl)
        data.setdefault("RT-TDL", []).append(rt_tdl)
        print(f"TDL Quality: {f1_tdl:.3f}", end="\t")
        print(f"TDL Runtime: {rt_tdl:.3f}")

    df = pd.DataFrame.from_dict(data)
    df.to_csv(args.report, index=False)
    print(df)
    print(df.select_dtypes(include="number").mean())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Description Logic Concept Learning')

    parser.add_argument("--max_runtime", type=int, default=60)
    parser.add_argument("--lps", type=str, required=True)
    parser.add_argument("--kb", type=str, required=True)
    parser.add_argument("--path_pretrained_kge", type=str, default=None)
    parser.add_argument("--report", type=str, default="report.csv")
    dl_concept_learning(parser.parse_args())

In the script below, a single knowledge base instance is shared across all learners, and each learner is initialized only once:

# examples/faulty_concept_learning_evaluation.py
import json
import os
import time
import pandas as pd
from ontolearn.knowledge_base import KnowledgeBase
from ontolearn.concept_learner import CELOE, OCEL, EvoLearner
from ontolearn.learners import Drill, TDL
from ontolearn.learning_problem import PosNegLPStandard
from ontolearn.metrics import Accuracy, F1
from owlapy.model import OWLClass, OWLNamedIndividual, IRI
import argparse
from rdflib import Graph

pd.set_option("display.precision", 5)

def compute_f1_score(individuals, pos, neg):
    tp = len(pos.intersection(individuals))
    tn = len(neg.difference(individuals))

    fp = len(neg.intersection(individuals))
    fn = len(pos.difference(individuals))

    try:
        recall = tp / (tp + fn)
    except ZeroDivisionError:
        return 0

    try:
        precision = tp / (tp + fp)
    except ZeroDivisionError:
        return 0

    if precision == 0 or recall == 0:
        return 0

    f_1 = 2 * ((precision * recall) / (precision + recall))
    return f_1

def dl_concept_learning(args):
    with open(args.lps) as json_file:
        settings = json.load(json_file)

    kb = KnowledgeBase(path=args.kb)
    # Our ongoing work
    # kwargs_classifier is for sklearn.tree.DecisionTreeClassifier.html#sklearn-tree-decisiontreeclassifier
    tdl = TDL(knowledge_base=kb,
              # From rdflib into dataframe sorted by subject
              dataframe_triples=pd.DataFrame(
                  data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)],
                  columns=['subject', 'relation', 'object'], dtype=str).sort_values('subject'),
              kwargs_classifier={"criterion": "gini", "random_state": 0},
              max_runtime=args.max_runtime)

    drill = Drill(knowledge_base=kb,
                  path_pretrained_kge=args.path_pretrained_kge,
                  quality_func=F1(),
                  max_runtime=args.max_runtime)
    ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime)
    celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime)
    evo = EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime)

    # dictionary to store the data
    data = dict()
    for str_target_concept, examples in settings['problems'].items():
        p = set(examples['positive_examples'])
        n = set(examples['negative_examples'])
        print('\n\n')

        print('Target concept: ', str_target_concept)
        data.setdefault("LP", []).append(str_target_concept)

        typed_pos = set(map(OWLNamedIndividual, map(IRI.create, p)))
        typed_neg = set(map(OWLNamedIndividual, map(IRI.create, n)))
        lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)

        start_time = time.time()
        print("OCEL starts..", end="\t")
        pred_ocel = ocel.fit(lp).best_hypotheses(n=1)
        print("OCEL ends..", end="\t")
        rt_ocel = time.time() - start_time
        f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, pos=lp.pos, neg=lp.neg)
        print(f"OCEL Quality: {f1_ocel:.3f}")
        data.setdefault("F1-OCEL", []).append(f1_ocel)
        data.setdefault("RT-OCEL", []).append(rt_ocel)
        print(f"OCEL Runtime: {rt_ocel:.3f}")

        start_time = time.time()
        print("CELOE starts..", end="\t")
        pred_celoe = celoe.fit(lp).best_hypotheses(n=1)
        print("CELOE Ends..", end="\t")
        rt_celoe = time.time() - start_time
        f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, pos=lp.pos, neg=lp.neg)
        print(f"CELOE Quality: {f1_celoe:.3f}")
        data.setdefault("F1-CELOE", []).append(f1_celoe)
        data.setdefault("RT-CELOE", []).append(rt_celoe)
        print(f"CELOE Runtime: {rt_celoe:.3f}", end="\t")

        start_time = time.time()
        print("Evo starts..", end="\t")
        pred_evo = evo.fit(lp).best_hypotheses(n=1)
        print("Evo ends..", end="\t")
        rt_evo = time.time() - start_time
        f1_evo = compute_f1_score(individuals={i for i in kb.individuals(pred_evo.concept)}, pos=lp.pos, neg=lp.neg)
        print(f"Evo Quality: {f1_evo:.3f}")
        data.setdefault("F1-Evo", []).append(f1_evo)
        data.setdefault("RT-Evo", []).append(rt_evo)
        print(f"Evo Runtime: {rt_evo:.3f}", end="\t")

        start_time = time.time()
        print("DRILL starts..", end="\t")
        pred_drill = drill.fit(lp).best_hypotheses(n=1)
        print("DRILL ends..", end="\t")
        rt_drill = time.time() - start_time
        f1_drill = compute_f1_score(individuals=set(kb.individuals(pred_drill.concept)), pos=lp.pos, neg=lp.neg)
        print(f"DRILL Quality: {f1_drill:.3f}")
        data.setdefault("F1-DRILL", []).append(f1_drill)
        data.setdefault("RT-DRILL", []).append(rt_drill)
        print(f"DRILL Runtime: {rt_drill:.3f}", end="\t")

        start_time = time.time()
        # Get best prediction
        print("TDL starts..", end="\t")
        pred_tdl = tdl.fit(lp).best_hypotheses(n=1)
        print("TDL ends..", end="\t")
        rt_tdl = time.time() - start_time
        # Compute quality of best prediction
        f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)}, pos=lp.pos, neg=lp.neg)
        print(f"TDL Quality: {f1_tdl:.3f}", end="\t")
        print(f"TDL Runtime: {rt_tdl:.3f}")

        data.setdefault("F1-TDL", []).append(f1_tdl)
        data.setdefault("RT-TDL", []).append(rt_tdl)

    df = pd.DataFrame.from_dict(data)
    df.to_csv(args.report, index=False)
    print(df)
    print(df.select_dtypes(include="number").mean())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Description Logic Concept Learning')

    parser.add_argument("--max_runtime", type=int, default=60)
    parser.add_argument("--lps", type=str, required=True)
    parser.add_argument("--kb", type=str, required=True)
    parser.add_argument("--path_pretrained_kge", type=str, default=None)
    parser.add_argument("--report", type=str, default="report.csv")
    dl_concept_learning(parser.parse_args())

python examples/concept_learning_evaluation.py --lps LPs/Family/lps.json --kb KGs/Family/family-benchmark_rich_background.owl --max_runtime 60 --report family_results.csv && python -c 'import pandas as pd; print(pd.read_csv("family_results.csv", index_col=0).to_markdown(floatfmt=".3f"))'

| LP | F1-OCEL | RT-OCEL | F1-CELOE | RT-CELOE | F1-Evo | RT-Evo | F1-DRILL | RT-DRILL | F1-TDL | RT-TDL |
|---|---|---|---|---|---|---|---|---|---|---|
| Aunt | 0.837 | 16.651 | 0.911 | 10.527 | 1.000 | 2.393 | 0.882 | 60.090 | 1.000 | 0.609 |
| Brother | 1.000 | 0.023 | 1.000 | 0.020 | 1.000 | 0.784 | 1.000 | 0.113 | 1.000 | 0.516 |
| Cousin | 0.721 | 15.187 | 0.793 | 9.117 | 1.000 | 4.313 | 0.830 | 60.119 | 1.000 | 0.598 |
| Daughter | 1.000 | 0.016 | 1.000 | 0.010 | 1.000 | 0.562 | 1.000 | 0.113 | 1.000 | 0.409 |
| Father | 1.000 | 0.014 | 1.000 | 0.011 | 1.000 | 0.723 | 1.000 | 0.103 | 1.000 | 0.450 |
| Granddaughter | 1.000 | 0.014 | 1.000 | 0.011 | 1.000 | 0.516 | 1.000 | 0.112 | 1.000 | 0.628 |
| Grandfather | 1.000 | 0.013 | 1.000 | 0.012 | 1.000 | 0.468 | 1.000 | 0.105 | 1.000 | 0.427 |
| Grandgranddaughter | 1.000 | 0.017 | 1.000 | 0.019 | 1.000 | 0.443 | 1.000 | 0.097 | 1.000 | 0.518 |
| Grandgrandfather | 1.000 | 0.781 | 1.000 | 0.217 | 1.000 | 0.418 | 1.000 | 1.024 | 1.000 | 0.500 |
| Grandgrandmother | 1.000 | 0.608 | 1.000 | 0.263 | 1.000 | 0.655 | 1.000 | 1.172 | 1.000 | 0.503 |
| Grandgrandson | 1.000 | 0.838 | 1.000 | 0.342 | 1.000 | 0.549 | 1.000 | 0.255 | 1.000 | 0.554 |
| Grandmother | 1.000 | 0.013 | 1.000 | 0.012 | 1.000 | 0.466 | 1.000 | 0.099 | 1.000 | 0.385 |
| Grandson | 1.000 | 0.014 | 1.000 | 0.012 | 1.000 | 0.525 | 1.000 | 0.102 | 1.000 | 0.379 |
| Mother | 1.000 | 0.015 | 1.000 | 0.012 | 1.000 | 0.533 | 1.000 | 0.101 | 1.000 | 0.438 |
| PersonWithASibling | 1.000 | 0.013 | 1.000 | 0.010 | 1.000 | 0.583 | 0.737 | 60.096 | 1.000 | 0.496 |
| Sister | 1.000 | 0.014 | 1.000 | 0.011 | 1.000 | 0.842 | 1.000 | 0.111 | 1.000 | 0.427 |
| Son | 1.000 | 0.015 | 1.000 | 0.013 | 1.000 | 0.530 | 1.000 | 0.101 | 1.000 | 0.409 |
| Uncle | 0.905 | 22.261 | 0.905 | 9.054 | 1.000 | 1.154 | 0.938 | 60.090 | 1.000 | 0.489 |

python examples/faulty_concept_learning_evaluation.py --lps LPs/Family/lps.json --kb KGs/Family/family-benchmark_rich_background.owl --max_runtime 60 --report family_results.csv && python -c 'import pandas as pd; print(pd.read_csv("family_results.csv", index_col=0).to_markdown(floatfmt=".3f"))'

| LP | F1-OCEL | RT-OCEL | F1-CELOE | RT-CELOE | F1-Evo | RT-Evo | F1-DRILL | RT-DRILL | F1-TDL | RT-TDL |
|---|---|---|---|---|---|---|---|---|---|---|
| Aunt | 0.837 | 17.143 | 0.911 | 8.104 | 1.000 | 2.750 | 0.921 | 60.044 | 1.000 | 0.220 |
| Brother | 1.000 | 0.031 | 1.000 | 0.007 | 1.000 | 0.363 | 1.000 | 0.079 | 1.000 | 0.320 |
| Cousin | 0.721 | 11.408 | 0.793 | 8.930 | 0.348 | 0.291 | 0.861 | 60.067 | 1.000 | 0.241 |
| Daughter | 1.000 | 0.027 | 1.000 | 0.010 | 1.000 | 0.365 | 1.000 | 0.125 | 1.000 | 0.217 |
| Father | 1.000 | 0.004 | 1.000 | 0.002 | 1.000 | 0.346 | 1.000 | 0.008 | 1.000 | 0.230 |
| Granddaughter | 1.000 | 0.003 | 1.000 | 0.002 | 0.949 | 0.298 | 1.000 | 0.006 | 1.000 | 0.200 |
| Grandfather | 1.000 | 0.003 | 1.000 | 0.001 | 0.909 | 0.286 | 1.000 | 0.006 | 1.000 | 0.206 |
| Grandgranddaughter | 1.000 | 0.003 | 1.000 | 0.001 | 1.000 | 0.256 | 1.000 | 0.003 | 1.000 | 0.174 |
| Grandgrandfather | 1.000 | 0.803 | 1.000 | 0.162 | 0.944 | 0.407 | 1.000 | 0.474 | 1.000 | 0.173 |
| Grandgrandmother | 1.000 | 2.428 | 1.000 | 0.274 | 1.000 | 0.272 | 1.000 | 0.474 | 1.000 | 0.176 |
| Grandgrandson | 1.000 | 2.015 | 1.000 | 0.169 | 0.486 | 0.274 | 1.000 | 0.194 | 1.000 | 0.182 |
| Grandmother | 1.000 | 0.008 | 1.000 | 0.002 | 0.654 | 0.278 | 1.000 | 0.005 | 1.000 | 0.194 |
| Grandson | 1.000 | 0.003 | 1.000 | 0.002 | 0.451 | 0.278 | 1.000 | 0.006 | 1.000 | 0.329 |
| Mother | 1.000 | 0.004 | 1.000 | 0.002 | 0.510 | 0.294 | 1.000 | 0.009 | 1.000 | 0.225 |
| PersonWithASibling | 1.000 | 0.005 | 1.000 | 0.002 | 0.571 | 0.291 | 0.737 | 60.336 | 1.000 | 0.246 |
| Sister | 1.000 | 0.003 | 1.000 | 0.002 | 0.800 | 0.287 | 1.000 | 0.077 | 1.000 | 0.206 |
| Son | 1.000 | 0.004 | 1.000 | 0.002 | 0.556 | 0.279 | 1.000 | 0.007 | 1.000 | 0.221 |
| Uncle | 0.905 | 18.493 | 0.905 | 6.729 | 0.657 | 0.257 | 0.938 | 60.058 | 1.000 | 0.248 |
alkidbaci commented 8 months ago

I investigated this a bit and tried to narrow down the cause of the problem. I found out the following:

  1. Using a single kb with the models initialized outside the "for" loop, as in faulty_concept_learning_evaluation.py, does indeed give bad results.
  2. In another scenario, using a single kb again but initializing the model for every learning problem (i.e. inside the "for" loop), the quality improved and matched the results produced by concept_learning_evaluation.py (see the sketch after this list).
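
A minimal sketch of scenario (2), assuming the kb, settings, and compute_f1_score names from the scripts above (the max_runtime of 60 is just an example value):

# Scenario (2): one shared KnowledgeBase, but a fresh EvoLearner per learning problem.
for str_target_concept, examples in settings['problems'].items():
    typed_pos = set(map(OWLNamedIndividual, map(IRI.create, examples['positive_examples'])))
    typed_neg = set(map(OWLNamedIndividual, map(IRI.create, examples['negative_examples'])))
    lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)

    # Re-initializing the learner here, rather than once before the loop, restores the
    # quality reported by concept_learning_evaluation.py while still sharing kb.
    evo = EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=60)
    pred_evo = evo.fit(lp).best_hypotheses(n=1)
    f1_evo = compute_f1_score(individuals=set(kb.individuals(pred_evo.concept)), pos=lp.pos, neg=lp.neg)
    print(str_target_concept, f1_evo)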

There is also another interesting point. When accessing the quality of the hypothesis via h.quality, the reported quality is always 1.0 (as we expect it to be), no matter whether the model was defined before the "for" loop or inside it. For case (1), this contradicts the quality measured by the compute_f1_score method in the script. Inspecting the generated concept shows that compute_f1_score computes the correct quality and that the quality stored in the hypothesis node is wrong. So EvoLearner is essentially tricked into believing it has found the best concept, when in fact it has not.
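
A quick way to observe this discrepancy, sketched here under the assumption that the kb, lp, evo, and compute_f1_score names from faulty_concept_learning_evaluation.py are in scope:

# Compare the quality stored on the hypothesis node with the F1 recomputed via kb retrieval.
h = evo.fit(lp).best_hypotheses(n=1)
stored_quality = h.quality  # reported as 1.0 even when the concept is wrong
recomputed_f1 = compute_f1_score(individuals=set(kb.individuals(h.concept)), pos=lp.pos, neg=lp.neg)
print(stored_quality, recomputed_f1)  # these should agree, but diverge in case (1)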

This was only an initial investigation, and the points mentioned above need to be examined further.

Leaving this comment here for the record.