Different algorithm options

As discussed some time ago, it makes sense to offer several weak supervision algorithms, just as there are many machine learning algorithms rather than only one.

A suggestion by Jens was something like the following:

def calc_add(values,taken_label):
    """Sum the weighted precision of the supporting votes for ``taken_label``.

    Only records whose ``"label"`` equals ``taken_label`` are considered. The
    first record carrying the highest ``"precision"`` among them is left out;
    the ``"weighted_precision"`` of every other one is added up. The caller
    scales this sum by ``1 - max_precision`` to push the confidence upwards.

    Returns 0 when there is nothing to add.
    """
    same_label = [rec for rec in values if rec["label"] == taken_label]
    # -1 is the floor used when no record carries the label.
    top_precision = max([-1] + [rec["precision"] for rec in same_label])
    # Position of the single top vote that must not be counted again.
    skip_at = next(
        (pos for pos, rec in enumerate(same_label) if rec["precision"] == top_precision),
        None,
    )
    return sum(
        rec["weighted_precision"]
        for pos, rec in enumerate(same_label)
        if pos != skip_at
    )

from collections import defaultdict

def calc_confidence(values):
    """Pick the winning label among ``values`` and compute its confidence.

    Each record in ``values`` is a dict with a ``"label"`` and a numeric
    ``"precision"``. The steps are:

    1. Every record gets ``weighted_precision = precision / sum(precisions)
       * max(precisions)``.
    2. The weighted precisions are summed per label; the label with the
       largest sum wins (the first one seen wins a tie).
    3. If the winner has more than one record and the highest precision is
       not 1, the confidence is raised by ``(1 - max_precision)`` times the
       value returned by ``calc_add`` for that label.

    Side effect: a ``"weighted_precision"`` key is written into every record
    of ``values``; ``calc_add`` reads it.

    Returns:
        ``(label, confidence)``, or ``(None, None)`` when ``values`` is empty.
        If all precisions are 0 the first label is returned with confidence
        0.0 instead of raising ``ZeroDivisionError``.
    """
    if not values:
        return None, None

    sum_precision = sum(a["precision"] for a in values)
    max_precision = max(a["precision"] for a in values)

    options = defaultdict(float)
    for a in values:
        if sum_precision:
            a["weighted_precision"] = a["precision"] / sum_precision * max_precision
        else:
            # All precisions are 0: nothing to distribute, avoid dividing by 0.
            a["weighted_precision"] = 0.0
        options[a["label"]] += a["weighted_precision"]

    # max() keeps the first label on ties, like a strict "<" scan would.
    max_label = max(options, key=options.get)
    confidence = options[max_label]

    votes_for_winner = sum(1 for a in values if a["label"] == max_label)
    if votes_for_winner > 1 and max_precision != 1:
        # The guard does not change the result (the boost would be 0 anyway);
        # it only skips needless work.
        confidence += (1 - max_precision) * calc_add(values, max_label)
    return max_label, confidence

where the records could look something like this:

# "precision" is the final precision: for lf it is the precision itself, for al it is precision * confidence

# Example scenarios: scenario name -> list of votes (label + final precision).
tests = {
    "testA": [
        {"label": "blue", "precision": 0.8},
        {"label": "red", "precision": 0.1},
    ],
    "testA_x": [
        {"label": "blue", "precision": 0.8},
        {"label": "red", "precision": 0.6},
    ],
    "testA2": [
        {"label": "blue", "precision": 0.2},
        {"label": "red", "precision": 0.1},
    ],
    "testB": [
        {"label": "blue", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 0.2},
    ],
    "testB2": [
        {"label": "blue", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 0.2},
        {"label": "blue", "precision": 0.3},
    ],
    "testB3": [
        {"label": "blue", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 0.2},
        {"label": "blue", "precision": 0.1},
    ],
    "testC": [
        {"label": "blue", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 1},
    ],
    "testC2": [
        {"label": "red", "precision": 0.8},
        {"label": "red", "precision": 0.5},
        {"label": "blue", "precision": 1},
    ],
    "testC3": [
        {"label": "red", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 0.1},
    ],
    "testD": [
        {"label": "blue", "precision": 0.8},
        {"label": "red", "precision": 0.5},
        {"label": "blue", "precision": 1},
    ],
    "testE": [
        {"label": "blue", "precision": 0.5},
        {"label": "red", "precision": 0.5},
        {"label": "green", "precision": 1},
    ],
    "testE_x": [
        {"label": "blue", "precision": 0.5},
        {"label": "red", "precision": 0.5},
        {"label": "green", "precision": 0.9},
    ],
    "testE2": [
        {"label": "blue", "precision": 0.5},
        {"label": "red", "precision": 0.5},
        {"label": "green", "precision": 0.5},
    ],
    "testE3": [
        {"label": "blue", "precision": 0.5},
        {"label": "red", "precision": 0.51},
        {"label": "green", "precision": 0.5},
    ],
    "testE4": [
        {"label": "blue", "precision": 1},
        {"label": "red", "precision": 1},
        {"label": "green", "precision": 1},
    ],
    "testE5": [
        {"label": "blue", "precision": 0.1},
        {"label": "red", "precision": 0.1},
        {"label": "green", "precision": 0.1},
    ],
    "testF": [
        {"label": "red", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 0.3},
    ],
    "testF2": [
        {"label": "red", "precision": 0.8},
        {"label": "blue", "precision": 0.5},
        {"label": "blue", "precision": 0.31},
    ],
    "testN": [
        {"label": "blue", "precision": 1},
    ],
    "testNix": [],
}

# Print each scenario name with the (label, confidence) pair computed for it.
for scenario, records in tests.items():
    print(scenario, calc_confidence(records))

I think we should look into this in the near future, to provide different options.

code-kern-ai / weak-nlp

Different algorithm options #2