Closed bigfatnoob closed 9 years ago
Use sdiv to weight things; try tiny=2.
Try NOT rejecting features categorically. Rather, weight all features by NORMALIZED sdiv scores inside the Euclidean distance.
def sdiv(lst, tiny=3, cohen=0.3,
         num1=lambda x: x[0], num2=lambda x: x[1]):
    """Divide lst of (num1,num2) using variance of num2.

    The items are sorted by ``num1`` and then split recursively at the index
    that gives the lowest size-weighted standard deviation of ``num2`` over
    the two sides.

    Args:
        lst: sequence of items; it is iterated, sorted and passed to ``len``.
        tiny: a cut is only considered when both sides hold more than
            ``tiny`` items.
        cohen: a cut is only accepted when the ``num2`` means of the two
            sides differ by at least ``cohen`` times the standard deviation
            of ``num2`` over the whole of ``lst``.
        num1: accessor for the value the items are sorted (and cut) on.
        num2: accessor for the value whose spread is being minimised.

    Returns:
        A list of ``(weighted_sd, items)`` tuples, one per final bin, in
        ascending ``num1`` order. ``weighted_sd`` is the standard deviation
        of ``num2`` inside the bin multiplied by ``len(items) / len(lst)``.
        An empty ``lst`` yields an empty list.
    """
    # ----------------------------------------------
    class Counts:
        """Incrementally add/delete numbers, tracking n, mean and m2."""

        def __init__(self, inits=()):
            self.zero()
            for number in inits:
                self.add(number)

        def zero(self):
            self.n = self.mu = self.m2 = 0.0

        def sd(self):
            # Fewer than two samples have no spread; reporting the mean
            # here would leak a num2 value into the returned scores.
            if self.n < 2:
                return 0.0
            # max() guards against m2 drifting slightly below zero after
            # a run of floating-point removals.
            return (max(0, self.m2) * 1.0 / (self.n - 1)) ** 0.5

        def add(self, x):
            self.n += 1
            delta = x - self.mu
            self.mu += delta / (1.0 * self.n)
            self.m2 += delta * (x - self.mu)

        def sub(self, x):
            # Removing the last remaining sample just resets the counters.
            if self.n < 2:
                self.zero()
                return
            self.n -= 1
            delta = x - self.mu
            self.mu -= delta / (1.0 * self.n)
            self.m2 -= delta * (x - self.mu)

    # ----------------------------------------------
    def divide(this, small):
        """Find the best cut of 'this'; return (cut index or None, sd)."""
        lhs, rhs = Counts(), Counts(num2(x) for x in this)
        n0, least, cut = 1.0 * rhs.n, rhs.sd(), None
        for j, x in enumerate(this):
            # Evaluate the cut *before* item j moves from rhs to lhs, so a
            # cut at j means this[:j] versus this[j:].
            if lhs.n > tiny and rhs.n > tiny:
                maybe = lhs.n / n0 * lhs.sd() + rhs.n / n0 * rhs.sd()
                if maybe < least and abs(lhs.mu - rhs.mu) >= small:
                    cut, least = j, maybe
            rhs.sub(num2(x))
            lhs.add(num2(x))
        return cut, least

    # ----------------------------------------------
    def recurse(this, small, cuts):
        """Split 'this' while a useful cut exists; collect leaves in cuts."""
        cut, sd = divide(this, small)
        # Truthiness is deliberate: a cut of None or 0 both mean "no split",
        # which also prevents recursing forever on an empty left side.
        if cut:
            recurse(this[:cut], small, cuts)
            recurse(this[cut:], small, cuts)
        else:
            cuts.append((sd * len(this) / len(lst), this))
        return cuts

    # ---| main |-----------------------------------
    if not lst:
        return []
    small = Counts(num2(x) for x in lst).sd() * cohen
    return recurse(sorted(lst, key=num1), small, [])
def fss(d=None, want=0.25):
    """Rank the columns of ``d`` with ``sdiv`` and flatten the weakest ones.

    Each column index in ``range(d.sfem)`` is scored by summing the first
    element of every tuple ``sdiv`` returns when the projects are sorted on
    that column and divided on ``d.effort``. Columns are ranked by ascending
    score; the first ``int(len(rank) * want)`` are kept and every other
    column is overwritten with the constant 3 in every project.

    Args:
        d: dataset object exposing ``sfem``, ``effort`` and ``projects``.
            When omitted, a new one is built with ``newCIIdataDeTune()`` on
            each call, so repeated calls do not keep mutating one shared
            default instance.
        want: fraction of the ranked columns to keep.

    Returns:
        The same ``d`` object, with its projects modified in place.
    """
    if d is None:
        d = newCIIdataDeTune()
    rank = []
    for i in range(d.sfem):
        xs = sdiv(d.projects,
                  num1=lambda x, i=i: x[i],
                  num2=lambda x: x[d.effort])
        # sdiv may hand back nothing for an empty project list.
        xpect = sum(x[0] for x in xs or [])
        rank.append((xpect, i))
    rank.sort()
    keep = int(len(rank) * want)
    # Must be a real list: it is re-iterated once per project, and a lazy
    # map() would be exhausted after the first project on Python 3.
    doomed = [col for _, col in rank[keep:]]
    for project in d.projects:
        for col in doomed:
            # NOTE(review): 3 is presumably the "nominal" rating for these
            # attributes -- confirm against the dataset definition.
            project[col] = 3
    return d
After clustering on the first run, look for attributes that don't matter. Then start again from scratch and use only the attributes that matter (or modify the distance function based on the right attributes).