ai-se / george

stuff for george
MIT License
1 stars 0 forks source link

Better feature weighting #5

Closed bigfatnoob closed 9 years ago

bigfatnoob commented 10 years ago

After clustering on the first run, look for attributes that don't matter. Then start again from scratch and use only the attributes that matter (or modify the distance function based on the right attributes).

timm commented 9 years ago

Use sdiv to weight things; try tiny=2.

Try NOT using categorical rejection of features. Rather, weight all features by NORMALIZED sdiv scores inside the Euclidean distance.

def sdiv(lst, tiny=3, cohen=0.3,
         num1=lambda x: x[0], num2=lambda x: x[1]):
    """Divide lst of (num1,num2) using variance of num2.

    The items are sorted by ``num1`` and then split recursively at the
    index that most reduces the size-weighted standard deviation of
    ``num2`` on the two sides of the split.

    Args:
        lst: sequence of items; ``num1(item)`` and ``num2(item)`` must
            both be numbers.
        tiny: a split is only considered while BOTH sides hold more than
            ``tiny`` items.
        cohen: a split is only accepted if the means of ``num2`` on the
            two sides differ by at least ``cohen`` times the standard
            deviation of ``num2`` over the whole of ``lst``.
        num1: accessor for the value the items are sorted by.
        num2: accessor for the value whose variance drives the splits.

    Returns:
        A list of ``(score, chunk)`` pairs in ``num1`` order, where
        ``chunk`` is a slice of the sorted items and ``score`` is the
        standard deviation of ``num2`` in that chunk multiplied by
        ``len(chunk) / len(lst)``.  An empty ``lst`` yields ``[]``.
    """
    # ----------------------------------------------
    class Counts(object):
        """Running n / mean / sum of squared deviations (Welford update)."""

        def __init__(self, inits=()):
            self.zero()
            for number in inits:
                self.add(number)

        def zero(self):
            self.n = self.mu = self.m2 = 0.0

        def sd(self):
            # Fewer than two samples have no spread; report 0 rather
            # than leaking the mean into the variance-based scores.
            if self.n < 2:
                return 0.0
            # max() guards against a slightly negative m2 caused by
            # floating point error after repeated removals.
            return (max(0, self.m2) * 1.0 / (self.n - 1)) ** 0.5

        def add(self, x):
            self.n += 1
            delta = x - self.mu
            self.mu += delta / (1.0 * self.n)
            self.m2 += delta * (x - self.mu)

        def sub(self, x):
            # Removing the last remaining sample simply resets the counts.
            if self.n < 2:
                self.zero()
                return
            self.n -= 1
            delta = x - self.mu
            self.mu -= delta / (1.0 * self.n)
            self.m2 -= delta * (x - self.mu)

    # ----------------------------------------------
    def divide(this, small):
        """Find the best cut index of 'this'; cut is None if none helps."""
        lhs, rhs = Counts(), Counts(num2(x) for x in this)
        n0, least, cut = 1.0 * rhs.n, rhs.sd(), None
        for j, x in enumerate(this):
            # Here lhs summarises this[:j] and rhs summarises this[j:].
            if lhs.n > tiny and rhs.n > tiny:
                maybe = lhs.n / n0 * lhs.sd() + rhs.n / n0 * rhs.sd()
                if maybe < least and abs(lhs.mu - rhs.mu) >= small:
                    cut, least = j, maybe
            # Move item j from the right-hand side to the left-hand side.
            y = num2(x)
            rhs.sub(y)
            lhs.add(y)
        return cut, least

    # ----------------------------------------------
    def recurse(this, small, cuts):
        cut, sd = divide(this, small)
        # A cut of 0 would leave an empty left side, so it is treated
        # the same as "no cut" (None).
        if cut:
            recurse(this[:cut], small, cuts)
            recurse(this[cut:], small, cuts)
        else:
            cuts.append((sd * len(this) / len(lst), this))
        return cuts

    # ---| main |-----------------------------------
    if not lst:
        return []
    small = Counts(num2(x) for x in lst).sd() * cohen
    return recurse(sorted(lst, key=num1), small, [])

def fss(d=None, want=0.25):
    """Keep the best-scoring fraction of feature columns; flatten the rest.

    Each column ``0 .. d.sfem - 1`` is scored by running ``sdiv`` over
    ``d.projects`` with that column as ``num1`` and column ``d.effort``
    as ``num2``, then summing the scores of the resulting chunks.
    Columns are ranked by ascending score; the first ``want`` fraction
    is kept and every other column is overwritten with the constant 3
    in every project.

    Args:
        d: data set exposing ``projects`` (mutable rows), ``sfem`` (the
            number of columns to rank) and ``effort`` (index of the
            target column).  When omitted, a fresh data set is built
            with ``newCIIdataDeTune()`` on each call, so repeated
            default calls no longer share (and re-mutate) one object.
        want: fraction of the ranked columns to keep.

    Returns:
        ``d`` itself, with ``d.projects`` modified in place.
    """
    if d is None:
        d = newCIIdataDeTune()
    rank = []
    for i in range(d.sfem):
        # `or []` tolerates an sdiv that yields nothing for empty data.
        xs = sdiv(d.projects,
                  num1=lambda x, col=i: x[col],
                  num2=lambda x: x[d.effort]) or []
        xpect = sum(score for score, _ in xs)
        rank.append((xpect, i))
    rank.sort()
    keep = int(len(rank) * want)
    # Must be a real list: it is re-iterated once per project, and a
    # lazy map() would be exhausted after the first project.
    doomed = [col for _, col in rank[keep:]]
    for project in d.projects:
        for col in doomed:
            # NOTE(review): 3 is presumably a neutral/nominal rating
            # for these columns -- confirm against the data set.
            project[col] = 3
    return d