timm / luamine

data mining and optimization
https://github.com/timm/luamine/blob/master/README.md
Other
2 stars 1 forks source link

Python version of bins.lua #1

Open ginfung opened 8 years ago

ginfung commented 8 years ago
from __future__ import division
from copy import deepcopy
from collections import namedtuple

class ListStat(object):
    def __init__(self):
        self.mu = 0
        self.n = 0
        self.m2 = 0

    def __repr__(self):
        return "mu = %f, n = %d, m2 = %f" % (self.mu, self.n, self.m2)

def num1(z, stat):
    stat.n += 1
    delta = z - stat.mu
    stat.mu += delta / stat.n
    stat.m2 += delta * (z - stat.mu)

    if abs(stat.m2) < 1e-4:
        stat.m2 = 0

    return stat

def unnum(z, stat):  # opposite to num1
    if stat.n == 1:
        stat.n, stat.mu, stat.m2 = 0, 0, 0
        return stat

    stat.n -= 1
    delta = z - stat.mu
    stat.mu -= delta / stat.n
    stat.m2 -= delta * (z - stat.mu)

    if abs(stat.m2) < 1e-4:
        stat.m2 = 0

    return stat

def num0(l=[]):
    stat = ListStat()
    for element in l:
        stat = num1(element, stat)
    return stat

def sd(stat):  # get standard deviation from ListStat
    return (stat.m2 / (stat.n - 1)) ** 0.5 if stat.n > 1 else 0

BinInfo = namedtuple("BinInfo", "enough cohen maxBins minBin small verbose trivial")
Range = namedtuple("Range", "lo also n up")
ranges = []

def bins1(i, nums, all, lvl):
    if i.verbose:
        print "|.."*lvl, str(nums)

    cut = -1
    n = len(nums)
    start, stop = nums[0], nums[-1]

    if stop - start >= i.small:
        lhs, rhs = num0(), deepcopy(all)
        score, score1 = sd(rhs), None
        for j, new in enumerate(nums):
            num1(new, lhs)
            unnum(new, rhs)
            score1 = (lhs.n * sd(lhs) + rhs.n * sd(rhs)) / n
            if new != nums[min(j+1, n-1)] and lhs.n >= i.enough and rhs.n >= i.enough and \
                    new - start >= i.small and score1 * i.trivial < score:
                cut, score, lo, hi = j+1, score1, deepcopy(lhs), deepcopy(rhs)

        if cut != -1:
            bins1(i, nums[:cut], lo, lvl+1)
            bins1(i, nums[cut:], hi, lvl+1)
        else:  # we've found a leaf range
            global ranges
            ranges.append(Range(lo=start, also=None, n=len(nums), up=stop))

def bins(t, enough=None, cohen=0.2, maxBins=16, minBin=4, small=None, verbose=False, trivial=1.05):
    nums = sorted(t)
    all = num0(t)
    enough = enough or max(minBin, all.n/maxBins)
    small = small or sd(all) * cohen
    i = BinInfo(enough, cohen, maxBins, minBin, small, verbose, trivial)
    global ranges
    ranges = []
    bins1(i, nums, all, 1)
    return ranges
timm commented 8 years ago

cool

now see learn association rules from the unprivatized data and report thair confindence and support in the unprivitized data and in the privitized data

ginfung commented 8 years ago

@timm It seems that ranges does not include the maximum number in t sometimes.

that problem is probably symmetric (occurs at either end)

so should i change the code such that, at end, range[1].lo is set to min and range[#range].up is set to max?