ieaves / tenzing

MIT License
0 stars 0 forks source link

Rethink how we treat inf and nans #19

Closed sbrugman closed 5 years ago

sbrugman commented 5 years ago

Proof of concept (POC), top-down version:

import pandas as pd
import numpy as np

# Sample data covering every special case handled below: two NaNs, a zero,
# three other finite values, and both infinities.
series = pd.Series(
    [
        np.nan,
        0,
        1.0,
        2.0,
        3.0,
        np.nan,
        np.inf,
        -np.inf,
    ]
)

def int_summarize(series, mask):
    """Count the entries of ``series`` selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector applied to ``series``.

    Returns:
        A dict with the single key ``'ints'`` holding the number of
        selected entries.
    """
    selected = series[mask]
    return dict(ints=len(selected))

def nan_summarize(series, mask):
    """Report how many entries of ``series`` are selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector; the caller is expected to have it mark the
            missing entries.

    Returns:
        A dict with the single key ``'nans'`` holding the selected count.
    """
    n_selected = len(series[mask])
    return {'nans': n_selected}

def inf_summarize(series, mask):
    """Report how many entries of ``series`` are selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector; the caller is expected to have it mark the
            infinite entries.

    Returns:
        A dict with the single key ``'infs'`` holding the selected count.
    """
    result = {}
    result['infs'] = len(series[mask])
    return result

def base_summarize(series, mask):
    """Count the records of ``series`` selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector applied to ``series``.

    Returns:
        A dict with the single key ``'n_records'`` holding the number of
        selected records.
    """
    records = series[mask]
    n_records = len(records)
    return {'n_records': n_records}

def unique_summarize(series, mask):
    """Count the distinct values among the entries selected by ``mask``.

    Args:
        series: pandas Series (its ``nunique`` method is used).
        mask: Boolean selector applied as ``series[mask]``.

    Returns:
        A dict with the single key ``'n_unique'`` holding the result of
        ``nunique()`` on the selected entries.
    """
    selected = series[mask]
    return dict(n_unique=selected.nunique())

def zeros_summarize(series, mask):
    """Count the non-missing entries of ``series`` selected by ``mask``.

    Args:
        series: pandas Series (its ``count`` method is used).
        mask: Boolean selector; the caller is expected to have it mark the
            zero-valued entries.

    Returns:
        A dict with the single key ``'n_zeros'`` holding the result of
        ``count()`` on the selected entries.
    """
    selected = series[mask]
    n_counted = selected.count()
    return {'n_zeros': n_counted}

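# Summary steps: each one narrows the incoming mask where needed and records
# its statistic in `summary`.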

def infstuff(series, mask, summary):
    """Record how many masked entries of ``series`` are infinite.

    Args:
        series: Numeric pandas Series being summarized.
        mask: Boolean mask of the rows under consideration. It is left
            unmodified.
        summary: Dict updated in place with the entry produced by
            ``inf_summarize``.
    """
    # Build a new mask instead of using ``&=``: the in-place operator would
    # overwrite the caller's mask object.
    inf_mask = mask & np.isinf(series)
    summary.update(inf_summarize(series, inf_mask))

def nanstuff(series, mask, summary):
    """Record how many masked entries of ``series`` are missing (NaN).

    Args:
        series: pandas Series being summarized.
        mask: Boolean mask of the rows under consideration. It is left
            unmodified.
        summary: Dict updated in place with the entry produced by
            ``nan_summarize``.
    """
    # Build a new mask instead of using ``&=``: the in-place operator would
    # overwrite the caller's mask object.
    nan_mask = mask & series.isna()
    summary.update(nan_summarize(series, nan_mask))

def intstuff(series, mask, summary):
    """Add the ``int_summarize`` result for ``series[mask]`` to ``summary``.

    Args:
        series: Series being summarized.
        mask: Boolean mask passed through unchanged to ``int_summarize``.
        summary: Dict updated in place.
    """
    int_stats = int_summarize(series, mask)
    summary.update(int_stats)

def basestuff(series, mask, summary):
    """Add the ``base_summarize`` result for ``series[mask]`` to ``summary``.

    Args:
        series: Series being summarized.
        mask: Boolean mask passed through unchanged to ``base_summarize``.
        summary: Dict updated in place.
    """
    base_stats = base_summarize(series, mask)
    summary.update(base_stats)

def uniquestuff(series, mask, summary):
    """Add the ``unique_summarize`` result for ``series[mask]`` to ``summary``.

    Args:
        series: Series being summarized.
        mask: Boolean mask passed through unchanged to ``unique_summarize``.
        summary: Dict updated in place.
    """
    unique_stats = unique_summarize(series, mask)
    summary.update(unique_stats)

def zerosstuff(series, mask, summary):
    """Record how many masked entries of ``series`` equal zero.

    Args:
        series: Numeric pandas Series being summarized.
        mask: Boolean mask of the rows under consideration. It is left
            unmodified.
        summary: Dict updated in place with the entry produced by
            ``zeros_summarize``.
    """
    # Build a new mask instead of using ``&=``: the in-place operator would
    # overwrite the caller's mask object.
    zero_mask = mask & (series == 0)
    summary.update(zeros_summarize(series, zero_mask))

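# Filters: each one delegates to the next filter in the chain (ending at
# rootfilter) and returns a boolean mask of the rows it keeps.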

def intfilter(series):
    """Return the mask for the int step.

    Adds no restriction of its own; the result is exactly what
    ``zerosfilter`` returns for ``series``.
    """
    inherited_mask = zerosfilter(series)
    return inherited_mask

def zerosfilter(series):
    """Return the mask for the zeros step.

    Adds no restriction of its own; the result is exactly what
    ``uniquefilter`` returns for ``series``.
    """
    inherited_mask = uniquefilter(series)
    return inherited_mask

def uniquefilter(series):
    """Return the mask for the unique step.

    Adds no restriction of its own; the result is exactly what
    ``inffilter`` returns for ``series``.
    """
    inherited_mask = inffilter(series)
    return inherited_mask

def inffilter(series):
    """Return the ``nanfilter`` mask with infinite entries also excluded.

    Args:
        series: Numeric pandas Series.

    Returns:
        Boolean mask that is True where ``nanfilter`` keeps the row and the
        value is not +/-infinity.
    """
    not_infinite = ~np.isinf(series)
    return nanfilter(series) & not_infinite

def nanfilter(series):
    """Return the ``basefilter`` mask with missing entries excluded.

    Args:
        series: pandas Series.

    Returns:
        Boolean mask that is True where ``basefilter`` keeps the row and the
        value is not missing.
    """
    is_missing = series.isna()
    inherited_mask = basefilter(series)
    return inherited_mask & (~is_missing)

def basefilter(series):
    """Return the mask for the base step.

    Adds no restriction of its own; the result is exactly what
    ``rootfilter`` returns for ``series``.
    """
    inherited_mask = rootfilter(series)
    return inherited_mask

def rootfilter(series):
    """Return an all-True boolean mask with the same shape as ``series``.

    This is the starting point of the filter chain: nothing is excluded.
    """
    keep_everything = np.full_like(series, True, dtype=bool)
    return keep_everything

# top-down
summary = {}

# Each step receives the mask produced by the filter listed next to it.
# The order is kept because it determines the key order of the printed dict.
for _step, _mask_source in (
    (intstuff, zerosfilter),
    (zerosstuff, uniquefilter),
    (uniquestuff, inffilter),
    (infstuff, nanfilter),
    (nanstuff, basefilter),
    (basestuff, rootfilter),
):
    _step(series, _mask_source(series), summary)

print(summary)
sbrugman commented 5 years ago

For reference, the bottom-up version

import pandas as pd
import numpy as np

# Sample data: NaNs at both ends of the finite run (0, 1, 2, 3), followed by
# positive and negative infinity.
_sample_values = [np.nan, 0, 1.0, 2.0, 3.0, np.nan, np.inf, -np.inf]
series = pd.Series(_sample_values)

def int_summarize(series, mask):
    """Count the entries of ``series`` selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector applied to ``series``.

    Returns:
        A dict with the single key ``'ints'`` holding the number of
        selected entries.
    """
    kept = series[mask]
    n_kept = len(kept)
    return {'ints': n_kept}

def nan_summarize(series, mask):
    """Report how many entries of ``series`` are selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector; the caller is expected to have it mark the
            missing entries.

    Returns:
        A dict with the single key ``'nans'`` holding the selected count.
    """
    flagged = series[mask]
    return dict(nans=len(flagged))

def inf_summarize(series, mask):
    """Report how many entries of ``series`` are selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector; the caller is expected to have it mark the
            infinite entries.

    Returns:
        A dict with the single key ``'infs'`` holding the selected count.
    """
    flagged = series[mask]
    return dict(infs=len(flagged))

def base_summarize(series, mask):
    """Count the records of ``series`` selected by ``mask``.

    Args:
        series: Collection supporting boolean-mask selection via
            ``series[mask]`` (e.g. a pandas Series).
        mask: Boolean selector applied to ``series``.

    Returns:
        A dict with the single key ``'n_records'`` holding the number of
        selected records.
    """
    selected = series[mask]
    return dict(n_records=len(selected))

def unique_summarize(series, mask):
    """Count the distinct values among the entries selected by ``mask``.

    Args:
        series: pandas Series (its ``nunique`` method is used).
        mask: Boolean selector applied as ``series[mask]``.

    Returns:
        A dict with the single key ``'n_unique'`` holding the result of
        ``nunique()`` on the selected entries.
    """
    kept = series[mask]
    distinct_count = kept.nunique()
    return {'n_unique': distinct_count}

def zeros_summarize(series, mask):
    """Count the non-missing entries of ``series`` selected by ``mask``.

    Args:
        series: pandas Series (its ``count`` method is used).
        mask: Boolean selector; the caller is expected to have it mark the
            zero-valued entries.

    Returns:
        A dict with the single key ``'n_zeros'`` holding the result of
        ``count()`` on the selected entries.
    """
    kept = series[mask]
    return dict(n_zeros=kept.count())

def infstuff(series, summary):
    """Record the infinite entries of ``series`` and return the rest.

    Args:
        series: Numeric pandas Series.
        summary: Dict updated in place with the entry produced by
            ``inf_summarize``.

    Returns:
        The entries of ``series`` that are not +/-infinity.
    """
    is_infinite = np.isinf(series)
    inf_stats = inf_summarize(series, is_infinite)
    summary.update(inf_stats)
    remaining = series[~is_infinite]
    return remaining

def nanstuff(series, summary):
    """Record the missing entries of ``series`` and return the rest.

    Args:
        series: pandas Series.
        summary: Dict updated in place with the entry produced by
            ``nan_summarize``.

    Returns:
        The entries of ``series`` that are not missing.
    """
    is_missing = series.isna()
    nan_stats = nan_summarize(series, is_missing)
    summary.update(nan_stats)
    remaining = series[~is_missing]
    return remaining

def intstuff(series, summary):
    """Record the number of entries in ``series`` and return it unchanged.

    Args:
        series: pandas Series to summarize.
        summary: Dict updated in place with the entry produced by
            ``int_summarize``.

    Returns:
        ``series``, unmodified.
    """
    # The summarizer selects with ``series[mask]`` and therefore needs a
    # boolean mask. Passing the series itself would make pandas look its
    # float values up as index labels instead of selecting every row, so an
    # explicit all-True mask is used.
    select_all = np.ones(len(series), dtype=bool)
    summary.update(int_summarize(series, select_all))
    return series

def basestuff(series, summary):
    """Record the total number of records in ``series`` and return it unchanged.

    Args:
        series: pandas Series to summarize; may contain NaN and infinite
            values.
        summary: Dict updated in place with the entry produced by
            ``base_summarize``.

    Returns:
        ``series``, unmodified.
    """
    # The summarizer selects with ``series[mask]`` and therefore needs a
    # boolean mask. Passing the series itself would make pandas look its
    # float values up as index labels instead of selecting every row, so an
    # explicit all-True mask is used.
    select_all = np.ones(len(series), dtype=bool)
    summary.update(base_summarize(series, select_all))
    return series

def uniquestuff(series, summary):
    """Record the number of distinct values in ``series`` and return it unchanged.

    Args:
        series: pandas Series to summarize.
        summary: Dict updated in place with the entry produced by
            ``unique_summarize``.

    Returns:
        ``series``, unmodified.
    """
    # The summarizer selects with ``series[mask]`` and therefore needs a
    # boolean mask. Passing the series itself would make pandas look its
    # float values up as index labels instead of selecting every row, so an
    # explicit all-True mask is used.
    select_all = np.ones(len(series), dtype=bool)
    summary.update(unique_summarize(series, select_all))
    return series

def zerosstuff(series, summary):
    """Record the zero-valued entries of ``series`` and return it unchanged.

    Args:
        series: Numeric pandas Series.
        summary: Dict updated in place with the entry produced by
            ``zeros_summarize``.

    Returns:
        ``series``, unmodified (zeros are counted, not removed).
    """
    is_zero = series == 0
    zero_stats = zeros_summarize(series, is_zero)
    summary.update(zero_stats)
    return series

# Bottom-up
summary = {}

# Strip NaNs and then infinities, recording each count along the way.
baseseries = basestuff(series, summary)
nanseries = nanstuff(series, summary)
infseries = infstuff(nanseries, summary)

# The remaining steps all work on the cleaned series and their return values
# are discarded.
for _step in (uniquestuff, zerosstuff, intstuff):
    _ = _step(infseries, summary)

print(summary)