biocore / gneiss

compositional data analysis toolbox
https://biocore.github.io/gneiss/
BSD 3-Clause "New" or "Revised" License
55 stars 28 forks source link

Proportionality test #285

Open mortonjt opened 3 years ago

mortonjt commented 3 years ago

We will want to add some support for proportionality

mortonjt commented 3 years ago

I already have both methods implemented, see code snippet below -- it'll be just a matter of refactoring and merging in.

from scipy import stats
from skbio.stats.composition import ilr
from statsmodels.regression.linear_model import OLS

def regression_test(Bxy, Bxyr, iAr):
    # Bxy  : n x 1
    # Bxyr : n x 1
    # Bxy  : n x r
    intr = np.ones((len(Bxy), 1))
    Bxy = Bxy.reshape(-1, 1)
    Bxyr = Bxyr.reshape(-1, 1)
    X = np.hstack((intr, Bxyr, iAr))
    Y = Bxy.reshape(-1, 1)
    return OLS(Y.ravel(), X)

def proportionality(table, pairs, references):
    """ Adapted from `Linear Association in Compositional Data Analysis`

    This is the Regression test from 5.2.

    Parameters
    ----------
    table : biom.Table
        Table of abundances
    pairs : list of tuple of str
        List of pairs to test
    references : list of str
        Reference Features

    Return
    ------
    tests : list of OLS results
    """
    # Extract reference features
    Ar = np.vstack([table.data(r, axis='observation') for r in references]).T
    Ar[Ar==0] = 0.65
    gAr = np.mean(np.log(Ar), axis=1)
    lrA = ilr(Ar)
    r, s = 2, len(references)
    results = []
    for i, (x, y) in enumerate(pairs):
        print(i)
        Ax, Ay = table.data(x, axis='observation'), table.data(y, axis='observation')
        idx = np.logical_and(Ax > 0, Ay > 0)
        Ax, Ay = np.log(Ax[idx]), np.log(Ay[idx])
        n = np.sum(idx)
        df = n - 2
        # Compute balances 
        Bxyr = np.sqrt( r * s / (r + s)) * (0.5 * (Ax + Ay) - gAr[idx])
        Bxy = Ax - Ay
        try:
            res = regression_test(Bxy, Bxyr, lrA[idx])
            fit = res.fit()
            summ = fit.summary()
            fval = float(summ.tables[0].data[2][3])
            pval = float(summ.tables[0].data[3][3])
            results.append((x, y, fval, pval))
        except:
            continue
    return results