tskit-dev / tutorials

A set of tutorials for msprime and tskit.
Creative Commons Attribution 4.0 International
19 stars 15 forks source link

Cross coalescence rates: advanced analysis tutorial. #277

Open hyanwong opened 4 months ago

hyanwong commented 4 months ago

@nspope sent me this, which is a good core for a new tutorial on cross coalescence rates. Probably worth waiting until some of the core functionality (e.g. rates_from_ecdf) is implemented in tskit.

import stdpopsim
import numpy as np
import matplotlib.pyplot as plt

def rates_from_ecdf(weights, atoms, quantiles):
    """
    Estimate rates from weighted time-to-event data
    """
    assert weights.size == atoms.size
    assert quantiles.size > 2
    assert quantiles[0] == 0.0 and quantiles[-1] == 1.0

    # sort and filter inputs
    time_order = np.argsort(atoms)
    weights = weights[time_order]
    atoms = atoms[time_order]
    nonzero = weights > 0
    atoms = atoms[nonzero]
    weights = weights[nonzero]

    # find interior quantiles
    weights = np.append(0, weights)
    atoms = np.append(0, atoms)
    ecdf = np.cumsum(weights)
    indices = np.searchsorted(ecdf, ecdf[-1] * quantiles[1:-1], side='left')
    lower, upper = atoms[indices - 1], atoms[indices]
    ecdfl, ecdfu = ecdf[indices - 1] / ecdf[-1], ecdf[indices] / ecdf[-1]

    # interpolate ECDF
    assert np.all(ecdfu - ecdfl > 0)
    slope = (upper - lower) / (ecdfu - ecdfl)
    breaks = np.append(0, lower + slope * (quantiles[1:-1] - ecdfl))

    # calculate coalescence rates within intervals
    # this uses a Kaplan-Meier-type censored estimator: https://github.com/tskit-dev/tskit/pull/2119
    coalrate = np.full(quantiles.size - 1, np.nan)
    propcoal = np.diff(quantiles[:-1]) / (1 - quantiles[:-2])
    coalrate[:-1] = -np.log(1 - propcoal) / np.diff(breaks)
    last = indices[-1]
    coalrate[-1] = np.sum(weights[last:]) / np.dot(atoms[last:] - breaks[-1], weights[last:]) 

    return coalrate, breaks

# --- on data

import stdpopsim
engine = stdpopsim.get_engine("msprime")
homsap = stdpopsim.get_species("HomSap")
demogr = homsap.get_demographic_model("OutOfAfrica_4J17")
contig = homsap.get_contig("chr17", genetic_map='HapMapII_GRCh38', right=1e7)
sample = {"CEU" : 100, "YRI" : 100, "JPT" : 100, "CHB" : 100}
ts = engine.simulate(demogr, contig, sample, seed=1024).trim()

pop_idx = {p.metadata['name'] : i for i, p in enumerate(ts.populations())}
sample_sets = [
    np.flatnonzero(ts.nodes_population[:ts.num_samples] == i) 
    for i in range(ts.num_populations)
]
quantiles = np.linspace(0.0, 1.0, 25)
indexes = [(pop_idx["CEU"], pop_idx["CHB"])] # CEU vs CHB
node_coal_weight = ts.pair_coalescence_counts(sample_sets, indexes=indexes).flatten()
rates, breaks = rates_from_ecdf(node_coal_weight, ts.nodes_time, quantiles)

# sanity check: mean of cross-coalescence time distribution should equal branch-mode divergence
print(
    "Mean of cross-coalescence time distr: ",
    2 * node_coal_weight @ ts.nodes_time / np.sum(node_coal_weight),
    "Branch-mode divergence: ",
    ts.divergence(sample_sets, indexes=indexes, mode='branch'),
)

# sanity check: compare estimated rates to expectation under model
theory_rates = demogr.model.debug().coalescence_rate_trajectory(breaks, {"CEU":1, "CHB":1})[0]
plt.clf()
plt.step(breaks, rates, color='firebrick', label='estimated')
plt.step(breaks, theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check.png')

# as an aside: rates are a 1-to-1 transformation of the pair coalescence time distribution. We can plot this distribution using e.g. a weighted density estimate or histogram.
hist_density, hist_breaks = np.histogram(
    np.log10(ts.nodes_time[ts.num_samples:]), 
    weights=node_coal_weight[ts.num_samples:],
    bins=25,
    density=True,
)
plt.clf()
plt.fill_between(hist_breaks[:-1], hist_density, step='pre', alpha=0.5)
plt.xlabel('log10 generations in past')
plt.ylabel('Density')
plt.savefig('rate-hist.png')

# this also gives a very easy way to get coalescence rates if we have a fixed timegrid
# (but, rates will be noisier in bins with few nodes, and have to watch for nan)
hist_density, hist_breaks = np.histogram(
    ts.nodes_time, # note natural time scale
    weights=node_coal_weight,
    bins=np.logspace(2, 5, 25),
    density=False,
)
hist_density /= hist_density.sum()
hist_rates = -np.log(1 - hist_density / (1 - np.cumsum(hist_density))) / np.diff(hist_breaks)
hist_theory_rates = demogr.model.debug().coalescence_rate_trajectory(hist_breaks, {"CEU":1, "CHB":1})[0]
hist_theory_rates = hist_theory_rates[1:] # remove [0, first break]

plt.clf()
plt.step(hist_breaks[:-1], hist_rates, color='firebrick', label='estimated')
plt.step(hist_breaks[:-1], hist_theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check-2.png')
nspope commented 4 months ago

just a note that

print(
    "Mean of cross-coalescence time distr: ",
    2 * node_coal_weight @ ts.nodes_time / np.sum(node_coal_weight),
    "Branch-mode divergence: ",
    ts.divergence(sample_sets, indexes=indexes, mode='branch'),
)

will only match if the tree sequence is trimmed first (which it isn't in this example)