from numpy import random
import os
import pandas as pd
from tqdm import tqdm
#from src.config import ROOT_DIR
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import glob
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)
from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
from mplh import cluster_help as ch
from src.simulations.utils.config import read_config_file, write_config_file
from dynamicTreeCut import cutreeHybrid
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage
from sklearn.model_selection import ParameterGrid
from src.simulations.utils.config import check_required
class Simulation:
"""
Lineage tracing simulation. Will initialize cells based on
their parameters and grow as well. This should be a flexible
framework, to add different ways to initialize, grow, and metrics to
have. Additionally can cluster these results.
"""
def __init__(self, params_f):
if isinstance(params_f, str):
params = read_config_file(params_f)
else:
params = params_f
self.params = params
check_required(params, ['initialize', 'num_cells', 'num_mt_positions', 'prefix'])
self.prefix = params['prefix']
self.num_mt_positions = params['num_mt_positions']
self.num_cells = params['num_cells']
if not os.path.exists(params['local_outdir']):
os.mkdir(params['local_outdir'])
def initialize(self):
self.init_clone_dict()
self.init_cell_coverage()
self.init_cell_af()
#self.init_clone_mt()
#should be external method
def grow(self):
p = self.params
type = p["growth"]["type"]
if type == "poisson":
self.grow_poisson(p['growth']['poisson'])
elif type == "binomial":
self.grow_binomial(p['growth']['binomial'])
return
# Static Method
@staticmethod
def clone_counts_to_cell_series(clone_counts):
clone_counts = np.array(clone_counts)
num_cells = clone_counts.sum()
clone_cell = -1 * np.ones(shape=[num_cells, ])
clone_cell[:clone_counts[0]] = 0
for ind, val in enumerate(clone_counts[1:]):
start = clone_counts[:ind + 1].sum()
end = clone_counts[:ind + 1].sum() + val
# starts at sum(clone_counts[i-1]) ends at clone_counts[i].sum()
clone_cell[start:end] = ind + 1
clone_cell = pd.Series(clone_cell, dtype=int)
return clone_cell
def init_clone_dict(self):
### Add in potential to overwrite the values
# Gets the clone dictionary. Should also have clone to mt dict.
clones = self.params['initialize']['clone_sizes']
num_cells = self.num_cells
# Option 1: List of fraction of size of each clone. 0s are nonclone size, listed first
if type(clones) == list:
#clone_cell = pd.Series(index=range(num_cells))
clone_counts = np.random.multinomial(num_cells, clones)
clone_cell = self.clone_counts_to_cell_series(clone_counts)
self.clone_cell = clone_cell
# Option 2: 1 clone. ID'd as 1
elif type(clones) == int: #One number for dominant clone. the others are not.
clone_cell = np.zeros(shape=[num_cells,])
clone_cell[:num_cells] = 1
clone_cell = clone_cell[::-1]
clone_cell = pd.Series(clone_cell, dtype=int)
self.clone_cell = clone_cell
# Option 3 To ADD, beta binomial and more complex distributions
self.num_clones = len(set(clone_cell.values))-1 # Remove the non-clone
return clone_cell
def init_cell_coverage(self):
"""
There are different modes to the coverage, either a constant or through a distribution.
:return:
"""
p = self.params['initialize']['coverage']
type = p['type']
num_cells = self.num_cells
num_pos = self.num_mt_positions
c = np.zeros([num_cells, num_pos])
if type == 'constant':
c[:, :] = p['cov_constant']
elif type == "poisson":
# Get the number of coverage per cell based on poisson (should be reads)
mu_cov_per_cell = p['mu_cov_per_cell']
num_reads_per_cell = random.poisson(lam=mu_cov_per_cell,
size=num_cells)
# Number of reads at each position, based on the average for each cell
for i in num_cells:
c[i, :] = random.poisson(num_reads_per_cell[i],
size=num_pos)
self.cells_mt_coverage = c
return c
def init_cell_af(self):
"""
Initialize the cell-by-mtPos af dataframe. Unless a clone:mt dict was provided,
the first N MT positions will be the clone AFs.
Creates self.clone_mt_dict and self.cell_af"""
p = self.params['initialize']
hets = self.params['het']
q = self.params['het_err_rate']
clone_df = self.clone_cell
num_clones = self.num_clones
n_cells = self.num_cells
n_mt = self.num_mt_positions
# Output
cell_af = pd.DataFrame(np.zeros(shape=[n_cells, n_mt]))
if 'mt_clone_map' in p and p['mt_clone_map'] is not None:
self.clone_mt_dict = p['mt_clone_map']
else:
# Each clone points to a mt position
self.clone_mt_dict = dict()
for i in range(1,num_clones+1):
self.clone_mt_dict[i] = i
# TODO Add the MT clone map so it can contain multiple mutants in lineages
# If there is a heteroplasmy table in params, it is list of mutant heteroplasmy AFs.
# If not, will randomly draw based on number of clones
if type(hets) == list:
if (len(hets) != num_clones):
print('here')
assert(len(hets) == num_clones)
## Loop through each clone,
## Generate the AF for the clone and non-clones using coverage for each cell
## Fill in cell_by_af for that position.
for ind in range(1, num_clones+1):
# Generate AF: (clone_df == ind).sum()
n_dom_cells = (clone_df==ind).sum()
het = hets[ind-1]
curr_mt = self.clone_mt_dict[ind]
if p['coverage']['type'] == 'constant':
c = p['coverage']['cov_constant']
af_i = random.binomial(c, het,
n_dom_cells) / c
af_j = random.binomial(c, q,
n_cells - n_dom_cells) / c
# Update the dom_cells and non_dom for the current MT
cell_af.loc[np.flatnonzero(clone_df == ind), curr_mt] = af_i
cell_af.loc[np.flatnonzero(clone_df != ind), curr_mt] = af_j
# Each cell and position has it's own coverage value, so need to update each
else:
c = self.cells_mt_coverage
#Get the cells coverage for the mt position
curr_mt_cov= c[:, curr_mt]
# Get cell indicies for the clones and nonclones
curr_clone_inds = np.flatnonzero(clone_df==ind)
curr_nonclone_inds = np.flatnonzero(clone_df!=ind)
for cell in curr_clone_inds:
# Get one value for curr_mt and cell based on coverage
cell_af.loc[cell, curr_mt] = random.binomial(curr_mt_cov[cell], het)
for cell in curr_nonclone_inds:
cell_af.loc[cell, curr_mt] = random.binomial(curr_mt_cov[cell], q)
# Loop through each coverage
#for c in n_dom_cells:
#####
# TODO
# Add noise to the other non-lineage positions
#####
self.cell_af = cell_af
return
def init_clone_mt(self):
p = self.params
if p["initialize"]['type'] == 'growth':
## TODO
# Create a phylogeny and then get the averages of the mutants
self.average_clone_mt()
# If not growth, should aready be there.
return
def average_clone_mt(self):
return
def extract_clone_cells(self, clone_id):
ids = np.flatnonzero(self.clone_cell == clone_id)
return ids
def simulate_expand_cells_af(self, af, growth_inds, sigma):
"""
Given a cell-by-af vector, expand the AF.
Expanded AF occurs by duplicating cells that grew based on
the growth_inds vector. It will add standard error to each
af based on sigma
:param af:
:param growth: Indices of AF to copy
:param sigma: Variance to add to AF of child.
:return:
"""
new_af = af.iloc[growth_inds].copy() + random.normal(0, sigma, size=af.iloc[growth_inds].shape)
new_af.index = np.arange(af.index[-1]+1, af.index[-1]+1+new_af.shape[0])
new_af = pd.concat((af,new_af), axis=0)
#new_af = np.append(af, np.concatenate(new_af))
return new_af
def grow_binomial(self, p):
timesteps = p["time_steps"]
rates = p["rates"]
sigma = self.params['growth']["mutant_af_sigma_noise"]
cell_af = self.cell_af
clone_mt_dict = self.clone_mt_dict
num_clones = self.num_clones+1
new_dict = {}
for curr_clone in range(num_clones):
curr_rate = rates[curr_clone]
ids = self.extract_clone_cells(curr_clone)
new_cells = cell_af.loc[ids].copy()
for i in range(timesteps):
# Simulate growth for each clone separately.
growth_inds = np.flatnonzero(random.binomial(1, curr_rate, size=new_cells.shape[0]))
#new_ids =
new_cells = self.simulate_expand_cells_af(new_cells, growth_inds, sigma)
new_dict[curr_clone] = new_cells
# Create list of cells
####TODO
## new_lineage_mutants chances. This will see if a mutation will change
####TODO
## Add death + stimulation rate as well as growth
# Save the new cell clones df and cell af
clone_counts = [i.shape[0] for i in new_dict.values()]
self.new_clone_cell = self.clone_counts_to_cell_series(clone_counts)
self.new_cell_af = pd.DataFrame(new_dict[0])
for clone in range(1, self.num_clones+1):
self.new_cell_af = pd.concat((self.new_cell_af, new_dict[clone]),axis=0).reset_index(drop=True)
return
def grow_poisson(self):
# TODO growth of poisson refactor
return
def subsample_new(self, to_delete=False):
new_cell_af = self.new_cell_af
p = self.params
if 'sequence_subsample' in p and p['sequence_subsample'] is not None:
self.subsample_new_cell_af = new_cell_af.sample(n=self.params['sequence_subsample'])
else:
self.subsample_new_cell_af = new_cell_af.sample(n=self.num_cells)
self.subsample_new_clone_cell = self.new_clone_cell.loc[
self.subsample_new_cell_af.index]
if to_delete:
self.new_cell_af = None
self.new_clone_cell = None
def combine_init_growth(self):
clones = pd.concat(
(self.clone_cell, self.subsample_new_clone_cell)).reset_index(
drop=True)
combined_cell_af = self.cell_af.append(self.subsample_new_cell_af).reset_index(drop=True)
combined_meta = np.concatenate((np.ones(shape=[self.cell_af.shape[0],]), np.zeros(shape=[self.subsample_new_cell_af.shape[0]])))
combined_meta = pd.Series(combined_meta, name='After Growth', dtype=int)
assert(combined_meta.shape[0] == self.cell_af.shape[0]+self.subsample_new_cell_af.shape[0])
assert (combined_cell_af.shape[0] == self.cell_af.shape[0] +
self.subsample_new_cell_af.shape[0])
assert(combined_meta.shape[0] == clones.shape[0])
assert(combined_cell_af.shape[0] == clones.shape[0])
self.combined_meta = combined_meta
self.combined_clones = clones
self.combined_cell_af = combined_cell_af
return
def save(self, f_save=None):
if f_save is None:
f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()
def save_to_mgatk_format(self):
"""
Converts into the proper files needed for mgatk. (i.e variant and coverage files)
:return:
"""
def load(self):
filename = self.params['filename']
f = open(filename, 'rb')
tmp_dict = pickle.load(f)
f.close()
self.__dict__.update(tmp_dict)
def compare_before_after(self):
"""
Creates a df that contains information on
the number of cells from each clone before as well as after.
:return:
df.at[ind, "Dominant Before"] = (full_sim.clone_cell == 1).sum()
df.at[ind, "Dominant After"] = (full_sim.subsample_new_clone_cell == 1).sum()
"""
return
def cluster_compare_before_after(self):
"""
Compares the performance of clustering on grouping the same
clones together.
:return:
"""
return
@staticmethod
def plot_cluster(cell_af, cell_meta=None, mt_meta=None, f_save=None):
ch.plot_cluster(cell_af, row_meta=cell_meta, col_meta=mt_meta,
fsave=f_save, to_col_clust=False, to_z=True)
@staticmethod
def cluster(cell_af):
"""
Dynamic tree clustering of the rows of cell_af
:param cell_af:
:return:
"""
distances = pdist(cell_af, "euclidean")
link = linkage(distances, "average")
clusters = cutreeHybrid(link, distances)['labels']
return clusters
@staticmethod
def cluster_kmeans(cell_af):
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = range(1, 10)
for k in K:
# Building and fitting the model
kmeanModel = KMeans(n_clusters=k).fit(cell_af)
kmeanModel.fit(cell_af)
distortions.append(sum(
np.min(cdist(cell_af, kmeanModel.cluster_centers_, 'euclidean'),
axis=1)) / cell_af.shape[0])
inertias.append(kmeanModel.inertia_)
mapping1[k] = sum(
np.min(cdist(cell_af, kmeanModel.cluster_centers_, 'euclidean'),
axis=1)) / cell_af.shape[0]
mapping2[k] = kmeanModel.inertia_
def main():
return
if "__name__" == "__main__":
main()
No newline at end of file
eleted file mode 100644
ndex e67049d..0000000
++ /dev/null
growth of poisson refactor
https://github.com/LewisLabUCSD/Mito_Trace/blob/1b49ccc3ce9d63a2fe9ce0be3c14d640c867846e/src/simulations/simulation.py#L301
d33839409e2b3ba48b84e057a8abd16fb6ef6f1b