Closed github-actions[bot] closed 3 years ago
Add the MT clone map so it can contain multiple mutants in lineages
https://github.com/LewisLabUCSD/Mito_Trace/blob/ed20573388eb857887a8c9e9a6c539936f9a3f45/src/simulations/simulation_pymc.py#L111
# from numpy import random # import os # import pandas as pd # import pickle # from src.simulations.utils.config import read_config_file, write_config_file # from src.simulations.utils.config import check_required import pymc3 as pm import matplotlib.pyplot as plt num_cells = 10000 num_mt_positions = 10 clone_dist = [0.10,0.01,.89] hets = [0.2,0.3] # len(hets) == len(clone_dist)-1 avg_cov = 50 het_err_rate = 0.1 df = np.concatenate((np.random.binomial(10,0.3,(100,4)), np.random.binomial(10,0.6,(90,4)))) clone_id = np.concatenate((np.zeros([100,]), np.ones([90,]))).astype(int) mt_id = [0,1,2,3] with pm.Model() as model: clone_ids = pm.Mulinomial(10000, clone_dist) beta = pm.Beta('beta', alpha=2,beta=2, shape=2) #p = pm.Bernoulli('p', 1, beta, shape=2) #p = pm.Binomial('p', 1, beta) #q = pm.Binomial('q', 1, beta) s = pm.Binomial('s', 10, beta[clone_id], observed=df) #s = pm.Binomial('s', 10, p, observed=df[:30,0]) #t = pm.Binomial('t', 10, q, observed=df[30:, 0]) #s = pm.Binomial('s', 10, beta, shape=(30,4), observed=df[:30]) #t = pm.Binomial('t', 10, beta, shape=(25, 4), observed=df[30:]) #vec = pm.math.concatenate((s, t), axis=0) # data = pm.Data("data", df) # u = pm.Normal('u', vec, observed=data) #u = pm.Deterministic('u', vec) trace = pm.sample(draws=8000, init='adapt_diag') print(pm.summary(trace)) dot = pm.model_to_graphviz(model) dot.render('simulation_pymc.gv') pm.plot_trace(trace) plt.savefig('simulation_trace.png') print('here') # # with pm.Model() as model: # clone_counts = pm.Multinomial(num_cells, clone_dist) # num_clones = len(clone_counts) - 1 # # clone_cell = -1 * np.ones(shape=[num_cells, ]) # # clone_cell[:clone_counts[0]] = 0 # # for ind, val in enumerate(clone_counts[1:]): # start = clone_counts[:ind + 1].sum() # end = clone_counts[:ind + 1].sum() + val # # starts at sum(clone_counts[i-1]) ends at clone_counts[i].sum() # clone_cell[start:end] = ind + 1 # # c = pm.Poisson('cov', avg_cov, shape=[num_cells, num_mt_positions]) # # clone_mt_dict = dict() # for i in range(1, num_clones + 1): # clone_mt_dict[i] = i # # cell_af = np.zeros([num_cells, num_mt_positions]) # for ind in range(num_clones): # # Generate AF: (clone_df == ind).sum() # n_dom_cells = clone_counts[ind] # het = hets[ind] # # curr_mt = clone_mt_dict[ind] # # af_i = pm.Binomial('af', avg_cov, het, shape=n_dom_cells) # af_j = pm.Binomial('het af', avg_cov, het_err_rate, shape=num_cells - n_dom_cells) # / c # # # # Update the dom_cells and non_dom for the current MT # cell_af[np.flatnonzero(clone_df == ind), curr_mt] = af_i # cell_af[np.flatnonzero(clone_df != ind), curr_mt] = af_j # # cell_af = pm.Deterministic(y) # # pm.model_to_graphviz(model) # # # def init_cell_af(self): # """1C. Initialize the cell-by-mtPos af dataframe. Unless a clone:mt dict was # provided, the first N MT positions will be the clone AFs. Creates # self.clone_mt_dict and self.cell_af # """ # clone_df = self.clone_cell # # Output # cell_af = pd.DataFrame(np.zeros(shape=[n_cells, n_mt])) # # # Each clone points to a mt position # self.clone_mt_dict = dict() # for i in range(1, num_clones + 1): # self.clone_mt_dict[i] = i # # # TODO Add the MT clone map so it can contain multiple mutants in lineages # # # If there is a heteroplasmy table in params, it is list of mutant heteroplasmy AFs. # # If not, will randomly draw based on number of clones # if type(hets) == list: # assert (len(hets) == num_clones) # # ## Loop through each clone, # ## Generate the AF for the clone and non-clones using coverage for each cell # ## Fill in cell_by_af for that position. # for ind in range(1, num_clones + 1): # # Generate AF: (clone_df == ind).sum() # n_dom_cells = (clone_df == ind).sum() # het = hets[ind - 1] # # curr_mt = self.clone_mt_dict[ind] # # if p['coverage']['type'] == 'constant': # c = p['coverage']['cov_constant'] # # af_i = random.binomial(c, het, n_dom_cells) / c # af_j = random.binomial(c, q, n_cells - n_dom_cells) / c # # # Update the dom_cells and non_dom for the current MT # cell_af.loc[ # np.flatnonzero(clone_df == ind), curr_mt] = af_i # cell_af.loc[ # np.flatnonzero(clone_df != ind), curr_mt] = af_j # # # Each cell and position has it's own coverage value, so need to update each # else: # c = self.cells_mt_coverage # # Get the cells coverage for the mt position # curr_mt_cov = c[:, curr_mt] # # # Get cell indicies for the clones and nonclones # curr_clone_inds = np.flatnonzero(clone_df == ind) # curr_nonclone_inds = np.flatnonzero(clone_df != ind) # for cell in curr_clone_inds: # # Get one value for curr_mt and cell based on coverage # cell_af.loc[cell, curr_mt] = random.binomial( # curr_mt_cov[cell], het) # for cell in curr_nonclone_inds: # cell_af.loc[cell, curr_mt] = random.binomial( # curr_mt_cov[cell], # q) # Loop through each coverage # for c in n_dom_cells: # # # class Simulation: # """Lineage tracing simulation of one sample # # Will initialize cells based on their parameters and grow as well. This # should be a flexible framework, to add different ways to initialize, grow, # and metrics to have. Additionally can cluster these results. # # :ivar params # :type params: dict # """ # # def __init__(self, params_f): # """ # :param params_f: Parameter yaml file for the specifications # :type params_f: yaml file or dict # """ # if isinstance(params_f, str): # params = read_config_file(params_f) # else: # params = params_f # # self.params = params # check_required(params, ['initialize', 'num_cells', 'num_mt_positions', 'prefix']) # self.prefix = params['prefix'] # self.num_mt_positions = params['num_mt_positions'] # self.num_cells = params['num_cells'] # if not os.path.exists(params['local_outdir']): # os.mkdir(params['local_outdir']) # # # def initialize(self): # """ (1) Pre-growth cell population is instantiated. # # Creates a clone-MT dictionary, cell coverage matrix # (or an int, depending on parameters), and cell-AF matrix. # :return: # """ # self.init_clone_dict() # self.init_cell_coverage() # self.init_cell_af() # #self.init_clone_mt() # # #should be external method # def grow(self): # """ (2) Growth of cells is run.""" # p = self.params # type = p["growth"]["type"] # if type == "poisson": # self.grow_poisson(p['growth']['poisson']) # elif type == "binomial": # self.grow_binomial(p['growth']['binomial']) # return # # # Static Method # @staticmethod # def clone_counts_to_cell_series(clone_counts): # """ Generates new cell IDs based on cluster count iterable # :param clone_counts: Each i'th element is the number of cells in # cluster i. # :type clone_counts: iterable # :return each index name is a cell ID and each value is which cluster # the cell belongs too. # :rtype pd.Series # """ # clone_counts = np.array(clone_counts) # num_cells = clone_counts.sum() # clone_cell = -1 * np.ones(shape=[num_cells, ]) # # clone_cell[:clone_counts[0]] = 0 # for ind, val in enumerate(clone_counts[1:]): # start = clone_counts[:ind + 1].sum() # end = clone_counts[:ind + 1].sum() + val # # starts at sum(clone_counts[i-1]) ends at clone_counts[i].sum() # clone_cell[start:end] = ind + 1 # # clone_cell = pd.Series(clone_cell, dtype=int) # return clone_cell # # def init_clone_dict(self): # """1A # """ # # ### Add in potential to overwrite the values # # Gets the clone dictionary. Should also have clone to mt dict. # clones = self.params['initialize']['clone_sizes'] # num_cells = self.num_cells # # # Option 1: List of fraction of size of each clone. 0s are nonclone size, listed first # if type(clones) == list: # #clone_cell = pd.Series(index=range(num_cells)) # clone_counts = np.random.multinomial(num_cells, clones) # clone_cell = self.clone_counts_to_cell_series(clone_counts) # self.clone_cell = clone_cell # # Option 2: 1 clone. ID'd as 1 # elif type(clones) == int: #One number for dominant clone. the others are not. # clone_cell = np.zeros(shape=[num_cells,]) # clone_cell[:num_cells] = 1 # clone_cell = clone_cell[::-1] # clone_cell = pd.Series(clone_cell, dtype=int) # self.clone_cell = clone_cell # # # Option 3 To ADD, beta binomial and more complex distributions # # self.num_clones = len(set(clone_cell.values))-1 # Remove the non-clone # return clone_cell # # # def init_cell_coverage(self): # """1B # # There are different modes to the coverage, either a constant or # through a distribution. # """ # p = self.params['initialize']['coverage'] # type = p['type'] # # num_cells = self.num_cells # num_pos = self.num_mt_positions # c = np.zeros([num_cells, num_pos]) # # if type == 'constant': # c[:, :] = p['cov_constant'] # elif type == "poisson": # # Get the number of coverage per cell based on poisson (should be reads) # mu_cov_per_cell = p['mu_cov_per_cell'] # num_reads_per_cell = random.poisson(lam=mu_cov_per_cell, # size=num_cells) # # # Number of reads at each position, based on the average for each cell # for i in num_cells: # c[i, :] = random.poisson(num_reads_per_cell[i], # size=num_pos) # self.cells_mt_coverage = c # return c # # # # ##### # # TODO # # Add noise to the other non-lineage positions # ##### # self.cell_af = cell_af # return # # # def init_clone_mt(self): # p = self.params # if p["initialize"]['type'] == 'growth': # ## TODO # # Create a phylogeny and then get the averages of the mutants # self.average_clone_mt() # # If not growth, should aready be there. # return # # def average_clone_mt(self): # return # # def extract_clone_cells(self, clone_id): # """ # Args: # clone_id: # """ # ids = np.flatnonzero(self.clone_cell == clone_id) # return ids # # def simulate_expand_cells_af(self, af, growth_inds, sigma): # """Given a cell-by-af vector, expand the AF. # # Expanded AF occurs by duplicating cells that grew based on the # growth_inds vector. It will add standard error to each af based on sigma # :param af: :param growth: Indices of AF to copy :param sigma: Variance # to add to AF of child. :return: # # Args: # af: # growth_inds: # sigma: # """ # # new_af = af.iloc[growth_inds].copy() + random.normal(0, sigma, size=af.iloc[growth_inds].shape) # new_af.index = np.arange(af.index[-1]+1, af.index[-1]+1+new_af.shape[0]) # new_af = pd.concat((af,new_af), axis=0) # #new_af = np.append(af, np.concatenate(new_af)) # return new_af # # def grow_binomial(self, p): # """ (2.1) # Args: # p: # """ # timesteps = p["time_steps"] # rates = p["rates"] # # sigma = self.params['growth']["mutant_af_sigma_noise"] # cell_af = self.cell_af # clone_mt_dict = self.clone_mt_dict # # num_clones = self.num_clones+1 # new_dict = {} # for curr_clone in range(num_clones): # curr_rate = rates[curr_clone] # ids = self.extract_clone_cells(curr_clone) # new_cells = cell_af.loc[ids].copy() # for i in range(timesteps): # # Simulate growth for each clone separately. # growth_inds = np.flatnonzero(random.binomial(1, curr_rate, size=new_cells.shape[0])) # #new_ids = # new_cells = self.simulate_expand_cells_af(new_cells, growth_inds, sigma) # # new_dict[curr_clone] = new_cells # # Create list of cells # # ####TODO # ## new_lineage_mutants chances. This will see if a mutation will change # # # ####TODO # ## Add death + stimulation rate as well as growth # # Save the new cell clones df and cell af # clone_counts = [i.shape[0] for i in new_dict.values()] # self.new_clone_cell = self.clone_counts_to_cell_series(clone_counts) # # self.new_cell_af = pd.DataFrame(new_dict[0]) # for clone in range(1, self.num_clones+1): # self.new_cell_af = pd.concat((self.new_cell_af, new_dict[clone]),axis=0).reset_index(drop=True) # return # # # def grow_poisson(self): # # TODO growth of poisson refactor # return # # # def subsample_new(self, to_delete=False): # """(3) Subsample from new cell population # # :param to_delete: To remove the cells that grew (which takes up # a lot of RAM). # :type to_delete: bool # """ # new_cell_af = self.new_cell_af # p = self.params # if 'sequence_subsample' in p and p['sequence_subsample'] is not None: # self.subsample_new_cell_af = new_cell_af.sample(n=self.params['sequence_subsample']) # else: # self.subsample_new_cell_af = new_cell_af.sample(n=self.num_cells) # # self.subsample_new_clone_cell = self.new_clone_cell.loc[ # self.subsample_new_cell_af.index] # # if to_delete: # self.new_cell_af = None # self.new_clone_cell = None # # # def combine_init_growth(self): # """(4) Add the pre- and post- population of cells into a group. # # :return: # """ # combined_cell_af = self.cell_af.append(self.subsample_new_cell_af).reset_index(drop=True) # combined_clones = pd.concat( # (self.clone_cell, self.subsample_new_clone_cell)).reset_index( # drop=True) # # # combined_befaft = np.concatenate((np.zeros(shape=[self.cell_af.shape[0],]), np.ones(shape=[self.subsample_new_cell_af.shape[0]]))) # combined_meta = pd.DataFrame({"pre_post": combined_befaft, "clone": combined_clones}) # #combined_meta = pd.Series(combined_meta, name='After Growth', dtype=int) # assert(combined_meta.shape[0] == self.cell_af.shape[0]+self.subsample_new_cell_af.shape[0]) # assert (combined_cell_af.shape[0] == self.cell_af.shape[0] + # self.subsample_new_cell_af.shape[0]) # assert(combined_meta.shape[0] == combined_clones.shape[0]) # assert(combined_cell_af.shape[0] == combined_clones.shape[0]) # self.combined_meta = combined_meta # self.combined_clones = combined_clones # self.combined_cell_af = combined_cell_af # return # # def save(self, f_save=None): # """ # Args: # f_save: # """ # if f_save is None: # f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p') # f = open(f_save, 'wb') # pickle.dump(self.__dict__, f, 2) # f.close() # # @staticmethod # def expand_to_mgatk(curr_mt_af,mt_ref): # ref = mt_ref[curr_mt_af.name] # pos = curr_mt_af.name # return pd.DataFrame({"Ref":ref, "Pos":pos, "Val":curr_mt_af}) # # def test_save_to_mgatk_format(self): # df = pd.DataFrame( [[10,0,1,3,5], [3,0,5,5,0], [6,2,1,1,0]] , columns=np.arange(0,5)) # mt_ref_dict = {0: "A", 1: "G", 2: "C", 3: "C", 4: "T"} # mt_ref = pd.DataFrame({"Pos": mt_ref_dict.keys(), "Ref": mt_ref_dict}) # return # # def save_to_mgatk_format(self, mt_ref, out_f): # """Converts into the proper files needed for mgatk. (i.e variant and # coverage files) # # :return: # """ # cell_af = self.subsample_new_cell_af # chars = ["A", "G", "C", "T"] # def alt_generate(x): # curr = chars.copy() # curr.remove(x["Ref"]) # return np.random.choice(curr) # alt_ref = mt_ref.apply(alt_generate, axis=1) # # # First use the AF and choose an alternative allele # df_stack = cell_af.stack().reset_index().rename( # {"level_0": "Cell", "level_1": "MT_pos", 0: "Coverage"}, # axis=1) # df_stack["Nucleotide"] = df_stack["MT_pos"].apply( # lambda x: alt_ref[x]) # # # Add on the reference allele # df_stack_ref = cell_af.stack().reset_index().rename( # {"level_0": "Cell", "level_1": "MT_pos", 0: "Coverage"}, # axis=1) # df_stack_ref["Coverage"] = 1-df_stack_ref["Coverage"] # df_stack["Nucleotide"] = df_stack["MT_pos"].apply( # lambda x: mt_ref[x]) # # df_stack = pd.concat(df_stack, df_stack_ref) # for ind, val in df_stack.groupby("Nucleotide"): # # Drop the 0s # curr = val[val["Coverage"]>0] # # Save file # curr_out_f = out_f + "_" + ind + ".txt" # curr.to_csv(curr_out_f) # # # Save the coverage. # coverage = self.cells_mt_coverage # if type(coverage) != int: # coverage_stack = pd.DataFrame(coverage).stack().reset_index().rename( # {"level_0": "Cell", "level_1": "MT Position", 0: "Coverage"}, # axis=1) # else: # coverage_stack = pd.DataFrame(self.cells_mt_coverage)*np.ones(shape=cell_af.shape).stack().reset_index().rename( # {"level_0": "Cell", "level_1": "MT Position", 0: "Coverage"}, # axis=1) # curr_out_f = out_f + "_" + "coverage.txt" # coverage_stack.to_csv(curr_out_f) # return # # def load(self): # filename = self.params['filename'] # f = open(filename, 'rb') # tmp_dict = pickle.load(f) # f.close() # self.__dict__.update(tmp_dict) # # def compare_before_after(self): # """Creates a df that contains information on the number of cells from # each clone before as well as after. :return: df.at[ind, "Dominant # Before"] = (full_sim.clone_cell == 1).sum() df.at[ind, "Dominant After"] # = (full_sim.subsample_new_clone_cell == 1).sum() # """ # # return # # def cluster_compare_before_after(self): # """Compares the performance of clustering on grouping the same clones # together. :return: # """ # return # # # def main(): # return # if "__name__" == "__main__": # main() No newline at end of file ew file mode 100644 ndex 0000000..b48b215 inary files /dev/null and b/src/simulations/simulation_trace.png differ eleted file mode 100644 ndex fba6e66..0000000 ++ /dev/null
9e6b706ec5bc1e7d33e9ac1bd2d08ce92e874819
Add the MT clone map so it can contain multiple mutants in lineages
https://github.com/LewisLabUCSD/Mito_Trace/blob/ed20573388eb857887a8c9e9a6c539936f9a3f45/src/simulations/simulation_pymc.py#L111
9e6b706ec5bc1e7d33e9ac1bd2d08ce92e874819