Closed abearab closed 1 year ago
This is similar to the pseudogene-generation code in ScreenProcessing's `process_experiments.py` (lines 206–251): https://github.com/mhorlbeck/ScreenProcessing/blob/0ee5192ecc17348665bd1387ddfa9037efb7964f/process_experiments.py#L206C1-L251C90
# Build "pseudogenes" by drawing random rows of the negative-control table.
# np.random.randint samples indices independently, so the same negative-control
# row may be drawn more than once. The whole step is skipped when the setting
# is exactly 'off' or when no analyses are configured.
if exptParameters['generate_pseudogene_dist'] != 'off' and len(exptParameters['analyses']) > 0:
    print('Generating a pseudogene distribution from negative controls')
    sys.stdout.flush()

    # Accumulators: one value frame and one library-annotation frame are
    # appended per sampled group.
    pseudoTableList = []
    pseudoLibTables = []
    negValues = negTable.values
    negColumns = negTable.columns

    # Mode comparison is case-insensitive; the upper bound for sampling is
    # the number of negative-control rows.
    distMode = exptParameters['generate_pseudogene_dist'].lower()
    numNegatives = len(negTable)

    if distMode == 'manual':
        # Fixed number of pseudogenes, each with a fixed number of rows.
        for pseudogene in range(exptParameters['num_pseudogenes']):
            groupSize = exptParameters['pseudogene_size']
            sampledRows = np.random.randint(0, numNegatives, groupSize)

            rowLabels = ['pseudo_%d_%d' % (pseudogene, i)
                         for i in range(groupSize)]
            # distinct sequence labels so pseudogenes aren't treated as
            # duplicates
            seqLabels = ['seq_%d_%d' % (pseudogene, i)
                         for i in range(groupSize)]

            sampledValues = negValues[sampledRows, :]
            pseudoTableList.append(
                pd.DataFrame(sampledValues, index=rowLabels, columns=negColumns))

            libColumns = {
                'gene': ['pseudo_%d' % pseudogene] * groupSize,
                'transcripts': ['na'] * groupSize,
                'sequence': seqLabels,
            }
            pseudoLibTables.append(pd.DataFrame(libColumns, index=rowLabels))

    elif distMode == 'auto':
        # Mirror the library: one pseudo group per (gene, transcript) group,
        # with as many rows as that transcript group has.
        dedupedLibrary = libraryTable[sublibColumn].drop_duplicates(
            ['gene', 'sequence'])

        for geneIdx, (geneName, geneRows) in enumerate(dedupedLibrary.groupby('gene')):
            # The enumerate counter still advances for the skipped group, so
            # pseudogene numbering has a gap at the negative-control position.
            if geneName == 'negative_control':
                continue

            for transcriptIdx, (_, transcriptRows) in enumerate(geneRows.groupby('transcripts')):
                groupSize = len(transcriptRows)
                sampledRows = np.random.randint(0, numNegatives, groupSize)

                rowLabels = ['pseudo_%d_%d_%d' % (geneIdx, transcriptIdx, i)
                             for i in range(groupSize)]
                seqLabels = ['seq_%d_%d_%d' % (geneIdx, transcriptIdx, i)
                             for i in range(groupSize)]

                sampledValues = negValues[sampledRows, :]
                pseudoTableList.append(
                    pd.DataFrame(sampledValues, index=rowLabels, columns=negColumns))

                libColumns = {
                    'gene': ['pseudo_%d' % geneIdx] * groupSize,
                    'transcripts': ['pseudo_transcript_%d' % transcriptIdx] * groupSize,
                    'sequence': seqLabels,
                }
                pseudoLibTables.append(pd.DataFrame(libColumns, index=rowLabels))

    else:
        # Unknown mode: only a message is printed here; both accumulators
        # stay empty.
        print('generate_pseudogene_dist parameter not recognized, defaulting to off')
similar to this https://github.com/mhorlbeck/ScreenProcessing/blob/0ee5192ecc17348665bd1387ddfa9037efb7964f/process_experiments.py#L206C1-L251C90