GilbertLabUCSF / ScreenPro2

[This is an archived repository, see the current version in Arc Institute GitHub]
https://github.com/ArcInstitute/ScreenPro2
MIT License
1 stars 1 forks source link

Label `non-targeting` guides to `pseudo_###` through random selection #14

Closed abearab closed 1 year ago

abearab commented 1 year ago

This would be similar to the pseudogene generation in ScreenProcessing's `process_experiments.py`: https://github.com/mhorlbeck/ScreenProcessing/blob/0ee5192ecc17348665bd1387ddfa9037efb7964f/process_experiments.py#L206C1-L251C90

    # Build "pseudogenes" by randomly drawing rows from negTable and giving them
    # synthetic gene/transcript/sequence labels. Runs only when the
    # 'generate_pseudogene_dist' setting is not 'off' and at least one analysis
    # is configured.
    # NOTE(review): negTable is defined outside this excerpt; presumably it holds
    # the phenotype rows of the negative-control guides -- confirm at its
    # construction site.
    # NOTE(review): this outer comparison against 'off' is case-sensitive, while
    # the mode checks below are lower-cased, so a value such as 'OFF' enters this
    # block and falls through to the final else branch.
    if exptParameters['generate_pseudogene_dist'] != 'off' and len(exptParameters['analyses']) > 0:
        print('Generating a pseudogene distribution from negative controls')
        sys.stdout.flush()

        # Accumulators: one sampled-values DataFrame and one matching library
        # (gene / transcripts / sequence) DataFrame per generated pseudogene group.
        pseudoTableList = []
        pseudoLibTables = []
        # Cache the raw value array and column labels once; every sample below
        # indexes into negValues and reuses negColumns.
        negValues = negTable.values
        negColumns = negTable.columns

        # 'manual' mode: a fixed number of pseudogenes ('num_pseudogenes'), each
        # made of 'pseudogene_size' sampled rows.
        if exptParameters['generate_pseudogene_dist'].lower() == 'manual':
            for pseudogene in range(exptParameters['num_pseudogenes']):
                # Row positions are drawn independently from [0, len(negTable)),
                # so the same row may be picked more than once.
                randIndices = np.random.randint(
                    0, len(negTable), exptParameters['pseudogene_size'])
                pseudoTable = negValues[randIndices, :]
                # Synthetic row labels of the form pseudo_<pseudogene>_<i>.
                pseudoIndex = ['pseudo_%d_%d' % (pseudogene, i) for i in range(
                    exptParameters['pseudogene_size'])]
                pseudoSeqs = ['seq_%d_%d' % (pseudogene, i) for i in range(
                    exptParameters['pseudogene_size'])]  # so pseudogenes aren't treated as duplicates
                pseudoTableList.append(pd.DataFrame(
                    pseudoTable, index=pseudoIndex, columns=negColumns))
                # Library rows for this pseudogene: shared gene name 'pseudo_<n>',
                # placeholder transcript 'na', and the synthetic sequences above.
                pseudoLib = pd.DataFrame({'gene': ['pseudo_%d' % pseudogene]*exptParameters['pseudogene_size'],
                                          'transcripts': ['na']*exptParameters['pseudogene_size'],
                                          'sequence': pseudoSeqs}, index=pseudoIndex)
                pseudoLibTables.append(pseudoLib)

        # 'auto' mode: mirror the library's own structure -- one pseudogene per
        # gene group and one pseudo-transcript per transcript group, each with as
        # many sampled rows as that transcript group has rows.
        elif exptParameters['generate_pseudogene_dist'].lower() == 'auto':
            # Rows are de-duplicated on (gene, sequence) before grouping by gene.
            for pseudogene, (gene, group) in enumerate(libraryTable[sublibColumn].drop_duplicates(['gene', 'sequence']).groupby('gene')):
                # No pseudogene is generated for the 'negative_control' group.
                # The enumerate counter still advances for it, so the pseudo_<n>
                # numbering skips that position.
                if gene == 'negative_control':
                    continue
                for transcript, (transcriptName, transcriptGroup) in enumerate(group.groupby('transcripts')):
                    # Same independent row draw as in 'manual' mode, sized to
                    # this transcript group.
                    randIndices = np.random.randint(
                        0, len(negTable), len(transcriptGroup))
                    pseudoTable = negValues[randIndices, :]
                    # Labels of the form pseudo_<pseudogene>_<transcript>_<i>.
                    pseudoIndex = ['pseudo_%d_%d_%d' % (
                        pseudogene, transcript, i) for i in range(len(transcriptGroup))]
                    pseudoSeqs = ['seq_%d_%d_%d' % (
                        pseudogene, transcript, i) for i in range(len(transcriptGroup))]
                    pseudoTableList.append(pd.DataFrame(
                        pseudoTable, index=pseudoIndex, columns=negColumns))
                    pseudoLib = pd.DataFrame({'gene': ['pseudo_%d' % pseudogene]*len(transcriptGroup),
                                              'transcripts': ['pseudo_transcript_%d' % transcript]*len(transcriptGroup),
                                              'sequence': pseudoSeqs}, index=pseudoIndex)
                    pseudoLibTables.append(pseudoLib)

        # Any other setting: only a message is printed and both accumulator
        # lists stay empty.
        # NOTE(review): the code that consumes pseudoTableList / pseudoLibTables
        # is outside this excerpt -- confirm it tolerates empty lists.
        else:
            print('generate_pseudogene_dist parameter not recognized, defaulting to off')