hoelzer-lab / ribap

A comprehensive bacterial core gene-set annotation pipeline based on Roary and pairwise ILPs
GNU General Public License v3.0
19 stars 3 forks source link

add UpSet visualization as a final step #3

Closed hoelzer closed 4 years ago

hoelzer commented 4 years ago
hoelzer commented 4 years ago
#!/usr/bin/env python3
"""Split a RIBAP gene presence/absence table into per-strain cluster lists.

The script reads a comma-separated strain-ID file and a tab-separated
presence/absence table, collects for every strain the cluster IDs in which
that strain has an entry, prints the result, and writes it both to a single
combined text file and to one ``<strain>_subset.txt`` file per strain.
"""

import os

input_file = '/data/prostlocal2/projects/chlamydia_comparative_study/flamingo_ribap/ribap/ribap_gene_absence_presence.csv'
input_strain_ids = '/data/prostlocal2/projects/chlamydia_comparative_study/flamingo_ribap/ribap/strain_ids.txt'
output_file = '/data/prostlocal2/projects/chlamydia_comparative_study/flamingo_ribap/ribap/gene_subsets.txt'
# Directory that receives one '<strain>_subset.txt' file per strain.
subset_dir = '/data/prostlocal2/projects/chlamydia_comparative_study/flamingo_ribap/ribap/subsets'


def read_strain_ids(path):
    """Return a dict mapping the first comma-separated field to the second.

    Each non-blank line of *path* is expected to look like
    ``<prokka_id>,<strain_name>``. Blank lines are skipped so that a trailing
    empty line does not abort the run. A non-blank line without a comma still
    raises ``IndexError``.
    """
    strain_dict = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            strain_dict[fields[0]] = fields[1]
    return strain_dict


def collect_gene_subsets(table_path, strain_dict):
    """Return a dict mapping each strain name to its list of cluster IDs.

    *table_path* is a tab-separated table. The line starting with
    ``Cluster_ID`` is the header: every column from the fourth onwards names
    a strain, and ``<column>_RENAMED`` becomes a key of the result. For every
    other line, the first field is the cluster ID and each cell from the
    fourth column onwards is either ``NA`` / empty (skipped) or an identifier
    whose part before the first underscore is looked up in *strain_dict* to
    find the strain the cluster belongs to.

    Raises ``KeyError`` if an identifier prefix is missing from *strain_dict*
    or if the mapped strain name is not one of the header-derived keys.
    """
    my_dict = {}
    with open(table_path) as holytable:
        for line in holytable:
            line = line.strip()
            fields = line.split('\t')
            if line.startswith("Cluster_ID"):
                for column in fields[3:]:
                    my_dict[column + '_RENAMED'] = []
                continue
            cluster_id = fields[0]
            for cell in fields[3:]:
                # An empty cell carries no identifier; treat it like "NA"
                # instead of failing on the lookup of an empty prefix.
                if not cell or cell == "NA":
                    continue
                prokka_id = cell.split('_')[0]
                my_dict[strain_dict[prokka_id]].append(cluster_id)
    return my_dict


def write_gene_subsets(path, my_dict):
    """Write every strain name followed by its cluster IDs, one per line."""
    with open(path, 'w') as out:
        for key, value in my_dict.items():
            out.write(key + '\n')
            out.writelines(entry + '\n' for entry in value)


def write_subset_files(directory, my_dict):
    """Write one ``<strain>_subset.txt`` file per strain into *directory*.

    The directory is created if it does not exist yet, so a fresh run does
    not fail with ``FileNotFoundError``.
    """
    os.makedirs(directory, exist_ok=True)
    for key, value in my_dict.items():
        with open(os.path.join(directory, f'{key}_subset.txt'), 'w') as out:
            out.writelines(entry + '\n' for entry in value)


def main():
    """Run the conversion using the module-level input and output paths."""
    strain_dict = read_strain_ids(input_strain_ids)
    my_dict = collect_gene_subsets(input_file, strain_dict)

    for key, value in my_dict.items():
        print(key, value)

    write_gene_subsets(output_file, my_dict)
    # write to multiple output files
    write_subset_files(subset_dir, my_dict)


if __name__ == '__main__':
    main()