Open Kronos573 opened 1 year ago
def _rank_protein_pairs(protein_list, dict_prot_et_leurs_part):
    """Return every distinct protein pair as ``(step, (protein_1, protein_2))``, best link first.

    A pair's dissimilarity is the size of the symmetric difference between
    the two proteins' partner sets, keeping only partners that are
    themselves in ``protein_list``. Its similarity is
    ``len(protein_list) - dissimilarity``. Pairs are ordered by similarity
    (descending), then dissimilarity (ascending); remaining ties keep the
    lexicographic order of the pairs because the sort is stable.
    """
    members = set(protein_list)

    # Partners of each protein, restricted to proteins present in the list.
    # Computed once per protein instead of once per compared pair.
    partner_sets = {}
    for protein in members:
        if protein in dict_prot_et_leurs_part:
            partner_sets[protein] = {
                partner[0]
                for partner in dict_prot_et_leurs_part[protein]
                if partner[0] in members
            }
        else:
            partner_sets[protein] = set()

    protein_count = len(protein_list)
    scored_pairs = []
    for protein_1, protein_2 in sorted(set(itertools.combinations(protein_list, 2))):
        # A protein is never linked to itself (this can only occur when the
        # input list contains duplicates).
        if protein_1 == protein_2:
            continue
        dissimilarity = len(partner_sets[protein_1] ^ partner_sets[protein_2])
        similarity = protein_count - dissimilarity
        scored_pairs.append((similarity, dissimilarity, (protein_1, protein_2)))

    # In order of priority: similarity (descending) > dissimilarity (ascending).
    # With the current scores the second criterion is implied by the first;
    # it is kept explicit so that further tie-breakers can be appended.
    scored_pairs.sort(key=lambda scored: (-scored[0], scored[1]))
    return [(step, pair) for step, (_, _, pair) in enumerate(scored_pairs)]


def _extend_chain(chain, position, newcomer):
    """Return a new chain with ``newcomer`` attached next to the end at ``position``.

    Returns ``None`` when ``position`` is neither the first nor the last
    index of ``chain``, i.e. the protein already there cannot take a link.
    """
    if position == 0:
        return [newcomer] + chain
    if position == len(chain) - 1:
        return chain + [newcomer]
    return None


def one_prot_inter_sort(protein_list, dict_prot_et_leurs_part):
    """Order proteins so that proteins sharing interaction partners end up adjacent.

    All pairs of proteins are ranked by how similar their partner sets are,
    then greedily linked, best pair first, into chains. A link is only made
    at the free end of a chain, and two chains are merged end to end.

    Args:
        protein_list: Protein identifiers to order. They must be hashable
            and mutually orderable (pairs are sorted).
        dict_prot_et_leurs_part: Mapping from a protein to an iterable of
            partner records; only the first element of each record
            (``partner[0]``, the partner identifier) is used. Proteins
            missing from the mapping are treated as having no partner.

    Returns:
        ``protein_list`` itself when it holds fewer than two proteins or no
        pair of distinct proteins; otherwise a new list holding the chain
        stored under key 0 once every candidate link has been processed.

    Note:
        Relies on ``isUsed(serie_dict, protein)``, defined elsewhere in this
        file. From the way its result is used here, it is expected to return
        ``False`` when the protein is in no chain, and a
        ``(chain_key, position_in_chain)`` pair otherwise.
    """
    # Nothing to order: also avoids scoring an empty pair list.
    if len(protein_list) < 2:
        return protein_list

    floor_list = _rank_protein_pairs(protein_list, dict_prot_et_leurs_part)
    if not floor_list:
        # Only duplicates of a single protein: no link can be made.
        return protein_list

    # Number of links already attached to each protein (0, 1 or 2).
    link_dict = dict.fromkeys(protein_list, 0)
    # Chains under construction, keyed by the step at which they were created.
    serie_dict = {}

    # For each possible link, from highest to lowest priority.
    for step, (protein_1, protein_2) in floor_list:
        out_1 = isUsed(serie_dict, protein_1)
        out_2 = isUsed(serie_dict, protein_2)
        # Equality (not identity) comparison is kept on purpose: it is the
        # contract the original code used with isUsed.
        used_1 = out_1 != False  # noqa: E712
        used_2 = out_2 != False  # noqa: E712
        linked = False

        if used_1 and used_2:
            # Both proteins are already in a chain: they can only be joined
            # if each has exactly one link (chain end) and the chains differ.
            if link_dict[protein_1] == 1 and link_dict[protein_2] == 1:
                floor_id_1, elem_nb_1 = out_1
                floor_id_2, elem_nb_2 = out_2
                if floor_id_1 != floor_id_2:
                    chain_1 = serie_dict[floor_id_1]
                    chain_2 = serie_dict[floor_id_2]
                    # Orient the chains so protein_1 and protein_2 touch.
                    if elem_nb_1 == 0 and elem_nb_2 == 0:
                        merged = chain_1[::-1] + chain_2
                    elif elem_nb_1 == 0:
                        merged = chain_2 + chain_1
                    elif elem_nb_2 == 0:
                        merged = chain_1 + chain_2
                    else:
                        merged = chain_1 + chain_2[::-1]
                    # The merged chain keeps the smaller key, so the final
                    # chain is always found under key 0.
                    serie_dict[min(floor_id_1, floor_id_2)] = merged
                    del serie_dict[max(floor_id_1, floor_id_2)]
                    linked = True
        elif used_1:
            # Only protein_1 is in a chain: attach protein_2 next to it.
            floor_id, elem_nb = out_1
            extended = _extend_chain(serie_dict[floor_id], elem_nb, protein_2)
            if extended is not None:
                serie_dict[floor_id] = extended
                linked = True
        elif used_2:
            # Only protein_2 is in a chain: attach protein_1 next to it.
            floor_id, elem_nb = out_2
            extended = _extend_chain(serie_dict[floor_id], elem_nb, protein_1)
            if extended is not None:
                serie_dict[floor_id] = extended
                linked = True
        else:
            # Neither protein is in a chain yet: start a new one.
            serie_dict[step] = [protein_1, protein_2]
            linked = True

        if linked:
            link_dict[protein_1] += 1
            link_dict[protein_2] += 1

    return serie_dict[0]
def two_prot_inter_sort(df,column_protein_1,column_protein_2,column_val, seuil_min, seuil_max=1) :