Kronos573 / Image_stagee

0 stars 0 forks source link

algo de tris #2

Open Kronos573 opened 1 year ago

Kronos573 commented 1 year ago

def two_prot_inter_sort(df, column_protein_1, column_protein_2, column_val, seuil_min, seuil_max=1):
    """Order the proteins of ``column_protein_1`` so that similar profiles are adjacent.

    Each protein of ``column_protein_1`` gets an interaction profile: one value
    per distinct protein of ``column_protein_2`` (in sorted order), holding the
    transformed interaction value, or ``-1`` when the pair has no row in ``df``.
    Every pair of profiles is scored, the pairs are ranked, and the proteins
    are greedily chained into a single list following that ranking.

    Args:
        df: Table indexable by column name (``df[column]`` must be iterable
            row by row), e.g. a pandas DataFrame.
        column_protein_1: Name of the column holding the proteins to order.
        column_protein_2: Name of the column holding their partners.
        column_val: Name of the column holding the interaction values.
        seuil_min: Forwarded unchanged to ``new_val_inter``.
        seuil_max: Forwarded unchanged to ``new_val_inter``.

    Returns:
        The list of distinct proteins of ``column_protein_1``, reordered.
        When fewer than two distinct proteins exist, the sorted list is
        returned as is.

    Notes:
        ``new_val_inter`` and ``isUsed`` are helpers defined outside this
        block. ``new_val_inter`` is assumed to return one value per input
        value, in the same order -- TODO confirm. As used here, ``isUsed``
        returns ``False`` for a protein that is in no chain, and otherwise a
        ``(chain key, position inside the chain)`` pair.
    """
    # Positions are compared with an absolute tolerance when both proteins
    # interact with the same partner.
    similarity_tolerance = 0.3
    # Marker stored in a profile when the pair has no row in ``df``.
    # NOTE(review): a transformed value that is itself -1 is indistinguishable
    # from "no interaction"; verify against new_val_inter.
    no_interaction = -1

    protein_1_list = sorted(set(df[column_protein_1]))
    protein_2_list = sorted(set(df[column_protein_2]))

    # Nothing to order: avoids ranking an empty set of pairs.
    if len(protein_1_list) < 2:
        return protein_1_list

    new_val_inter_list = new_val_inter(list(df[column_val]), seuil_min, seuil_max)

    # protein_1 -> {protein_2: transformed value}. Keyed by partner so that a
    # duplicated (protein_1, protein_2) row cannot lengthen a profile and shift
    # the positional comparison below; the last row wins.
    interactions = {protein: {} for protein in protein_1_list}
    row_pairs = zip(df[column_protein_1], df[column_protein_2])
    for row_number, (protein, partner) in enumerate(row_pairs):
        interactions[protein][partner] = new_val_inter_list[row_number]

    # One aligned vector per protein, indexed like protein_2_list.
    profiles = {
        protein: [interactions[protein].get(partner, no_interaction) for partner in protein_2_list]
        for protein in protein_1_list
    }

    # The input is sorted and unique, so the combinations are too.
    protein_1_comb = list(itertools.combinations(protein_1_list, 2))

    # pair -> (similarity, dissimilarity, one-sided interaction count)
    scores = {}
    for protein, vs_protein in protein_1_comb:
        similar = dissimilar = one_sided = 0
        for value, vs_value in zip(profiles[protein], profiles[vs_protein]):
            missing = value == no_interaction
            vs_missing = vs_value == no_interaction
            if missing and vs_missing:
                # NOTE(review): a partner shared by neither protein counts as a
                # dissimilarity, as in the original implementation -- confirm
                # this is intended.
                dissimilar += 1
            elif missing or vs_missing:
                one_sided += 1
            elif abs(value - vs_value) < similarity_tolerance:
                similar += 1
            else:
                dissimilar += 1
        scores[(protein, vs_protein)] = (similar, dissimilar, one_sided)

    # Priority: similarity (descending) > dissimilarity (ascending) >
    # one-sided count (ascending). sorted() is stable, so remaining ties keep
    # the order of protein_1_comb.
    ranked_pairs = sorted(
        protein_1_comb,
        key=lambda pair: (-scores[pair][0], scores[pair][1], scores[pair][2]),
    )
    floor_list = list(enumerate(ranked_pairs))

    # Number of neighbours each protein already has in a chain (0, 1 or 2).
    link_dict = {protein: 0 for protein in protein_1_list}

    ## Linking algorithm
    # Chains are stored under the step number of the link that created them.
    serie_dict = {}
    for step, (first, second) in floor_list:
        out_1 = isUsed(serie_dict, first)
        out_2 = isUsed(serie_dict, second)
        # Compared with False rather than by truthiness, exactly as the
        # original code did, because isUsed is defined elsewhere.
        first_chained = out_1 != False  # noqa: E712
        second_chained = out_2 != False  # noqa: E712

        if first_chained and second_chained:
            # Two chains can only be joined through their free ends.
            if link_dict[first] != 1 or link_dict[second] != 1:
                continue
            floor_id_1, elem_nb_1 = out_1
            floor_id_2, elem_nb_2 = out_2
            # Same chain: linking its two ends would close a cycle.
            if floor_id_1 == floor_id_2:
                continue
            chain_1 = serie_dict[floor_id_1]
            chain_2 = serie_dict[floor_id_2]
            # Orient both chains so that `first` and `second` end up adjacent.
            if elem_nb_1 == 0 and elem_nb_2 == 0:
                merged = chain_1[::-1] + chain_2
            elif elem_nb_1 == 0:
                merged = chain_2 + chain_1
            elif elem_nb_2 == 0:
                merged = chain_1 + chain_2
            else:
                merged = chain_1 + chain_2[::-1]
            # The merged chain keeps the smaller key, so key 0 always survives.
            serie_dict[min(floor_id_1, floor_id_2)] = merged
            del serie_dict[max(floor_id_1, floor_id_2)]
        elif first_chained or second_chained:
            # Exactly one protein is chained: attach the free one next to it,
            # which is only possible when the chained one sits at an end.
            if first_chained:
                (floor_id, elem_nb), newcomer = out_1, second
            else:
                (floor_id, elem_nb), newcomer = out_2, first
            chain = serie_dict[floor_id]
            if elem_nb == 0:
                serie_dict[floor_id] = [newcomer] + chain
            elif elem_nb == len(chain) - 1:
                serie_dict[floor_id] = chain + [newcomer]
            else:
                continue
        else:
            # Neither protein is chained yet: start a new chain.
            serie_dict[step] = [first, second]

        link_dict[first] += 1
        link_dict[second] += 1

    # The first ranked pair creates the chain stored under key 0 and every
    # merge keeps the smaller key, so the final ordering is found there.
    return serie_dict[0]
Kronos573 commented 1 year ago

def one_prot_inter_sort(protein_list, dict_prot_et_leurs_part):
    """Order ``protein_list`` so that proteins sharing partners are adjacent.

    For every pair of proteins, the dissimilarity is the number of partners
    (restricted to members of ``protein_list``) that only one of the two has,
    and the similarity is ``len(protein_list)`` minus that number. Pairs are
    ranked from most to least similar and the proteins are greedily chained
    into a single list following that ranking.

    Args:
        protein_list: Proteins to order (hashable identifiers).
        dict_prot_et_leurs_part: Mapping from a protein to an iterable of
            partner records; only element ``[0]`` of each record, the partner
            identifier, is read. Proteins missing from the mapping are treated
            as having no partner.

    Returns:
        The reordered list of proteins. ``protein_list`` itself is returned
        unchanged when it does not contain two distinct proteins.

    Notes:
        ``isUsed`` is a helper defined outside this block. As used here, it
        returns ``False`` for a protein that is in no chain, and otherwise a
        ``(chain key, position inside the chain)`` pair.
    """
    protein_comb = sorted(set(itertools.combinations(protein_list, 2)))
    # A repeated identifier yields a (p, p) pair, which can never be a link.
    distinct_pairs = [pair for pair in protein_comb if pair[0] != pair[1]]

    # Nothing to order (empty list, single protein, or only duplicates).
    if not distinct_pairs:
        return protein_list

    # Partner set of every protein, restricted to the proteins being ordered.
    # Built once per protein instead of once per compared pair.
    known_proteins = set(protein_list)
    partners_in_list = {
        protein: {
            partner[0]
            for partner in dict_prot_et_leurs_part.get(protein, ())
            if partner[0] in known_proteins
        }
        for protein in known_proteins
    }

    # pair -> number of partners held by exactly one of the two proteins
    dissimilarity = {
        (protein, vs_protein): len(partners_in_list[protein] ^ partners_in_list[vs_protein])
        for protein, vs_protein in distinct_pairs
    }

    # Similarity is len(protein_list) - dissimilarity, so ranking by descending
    # similarity is ranking by ascending dissimilarity. sorted() is stable, so
    # ties keep the order of protein_comb.
    ranked_pairs = sorted(distinct_pairs, key=dissimilarity.__getitem__)
    floor_list = list(enumerate(ranked_pairs))

    # Number of neighbours each protein already has in a chain (0, 1 or 2).
    link_dict = {protein: 0 for protein in protein_list}

    ## Linking algorithm
    # Chains are stored under the step number of the link that created them.
    serie_dict = {}
    for step, (first, second) in floor_list:
        out_1 = isUsed(serie_dict, first)
        out_2 = isUsed(serie_dict, second)
        # Compared with False rather than by truthiness, exactly as the
        # original code did, because isUsed is defined elsewhere.
        first_chained = out_1 != False  # noqa: E712
        second_chained = out_2 != False  # noqa: E712

        if first_chained and second_chained:
            # Two chains can only be joined through their free ends.
            if link_dict[first] != 1 or link_dict[second] != 1:
                continue
            floor_id_1, elem_nb_1 = out_1
            floor_id_2, elem_nb_2 = out_2
            # Same chain: linking its two ends would close a cycle.
            if floor_id_1 == floor_id_2:
                continue
            chain_1 = serie_dict[floor_id_1]
            chain_2 = serie_dict[floor_id_2]
            # Orient both chains so that `first` and `second` end up adjacent.
            if elem_nb_1 == 0 and elem_nb_2 == 0:
                merged = chain_1[::-1] + chain_2
            elif elem_nb_1 == 0:
                merged = chain_2 + chain_1
            elif elem_nb_2 == 0:
                merged = chain_1 + chain_2
            else:
                merged = chain_1 + chain_2[::-1]
            # The merged chain keeps the smaller key, so key 0 always survives.
            serie_dict[min(floor_id_1, floor_id_2)] = merged
            del serie_dict[max(floor_id_1, floor_id_2)]
        elif first_chained or second_chained:
            # Exactly one protein is chained: attach the free one next to it,
            # which is only possible when the chained one sits at an end.
            if first_chained:
                (floor_id, elem_nb), newcomer = out_1, second
            else:
                (floor_id, elem_nb), newcomer = out_2, first
            chain = serie_dict[floor_id]
            if elem_nb == 0:
                serie_dict[floor_id] = [newcomer] + chain
            elif elem_nb == len(chain) - 1:
                serie_dict[floor_id] = chain + [newcomer]
            else:
                continue
        else:
            # Neither protein is chained yet: start a new chain.
            serie_dict[step] = [first, second]

        link_dict[first] += 1
        link_dict[second] += 1

    # The first ranked pair creates the chain stored under key 0 and every
    # merge keeps the smaller key, so the final ordering is found there.
    return serie_dict[0]