Open aggreen opened 5 years ago
Code fragment for implementing this (needs generalization to monomer structures, test cases)
def maximize_coverage(inter_structural_hits, n_to_select):
"""
Parameters
----------
inter_structural_hits: pd.DataFrame
n_to_select: int
number of hits to select
"""
# calculate an array that represents the full length of the concatenated sequence
L1 = int(inter_structural_hits.loc[0,"uniprot_end_1"] - inter_structural_hits.loc[0,"uniprot_start_1"])
L2 = int(inter_structural_hits.loc[0,"uniprot_end_2"] - inter_structural_hits.loc[0,"uniprot_start_2"])
inter_structural_hits["i_start_1"] = inter_structural_hits.loc[:,"alignment_start_1"] - inter_structural_hits.loc[0,"uniprot_start_1"]
inter_structural_hits["i_end_1"] = inter_structural_hits.loc[:,"alignment_end_1"] - inter_structural_hits.loc[0,"uniprot_start_1"]
inter_structural_hits["i_start_2"] = inter_structural_hits.loc[:,"alignment_start_2"] + L1
inter_structural_hits["i_end_2"] = inter_structural_hits.loc[:,"alignment_end_2"] + L1
inter_structural_hits["n_positions"] = (inter_structural_hits.alignment_end_1 - inter_structural_hits.alignment_start_1) *\
(inter_structural_hits.alignment_end_2 - inter_structural_hits.alignment_start_2)
# Calculate number of inter-protein residue pairs represented for each structure
coverage_matrix = np.zeros((len(inter_structural_hits), L1+L2))
for idx,row in inter_structural_hits.iterrows():
coverage_matrix[idx, row.i_start_1:row.i_end_1] = 1
coverage_matrix[idx, row.i_start_2:row.i_end_2] = 1
# Take the hit with the maximum number of residues
idx_of_max = inter_structural_hits.n_positions.idxmax()
current_cov = coverage_matrix[idx_of_max,:].reshape(1,-1)
print("starting_coverage", current_cov.sum())
selected_hits = [idx_of_max]
while len(selected_hits) < n_to_select:
if len(selected_hits) >= len(inter_structural_hits):
break
# find positions not covered by current selection
diffs = coverage_matrix - current_cov
# select the sequence that would add the most positions
highest = (diffs==1).sum(axis=1).argmax()
# don't let
if highest in selected_hits:
highest = inter_structural_hits.drop(selected_hits).n_positions.idxmax()
print("arg maximized by len", highest)
else:
print("arg maximized", highest, (diffs==1).sum(axis=1).max())
selected_hits.append(highest)
current_cov = current_cov + coverage_matrix[highest,:]
current_cov[current_cov>0]=1
print("updated coverage", current_cov.sum())
return inter_structural_hits.loc[selected_hits,:]
In the compare stage, the pipeline takes the top N structural hits based on their similarity to the target. This can be problematic because sometimes the user wants to maximize for the amount of input sequence covered by a structure. Create a flag called something like maximize_structural_coverage and write a greedy algorithm to take the longest hits.