Open 1412140736 opened 2 months ago
Hi @1412140736 can you share a snippet and the pdb id to reproduce this?
Yes, this is the snippet. (Here are the IDs of some problematic PDB files that I downloaded from RCSB: P55211 4RHW P29597 3NZ0 Q6V1X1 6EOO)
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from functools import partial
from graphein.protein.edges.distance import add_distance_threshold,add_peptide_bonds
import esm
import networkx as nx
import os
import torch
import pandas
import warnings
import pickle
from torch_geometric.data import Data
from tqdm import tqdm
# Reproduction script: build a graphein protein graph for one PDB file using
# only the distance-threshold edge function.

# Turn off pandas' chained-assignment warning for the rest of the script.
pandas.set_option('mode.chained_assignment', None)

# Load the pretrained ESM-2 model and its alphabet, then switch the model to
# evaluation mode. (Not used by the graph construction below.)
protein_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
protein_model.eval()

# Single edge-construction function: add_distance_threshold with
# long_interaction_threshold=0 and threshold=8 pre-bound via functools.partial.
new_edge_funcs = {
    "edge_construction_functions": [
        partial(
            add_distance_threshold,
            long_interaction_threshold=0,
            threshold=8,
        )
    ]
}
config = ProteinGraphConfig(**new_edge_funcs)

# The PDB file is expected at <current working directory>/tmp/<pdb_ID>.pdb.
# These were three statements fused onto one line (a SyntaxError); they are
# restored to one statement per line.
pdb_ID = "P55211"
protein_path = os.getcwd() + "/tmp/"
g = construct_graph(
    config=config, path=str(protein_path) + str(pdb_ID) + ".pdb"
)
Thanks! I could reproduce it.
It looks like removing altlocs throws off the indexing order in the dataframe.
Quick fix first: replace `.loc` with `.iloc` in `add_distance_threshold`:
def add_distance_threshold(
    G: nx.Graph, long_interaction_threshold: int, threshold: float = 5.0
):
    """
    Adds edges to any nodes within a given distance of each other.

    Long interaction threshold is used to specify minimum separation in
    sequence to add an edge between networkx nodes within the distance
    threshold: a candidate pair is skipped only when both nodes are on the
    same chain AND their residue numbers differ by less than
    ``long_interaction_threshold``.

    :param G: Protein Structure graph to add distance edges to. Must carry
        the structure dataframe in ``G.graph["pdb_df"]`` with ``node_id``,
        ``chain_id`` and ``residue_number`` columns.
    :type G: nx.Graph
    :param long_interaction_threshold: minimum distance in sequence for two
        nodes to be connected
    :type long_interaction_threshold: int
    :param threshold: Distance in angstroms, below which two nodes are
        connected
    :type threshold: float
    :return: Nothing is returned explicitly; edges are added to ``G`` through
        ``add_edge``.
    """
    # Restrict the structure dataframe to rows whose node_id is a node of G.
    pdb_df = filter_dataframe(
        G.graph["pdb_df"], "node_id", list(G.nodes()), True
    )
    dist_mat = compute_distmat(pdb_df)
    interacting_nodes = get_interacting_atoms(threshold, distmat=dist_mat)
    interacting_nodes = list(zip(interacting_nodes[0], interacting_nodes[1]))

    log.info(f"Found: {len(interacting_nodes)} distance edges")

    # The pair indices are derived from ``dist_mat``, which was computed from
    # the *filtered* ``pdb_df``; they are therefore resolved against that same
    # frame, by position, rather than against the unfiltered
    # ``G.graph["pdb_df"]`` (whose rows / index labels may not line up, e.g.
    # after altloc removal).
    # NOTE(review): assumes get_interacting_atoms yields positional (0-based)
    # row indices into dist_mat -- confirm against its implementation.
    # Columns are extracted once instead of materialising a row per lookup.
    node_ids = pdb_df["node_id"].tolist()
    chain_ids = pdb_df["chain_id"].tolist()
    positions = pdb_df["residue_number"].tolist()

    count = 0
    for a1, a2 in interacting_nodes:
        same_chain = chain_ids[a1] == chain_ids[a2]
        close_in_sequence = (
            abs(positions[a1] - positions[a2]) < long_interaction_threshold
        )
        # Pairs on the same chain that are too close in sequence are dropped.
        if same_chain and close_in_sequence:
            continue
        count += 1
        add_edge(G, node_ids[a1], node_ids[a2], "distance_threshold")

    log.info(
        f"Added {count} distance edges. "
        f"({len(interacting_nodes) - count} removed by LIN)"
    )
Longer-term fix: reset the dataframe index after removing altlocs.
Thank you for your prompt response. I have followed your advice to change .loc to .iloc in add_distance_threshold. However, a new error has occurred, and the error message is as follows (thanks again for your response): File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/pandas/core/indexing.py", line 873, in _validate_tuple_indexer self._validate_key(k, i) File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/pandas/core/indexing.py", line 1483, in _validate_key raise ValueError(f"Can only index by location with a [{self._valid_types}]") ValueError: Can only index by location with a [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array]
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/aita130/lm/zerobind/test.py", line 98, in
Apologies, syntax error on my part. I've updated the codeblock above.
The issue has been resolved. Thank you!
The issue has been resolved. Thank you!
Hello, I'm facing the same problem you encountered. Could you tell me how you overcame it?
The error message is as follows: Constructing edges... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% -:--:-- Traceback (most recent call last): File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3803, in get_loc return self._engine.get_loc(casted_key) File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/hashtable_class_helper.pxi", line 2263, in pandas._libs.hashtable.Int64HashTable.get_item File "pandas/_libs/hashtable_class_helper.pxi", line 2273, in pandas._libs.hashtable.Int64HashTable.get_item KeyError: 230
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "/home/aita130/lm/zerobind/test.py", line 27, in
g = construct_graph(config=config, path=str(protein_path)+str(pdb_ID)+".pdb")
File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/graphein/protein/graphs.py", line 855, in construct_graph
g = compute_edges(
File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/graphein/protein/graphs.py", line 682, in compute_edges
func(G)
File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/graphein/protein/edges/distance.py", line 968, in add_distance_threshold
n2 = G.graph["pdb_df"].loc[a2, "node_id"]
File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/pandas/core/indexing.py", line 1066, in getitem
return self.obj._get_value(*key, takeable=self._takeable)
File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/pandas/core/frame.py", line 3921, in _get_value
row = self.index.get_loc(index)
File "/media/aita130/anaconda_space/envs/ZeroBind/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
raise KeyError(key) from err
KeyError: 230